In [None]:
pip install fastparquet

In [None]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/Tabular-Playground-Nov-2022/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

## Reading data-files
submission = pd.read_parquet(file_content_stream_1)
df = pd.read_parquet('s3://analytics-data-science-competitions/Tabular-Playground-Series/Tabular-Playground-Nov-2022/preds_concat_gzip.parquet', engine = 'fastparquet')

In [None]:
preds_df = df.clip(0, 1) ## Notice that some of the model prediction files have negative likehookds and greater than 1 likelihoods
train = preds_df[preds_df['target'].notnull()]
test = preds_df[preds_df['target'].isnull()] 

In [None]:
from sklearn.metrics import log_loss

train_new = train.copy()
logloss = list()

for i in range(0, 5000):
    
    logloss.append(log_loss(train_new['target'], train_new.iloc[:, i]))
    
#  Log-Loss dataframe
logloss_data = pd.DataFrame({'File': train_new.columns[:-1], 'LogLoss': logloss})
logloss_data = logloss_data.sort_values(by = 'LogLoss').reset_index(drop = True)
logloss_data.head(20)

In [None]:
logloss_data.to_csv('logloss_data.csv', index = False)

In [None]:
X = train[logloss_data['File'][0:100].values]
Y = train['target']

test_new = test[logloss_data['File'][0:100].values]

## Defining list to store results
logit_results, test_preds_logit = list(), list()

fold = 1
kfold = StratifiedKFold(n_splits = 5, shuffle = True)
        
for train_ix, test_ix in kfold.split(X, Y):
    
    ## Splitting the data 
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]

    ## Building model
    logit_md = LogisticRegression(solver = 'liblinear', penalty = 'l1').fit(X_train, Y_train)
        
    ## Predicting on test
    logit_pred = logit_md.predict_proba(X_test)[:, 1]
    score = log_loss(Y_test, logit_pred)
    logit_results.append(score)
        
    print('Fold ', str(fold), ' result is:', score, '\n')

    test_preds_logit.append(logit_md.predict_proba(test_new)[:, 1])
    fold +=1

print('The average log-loss over 5-fold CV is', np.mean(logit_results))

In [None]:
test_preds_logit = pd.DataFrame(test_preds_logit)
print(test_preds_logit.shape)

test_preds_logit = test_preds_logit.mean(axis = 0)
print(test_preds_logit.head(5))

In [None]:
submission['pred'] = test_preds_logit
submission.head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

logistic_performance = pd.DataFrame({'Number_of_Features': [100, 200, 300, 400, 500], 'CV_score': [0.5314294293449413, 0.5303333592114292, 0.5306822864009136, 0.5301723355083536, 0.5306497920414268], 'LB_score': [0.52438, 0.5239, 0.52357, 0.52319, 0.52313]})
logistic_performance['CV_score'] = round(logistic_performance['CV_score'], 5)
logistic_performance

In [None]:
plt.figure(figsize = (10, 8))

sns.lineplot(data = logistic_performance, x = 'Number_of_Features', y = 'CV_score', label = 'CV-score').set(xlabel = 'Number of Features', ylabel = 'Log-Loss Score', title = 'Logistic Regression Performance')
sns.lineplot(data = logistic_performance, x = 'Number_of_Features', y = 'LB_score', label = 'LB-score')

plt.savefig('logistic_performace.png')
plt.show();

In [None]:
sns.lineplot(data = logistic_performance.drop(columns = 'Number_of_Features', axis = 1))

In [11]:
import pandas as pd
import numpy as np

logloss_data = pd.read_csv('logloss_data.csv')

data_temp = logloss_data.iloc[0:10, ]
data_temp

Unnamed: 0,File,LogLoss
0,0.6222863195.csv,0.622286
1,0.6223807245.csv,0.622381
2,0.6225426578.csv,0.622543
3,0.6247722291.csv,0.624772
4,0.6253455681.csv,0.625346
5,0.6254850917.csv,0.625485
6,0.6255093621.csv,0.625509
7,0.6260141578.csv,0.626014
8,0.6263493693.csv,0.626349
9,0.6272779211.csv,0.627278


In [13]:
data_temp['w'] = 1 / data_temp['LogLoss']
data_temp['W'] = data_temp['w'] / np.sum(data_temp['w'])
data_temp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_temp['w'] = 1 / data_temp['LogLoss']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_temp['W'] = data_temp['w'] / np.sum(data_temp['w'])


Unnamed: 0,File,LogLoss,w,W
0,0.6222863195.csv,0.622286,1.606977,0.100403
1,0.6223807245.csv,0.622381,1.606734,0.100387
2,0.6225426578.csv,0.622543,1.606316,0.100361
3,0.6247722291.csv,0.624772,1.600583,0.100003
4,0.6253455681.csv,0.625346,1.599116,0.099911
5,0.6254850917.csv,0.625485,1.598759,0.099889
6,0.6255093621.csv,0.625509,1.598697,0.099885
7,0.6260141578.csv,0.626014,1.597408,0.099805
8,0.6263493693.csv,0.626349,1.596553,0.099751
9,0.6272779211.csv,0.627278,1.59419,0.099604
