In [1]:
import numpy as np
import pandas as pd
import scipy
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load both competition splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# The row identifier is not a feature: remove it from both splits in place.
for frame in (train, test):
    frame.drop('id', axis=1, inplace=True)

In [4]:
# Keep the label aside, then strip it from the feature frame so that
# train and test end up with the same columns.
y = train['target']
train.drop('target', axis=1, inplace=True)

In [5]:
# Explicit rank tables for the ordinal features: each label is mapped to its
# 1-based position in the intended ordering.
mapper_ord_1 = {'Novice': 1,
                'Contributor': 2,
                'Expert': 3,
                'Master': 4,
                'Grandmaster': 5}

mapper_ord_2 = {'Freezing': 1,
                'Cold': 2,
                'Warm': 3,
                'Hot': 4,
                'Boiling Hot': 5,
                'Lava Hot': 6}

mapper_ord_3 = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8,
                'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15}

mapper_ord_4 = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8,
                'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15,
                'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22,
                'W': 23, 'X': 24, 'Y': 25, 'Z': 26}

# Replace every raw ordinal column with an integer-coded `<col>_oe` column,
# applying the same rank table to both splits.
for col, mapper in zip(['ord_1', 'ord_2', 'ord_3', 'ord_4'], [mapper_ord_1, mapper_ord_2, mapper_ord_3, mapper_ord_4]):
    for frame in (train, test):
        # Series.map does a plain dict lookup and yields a numeric column,
        # instead of relying on replace() to downcast an object column.
        encoded = frame[col].map(mapper)

        # map() turns labels missing from the table into NaN. Fail loudly here
        # rather than let an unranked category travel further down the pipeline
        # (replace() would have silently kept the raw string).
        unmapped = frame.loc[encoded.isna() & frame[col].notna(), col].unique()
        if len(unmapped):
            raise ValueError(f'{col}: no rank defined for labels {list(unmapped)}')

        frame[col+'_oe'] = encoded
        frame.drop(col, axis=1, inplace=True)

In [6]:
# Turn the two letter-coded binary flags into 0/1 integers:
# 'T' is the positive value for bin_3, 'Y' for bin_4; anything else becomes 0.
for frame in (train, test):
    for flag_col, positive in (('bin_3', 'T'), ('bin_4', 'Y')):
        frame[flag_col] = frame[flag_col].eq(positive).astype(int)

In [7]:
# ord_5 has no hand-written rank table: learn its category codes from the
# training split only, then apply that same encoding to both splits.
oe = OrdinalEncoder(categories='auto').fit(train[['ord_5']])
for frame in (train, test):
    frame['ord_5'] = oe.transform(frame[['ord_5']])

In [8]:
# Stack train on top of test (row-wise) so the remaining preprocessing is
# applied to both at once; the original row labels of each split are kept.
df = pd.concat([train, test], axis=0)

In [9]:
# Ordinal feature columns: ord_0 and ord_5 keep their names, ord_1..ord_4
# carry the `_oe` suffix.
ordvar = ['ord_0'] + [f'ord_{level}_oe' for level in range(1, 5)] + ['ord_5']

In [10]:
# Standardise the ordinal columns. The scaler is fitted on the stacked
# train+test frame, so its statistics come from both splits.
ss = StandardScaler().fit(df[ordvar])
df[ordvar] = ss.transform(df[ordvar])

In [11]:
# Nominal feature columns nom_0 .. nom_9.
nomvar = [f'nom_{idx}' for idx in range(10)]

In [12]:
# Calendar columns; these get their own one-hot encoding.
dmvar = [
    'day',
    'month',
]

In [13]:
# One-hot encode day and month, dropping the first level of each feature.
enc = OneHotEncoder(drop='first', dtype='float64', categories='auto')
dm_matrix = enc.fit_transform(df[dmvar])

# The encoded matrix is kept separately, so the raw columns leave the frame.
df.drop(columns=dmvar, inplace=True)

In [14]:
# One-hot encode the nominal columns, dropping the first level of each feature.
ohe = OneHotEncoder(categories = 'auto', dtype = 'float64', drop = 'first')
nom_matrix = ohe.fit_transform(df[nomvar])
df.drop(columns=nomvar, inplace=True)

# Assemble the full design matrix, column blocks in this order:
#   [nominal one-hot | remaining columns of df | day/month one-hot]
# `sparse` is imported explicitly (`from scipy import sparse`): a bare
# `import scipy` does not guarantee that the `scipy.sparse` attribute exists.
# The result is converted to CSR so that it can be sliced by rows afterwards.
df_sprs = sparse.hstack([nom_matrix,
                         sparse.coo_matrix(df).astype('float64'),
                         dm_matrix]).tocsr()

In [15]:
# Undo the train/test stacking: the first len(y) rows are the training rows,
# everything after them is the test set.
n_train = y.shape[0]
X_train, X_test = df_sprs[:n_train], df_sprs[n_train:]

In [16]:
# Sweep the inverse regularisation strength C around 0.1 and report, for each
# candidate, the per-fold and mean ROC AUC of a 3-fold cross-validation.
for C in [0.09, 0.095, 0.1, 0.105, 0.11, 0.115, 0.12]:

    # No n_jobs here: on LogisticRegression it only parallelises across
    # one-vs-rest classes, which does nothing for a single binary fit.
    model = LogisticRegression(C=C, solver='lbfgs', verbose=0)

    # Parallelise across the CV folds instead; fold order and scores are
    # unaffected by the number of workers.
    score = cross_validate(model, X_train, y, cv=3, scoring="roc_auc", n_jobs=-1)
    mean = score['test_score'].mean()
    print(score['test_score'])
    print('C =', C, f'{mean:.8f}')

[0.80020347 0.80464968 0.80218344]
C = 0.09 0.80234553
[0.80026085 0.80468823 0.80213557]
C = 0.095 0.80236155
[0.80023827 0.80474048 0.80218842]
C = 0.1 0.80238906
[0.80032428 0.80468734 0.80225443]
C = 0.105 0.80242202
[0.80033213 0.80467301 0.80221651]
C = 0.11 0.80240722
[0.80029788 0.8047237  0.80221982]
C = 0.115 0.80241380
[0.80028575 0.80470274 0.80216142]
C = 0.12 0.80238330


In [17]:
# Final model: refit on the complete training matrix with C=0.105, the
# candidate with the highest mean ROC AUC in the cross-validation sweep.
model = LogisticRegression(solver='lbfgs', C=0.105, verbose=0)
model.fit(X_train, y)



LogisticRegression(C=0.105, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
# predict_proba returns one column per class; keep the second column
# (index 1) as the score for each test row.
class_proba = model.predict_proba(X_test)
preds = class_proba[:, 1]

In [19]:
# Read the submission template from the working directory, the same place
# train.csv and test.csv are loaded from, instead of a machine-specific
# absolute path, so the notebook runs on any checkout of the data.
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

# Fill in the predicted scores and write the submission file; the template's
# first column is kept as the index.
sample_submission['target'] = preds
sample_submission.to_csv('best_score.csv')