In [20]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype 

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import scipy
import optuna

In [21]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Subset
test_id = test['id']
target = train['target']
train.drop(['target', 'id', 'bin_0'], axis=1, inplace=True)
test.drop(['id','bin0'], axis=1, inplace=True)

KeyError: "['bin0'] not found in axis"

In [None]:
# dictionary to map the feature
bin_dict = {'T':1, 'F':0, 'Y':1, 'N':0}

# Maping the category values in our dict
train['bin_3'] = train['bin_3'].map(bin_dict)
train['bin_4'] = train['bin_4'].map(bin_dict)
test['bin_3'] = test['bin_3'].map(bin_dict)
test['bin_4'] = test['bin_4'].map(bin_dict)

In [22]:
# seting the orders of our ordinal features
ord_1 = CategoricalDtype(categories=['Novice', 'Contributor','Expert', 
                                     'Master', 'Grandmaster'], ordered=True)
ord_2 = CategoricalDtype(categories=['Freezing', 'Cold', 'Warm', 'Hot',
                                     'Boiling Hot', 'Lava Hot'], ordered=True)
ord_3 = CategoricalDtype(categories=['a', 'b', 'c', 'd', 'e', 'f', 'g',
                                     'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o'], ordered=True)
ord_4 = CategoricalDtype(categories=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
                                     'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
                                     'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'], ordered=True)

In [23]:
# Transforming ordinal Features
train.ord_1 = train.ord_1.astype(ord_1)
train.ord_2 = train.ord_2.astype(ord_2)
train.ord_3 = train.ord_3.astype(ord_3)
train.ord_4 = train.ord_4.astype(ord_4)

# test dataset
test.ord_1 = test.ord_1.astype(ord_1)
test.ord_2 = test.ord_2.astype(ord_2)
test.ord_3 = test.ord_3.astype(ord_3)
test.ord_4 = test.ord_4.astype(ord_4)

In [24]:
# Geting the codes of ordinal categoy's - train
train.ord_1 = train.ord_1.cat.codes
train.ord_2 = train.ord_2.cat.codes
train.ord_3 = train.ord_3.cat.codes
train.ord_4 = train.ord_4.cat.codes

# Geting the codes of ordinal categoy's - test
test.ord_1 = test.ord_1.cat.codes
test.ord_2 = test.ord_2.cat.codes
test.ord_3 = test.ord_3.cat.codes
test.ord_4 = test.ord_4.cat.codes

In [25]:
day_month=['day','month']
for col in day_month:
    train[col+'_sin']=np.sin((2*np.pi*train[col])/max(train[col]))
    train[col+'_cos']=np.cos((2*np.pi*train[col])/max(train[col]))
    test[col+'_sin']=np.sin((2*np.pi*test[col])/max(test[col]))
    test[col+'_cos']=np.cos((2*np.pi*test[col])/max(test[col]))
train=train.drop(day_month,axis=1)
test=test.drop(day_month,axis=1)

In [26]:
all_data = pd.concat((train,test))

In [27]:
encoded=pd.get_dummies(all_data, columns=all_data.columns, sparse=True)
encoded=encoded.sparse.to_coo().tocsr()

In [28]:
train_ohe = encoded[:len(train)]
test_ohe = encoded[len(train):]

In [29]:
model=LogisticRegression(C=0.12299878403389289, solver="liblinear", max_iter=10000)
model.fit(train_ohe, target)
pred=model.predict_proba(test_ohe)[:,1]

In [30]:
from sklearn.model_selection import cross_validate
score=cross_val_score(model, train_ohe, target,scoring="roc_auc", n_jobs=-1).mean()
print(score)

0.8035991313668722


In [14]:
def objective(trial):
    C=trial.suggest_loguniform('C', 10e-10, 10)
    model=LogisticRegression(C=C,max_iter=10000, solver='liblinear')
    score=-cross_val_score(model, train_ohe, target, scoring='roc_auc', n_jobs=-1).mean()
    return score

study=optuna.create_study()
study.optimize(objective, n_trials=100)
tuned_C=study.best_params

[I 2020-04-28 13:30:06,217] Finished trial#0 with value: -0.6129950850980826 with parameters: {'C': 7.946164580413913e-09}. Best is trial#0 with value: -0.6129950850980826.
[I 2020-04-28 13:30:07,828] Finished trial#1 with value: -0.740319421414663 with parameters: {'C': 0.0002167968974349058}. Best is trial#1 with value: -0.740319421414663.
[I 2020-04-28 13:30:08,663] Finished trial#2 with value: -0.6854744020219706 with parameters: {'C': 6.524880868257699e-06}. Best is trial#1 with value: -0.740319421414663.
[I 2020-04-28 13:30:10,838] Finished trial#3 with value: -0.7668681238551768 with parameters: {'C': 0.0014406295637030904}. Best is trial#3 with value: -0.7668681238551768.
[I 2020-04-28 13:30:11,456] Finished trial#4 with value: -0.6140557909199682 with parameters: {'C': 5.410234676367516e-08}. Best is trial#3 with value: -0.7668681238551768.
[I 2020-04-28 13:30:12,078] Finished trial#5 with value: -0.6128428005386504 with parameters: {'C': 1.3557809442551843e-09}. Best is trial

[I 2020-04-28 13:38:40,471] Finished trial#47 with value: -0.8033881744846454 with parameters: {'C': 0.08586733631640169}. Best is trial#29 with value: -0.8035853544893692.
[I 2020-04-28 13:39:17,070] Finished trial#48 with value: -0.7903907441745828 with parameters: {'C': 4.157821236681964}. Best is trial#29 with value: -0.8035853544893692.
[I 2020-04-28 13:39:19,914] Finished trial#49 with value: -0.7751604636962192 with parameters: {'C': 0.0025447768739887103}. Best is trial#29 with value: -0.8035853544893692.
[I 2020-04-28 13:39:24,780] Finished trial#50 with value: -0.7923747322445505 with parameters: {'C': 0.010626544944678266}. Best is trial#29 with value: -0.8035853544893692.
[I 2020-04-28 13:39:35,512] Finished trial#51 with value: -0.8035923802452313 with parameters: {'C': 0.12751807867901743}. Best is trial#51 with value: -0.8035923802452313.
[I 2020-04-28 13:39:46,563] Finished trial#52 with value: -0.80359364295127 with parameters: {'C': 0.11420316475909739}. Best is trial

[I 2020-04-28 13:49:08,606] Finished trial#94 with value: -0.8035577840623638 with parameters: {'C': 0.13934225806899986}. Best is trial#74 with value: -0.8035978228038969.
[I 2020-04-28 13:49:14,952] Finished trial#95 with value: -0.7996785976698713 with parameters: {'C': 0.028401127859094166}. Best is trial#74 with value: -0.8035978228038969.
[I 2020-04-28 13:49:23,182] Finished trial#96 with value: -0.8030749465996264 with parameters: {'C': 0.07074229406086759}. Best is trial#74 with value: -0.8035978228038969.
[I 2020-04-28 13:49:43,548] Finished trial#97 with value: -0.7986571296847575 with parameters: {'C': 0.6877750600740542}. Best is trial#74 with value: -0.8035978228038969.
[I 2020-04-28 13:49:54,228] Finished trial#98 with value: -0.8035660614838805 with parameters: {'C': 0.1369223780375913}. Best is trial#74 with value: -0.8035978228038969.
[I 2020-04-28 13:49:58,062] Finished trial#99 with value: -0.7907551087341652 with parameters: {'C': 0.008994186025698328}. Best is tria

In [15]:
model=LogisticRegression(C=tuned_C['C'], solver="liblinear", max_iter=10000)
model.fit(train_ohe, target)
pred=model.predict_proba(test_ohe)[:,1]

In [31]:
from sklearn.model_selection import cross_validate
score=cross_val_score(model, train_ohe, target, scoring="roc_auc", n_jobs=-1).mean()
print(score)

0.8035991313668722


In [32]:
submission = pd.DataFrame({'id': test_id, 'target': pred})
submission.to_csv('submission.csv', index=False)

In [33]:
tuned_C['C']

0.12299878403389289