In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

import scipy
import optuna

In [2]:
# Read the raw competition files from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the label and the submission ids before removing them from the frames.
target = train['target']
test_id = test['id']

# 'target' is the label and 'id' is not a feature, so neither stays in the frames.
train = train.drop(columns=['target', 'id'])
test = test.drop(columns=['id'])

In [3]:
# Encode the letter-valued binary columns bin_3 and bin_4 as 0/1.
bin_dict = {'T': 1, 'F': 0, 'Y': 1, 'N': 0}

# Apply the same mapping to both frames; values absent from bin_dict become NaN.
for frame in (train, test):
    for col in ('bin_3', 'bin_4'):
        frame[col] = frame[col].map(bin_dict)

In [4]:
# Ordered dtypes for the ordinal features; categories are listed from lowest to highest.
ord_1 = CategoricalDtype(
    categories=['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'],
    ordered=True,
)
ord_2 = CategoricalDtype(
    categories=['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'],
    ordered=True,
)
# Single lowercase letters 'a' through 'o'.
ord_3 = CategoricalDtype(categories=list('abcdefghijklmno'), ordered=True)
# Single uppercase letters 'A' through 'Z'.
ord_4 = CategoricalDtype(categories=list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), ordered=True)

In [5]:
# Cast each ordinal column to its ordered categorical dtype, in train and test alike.
ordinal_dtypes = {'ord_1': ord_1, 'ord_2': ord_2, 'ord_3': ord_3, 'ord_4': ord_4}
for frame in (train, test):
    for col, dtype in ordinal_dtypes.items():
        frame[col] = frame[col].astype(dtype)

In [6]:
# Replace each ordered categorical with its integer code (position in the category list).
# pandas encodes missing values as -1.
for frame in (train, test):
    for col in ('ord_1', 'ord_2', 'ord_3', 'ord_4'):
        frame[col] = frame[col].cat.codes

In [7]:
# Stack train on top of test so both are one-hot encoded with the same set of columns.
all_data = pd.concat([train, test], axis=0)

In [8]:
# One-hot encode every column with sparse dummies, then convert the result to a
# SciPy CSR matrix to avoid memory issues when training later.
encoded = (
    pd.get_dummies(all_data, columns=all_data.columns, sparse=True)
    .sparse.to_coo()
    .tocsr()
)

In [9]:
# Rows were stacked train-first, so the first len(train) rows are the training set
# and the remainder is the test set.
n_train = len(train)
train_ohe = encoded[:n_train]
test_ohe = encoded[n_train:]

In [10]:
# Baseline: logistic regression with the "liblinear" solver and C fixed at 0.1.
# C is tuned with Optuna in a later cell.
model = LogisticRegression(solver="liblinear", C=0.1, max_iter=10000)
model.fit(train_ohe, target)
# Column 1 of predict_proba is the probability of model.classes_[1].
pred = model.predict_proba(test_ohe)[:, 1]

In [11]:
# Cross-validated ROC AUC (the metric Kaggle uses to score submissions), averaged over folds.
fold_auc = cross_val_score(model, train_ohe, target, scoring='roc_auc', n_jobs=-1)
score = fold_auc.mean()
print(score)

0.8035356471423679


In [16]:
# Tune the regularization strength C with Optuna.
def objective(trial):
    """Return the negated mean cross-validated ROC AUC for a sampled C.

    C is drawn log-uniformly from [1e-9, 10]. The score is negated because
    the study below uses Optuna's default direction, which minimizes.
    """
    # The lower bound was previously written as 10e-10, which equals 1e-9.
    # suggest_float(log=True) replaces the deprecated suggest_loguniform.
    C = trial.suggest_float('C', 1e-9, 10, log=True)
    model = LogisticRegression(C=C, max_iter=10000, solver='liblinear')
    score = -cross_val_score(model, train_ohe, target, scoring='roc_auc', n_jobs=-1).mean()
    return score

# Seed the sampler so a fresh kernel reproduces the same sequence of trials.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)
# best_params is a dict keyed by parameter name; later cells read tuned_C['C'].
tuned_C = study.best_params

[I 2020-04-28 01:22:49,238] Finished trial#0 with value: -0.6124161477367991 with parameters: {'C': 9.052164443557401e-08}. Best is trial#0 with value: -0.6124161477367991.
[I 2020-04-28 01:22:50,076] Finished trial#1 with value: -0.7067790799463448 with parameters: {'C': 1.8029132225858717e-05}. Best is trial#1 with value: -0.7067790799463448.
[I 2020-04-28 01:22:50,872] Finished trial#2 with value: -0.6377306682949584 with parameters: {'C': 1.4928911569366931e-06}. Best is trial#1 with value: -0.7067790799463448.
[I 2020-04-28 01:23:28,760] Finished trial#3 with value: -0.7884433400416202 with parameters: {'C': 7.331541845961787}. Best is trial#3 with value: -0.7884433400416202.
[I 2020-04-28 01:23:29,525] Finished trial#4 with value: -0.6612803055360361 with parameters: {'C': 3.4835538616631648e-06}. Best is trial#3 with value: -0.7884433400416202.
[I 2020-04-28 01:23:33,270] Finished trial#5 with value: -0.789826676827512 with parameters: {'C': 0.008200744838816776}. Best is trial#

[I 2020-04-28 01:30:16,564] Finished trial#47 with value: -0.8032757732743125 with parameters: {'C': 0.07920526724210451}. Best is trial#37 with value: -0.8035709820246815.
[I 2020-04-28 01:30:53,827] Finished trial#48 with value: -0.7885046889598188 with parameters: {'C': 7.186420414834248}. Best is trial#37 with value: -0.8035709820246815.
[I 2020-04-28 01:30:59,299] Finished trial#49 with value: -0.7986114578734036 with parameters: {'C': 0.02364674915469299}. Best is trial#37 with value: -0.8035709820246815.
[I 2020-04-28 01:31:23,231] Finished trial#50 with value: -0.794663035809732 with parameters: {'C': 1.551486067209407}. Best is trial#37 with value: -0.8035709820246815.
[I 2020-04-28 01:31:33,412] Finished trial#51 with value: -0.803598942953555 with parameters: {'C': 0.12131122803096789}. Best is trial#51 with value: -0.803598942953555.
[I 2020-04-28 01:31:49,676] Finished trial#52 with value: -0.7996579568889842 with parameters: {'C': 0.5558999840663859}. Best is trial#51 wit

[I 2020-04-28 01:39:53,337] Finished trial#94 with value: -0.8035811911524317 with parameters: {'C': 0.10919163030227791}. Best is trial#71 with value: -0.8035990282437903.
[I 2020-04-28 01:39:58,200] Finished trial#95 with value: -0.797494760282544 with parameters: {'C': 0.01992614699201391}. Best is trial#71 with value: -0.8035990282437903.
[I 2020-04-28 01:40:01,514] Finished trial#96 with value: -0.7865276623681889 with parameters: {'C': 0.006071936381957073}. Best is trial#71 with value: -0.8035990282437903.
[I 2020-04-28 01:40:07,691] Finished trial#97 with value: -0.8007500799261035 with parameters: {'C': 0.03505995356987197}. Best is trial#71 with value: -0.8035990282437903.
[I 2020-04-28 01:40:15,829] Finished trial#98 with value: -0.8029487311935048 with parameters: {'C': 0.06657542089488627}. Best is trial#71 with value: -0.8035990282437903.
[I 2020-04-28 01:40:30,459] Finished trial#99 with value: -0.8013394157784817 with parameters: {'C': 0.3707677113838552}. Best is trial

In [13]:
# Refit on the full training matrix with the tuned C and predict the test set.
best_C = tuned_C['C']
model = LogisticRegression(solver="liblinear", C=best_C, max_iter=10000)
model.fit(train_ohe, target)
pred = model.predict_proba(test_ohe)[:, 1]

In [14]:
from sklearn.model_selection import cross_validate

# Mean cross-validated ROC AUC of the tuned model.
# cross_validate returns a dict of per-fold arrays, and "test_score" holds the fold AUCs.
# cross_val_score returns a plain array, so indexing it with "test_score" raises IndexError.
score = cross_validate(model, train_ohe, target, scoring="roc_auc")["test_score"].mean()
print(score)

0.8035653984165766


In [15]:
# Pair each test id with its predicted probability and write the submission file
# without the DataFrame index.
submission = pd.DataFrame({'id': test_id}).assign(target=pred)
submission.to_csv('submission.csv', index=False)