In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

import scipy
import optuna

In [2]:
# Read the raw competition files
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Pull the label and the test row identifiers out of the frames, then discard
# the train identifier, so that only feature columns remain in `train`/`test`
target = train.pop('target')
test_id = test.pop('id')
train = train.drop(columns='id')

In [3]:
# Lookup table that turns the textual flags T/F and Y/N into 1/0
bin_dict = {'T': 1, 'F': 0, 'Y': 1, 'N': 0}

# Apply the lookup to the two text-valued binary columns of each frame
for frame in (train, test):
    for column in ('bin_3', 'bin_4'):
        frame[column] = frame[column].map(bin_dict)

In [4]:
# Ordered categorical dtypes for the ordinal features; each category list is
# written from lowest to highest rank.
ord_1 = CategoricalDtype(
    categories=['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'],
    ordered=True,
)
ord_2 = CategoricalDtype(
    categories=['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'],
    ordered=True,
)
# ord_3 runs over the lowercase letters a..o, ord_4 over the uppercase A..Z
ord_3 = CategoricalDtype(categories=list('abcdefghijklmno'), ordered=True)
ord_4 = CategoricalDtype(categories=list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), ordered=True)

In [5]:
# Cast every ordinal column to its ordered categorical dtype, applying the
# same dtype to the train and test frames
ordinal_dtypes = {'ord_1': ord_1, 'ord_2': ord_2, 'ord_3': ord_3, 'ord_4': ord_4}
for frame in (train, test):
    for column, dtype in ordinal_dtypes.items():
        frame[column] = frame[column].astype(dtype)

In [6]:
# Replace each ordinal category by its integer code (its position in the
# category order); done identically for the train and test frames
for frame in (train, test):
    for column in ('ord_1', 'ord_2', 'ord_3', 'ord_4'):
        frame[column] = frame[column].cat.codes

In [7]:
# Stack train on top of test so both are one-hot encoded with the same columns
all_data = pd.concat([train, test], axis=0)

In [8]:
# One-hot encode every column (sparse-backed), then convert the result to a
# SciPy CSR matrix via its COO representation
encoded = (
    pd.get_dummies(all_data, columns=all_data.columns, sparse=True)
    .sparse.to_coo()
    .tocsr()
)

In [9]:
# Undo the stacking: the first len(train) rows are the training rows,
# everything after them belongs to the test set
n_train = len(train)
train_ohe = encoded[:n_train]
test_ohe = encoded[n_train:]

In [10]:
# Baseline logistic regression with a fixed regularisation constant C=0.1
model = LogisticRegression(
    C=0.1,
    solver="liblinear",
    max_iter=10000,
)
model.fit(train_ohe, target)

# Keep column 1 of the predicted class probabilities for the test rows
test_proba = model.predict_proba(test_ohe)
pred = test_proba[:, 1]

In [11]:
# Cross-validated ROC AUC of the current model.
# cross_validate returns a dict of per-fold arrays, so the fold scores are read
# from its "test_score" entry. (cross_val_score returns a bare array of scores
# and cannot be indexed with that key.)
cv_results = cross_validate(model, train_ohe, target, scoring="roc_auc", n_jobs=-1)
score = cv_results["test_score"].mean()
print(score)

0.8035368970938169


In [12]:
def objective(trial):
    """Optuna objective: negative mean cross-validated ROC AUC for a sampled C.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the regularisation constant ``C`` log-uniformly
        between 1e-9 and 10.

    Returns
    -------
    float
        The mean ROC AUC over the cross-validation folds, negated so that a
        minimising study searches for the highest AUC.
    """
    sampled_C = trial.suggest_loguniform('C', 1e-9, 10)
    candidate = LogisticRegression(C=sampled_C, max_iter=10000, solver='liblinear')
    fold_aucs = cross_val_score(candidate, train_ohe, target, scoring='roc_auc', n_jobs=-1)
    return -fold_aucs.mean()

# Search for the C that minimises the objective (i.e. maximises ROC AUC).
# The sampler is given a fixed seed so that a fresh "Restart & Run All"
# proposes the same sequence of trial parameters.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# best_params is a dict keyed by parameter name, so the value is tuned_C['C']
tuned_C = study.best_params

[I 2020-04-27 00:45:48,981] Finished trial#0 with value: -0.6209980775989218 with parameters: {'C': 1.607322614889113e-08}. Best is trial#0 with value: -0.6209980775989218.
[I 2020-04-27 00:45:50,083] Finished trial#1 with value: -0.6230193522015571 with parameters: {'C': 1.200674810376953e-07}. Best is trial#1 with value: -0.6230193522015571.
[I 2020-04-27 00:46:02,358] Finished trial#2 with value: -0.8028266975200701 with parameters: {'C': 0.2291437228447997}. Best is trial#2 with value: -0.8028266975200701.
[I 2020-04-27 00:46:03,265] Finished trial#3 with value: -0.6217697557441343 with parameters: {'C': 5.547748576603366e-08}. Best is trial#2 with value: -0.8028266975200701.
[I 2020-04-27 00:46:04,057] Finished trial#4 with value: -0.6715140596154301 with parameters: {'C': 4.1542162523346475e-06}. Best is trial#2 with value: -0.8028266975200701.
[I 2020-04-27 00:46:07,196] Finished trial#5 with value: -0.7866609496453064 with parameters: {'C': 0.0061418381506284205}. Best is trial

In [13]:
# Refit on the full training matrix using the C found by the Optuna study
best_C = tuned_C['C']
model = LogisticRegression(
    C=best_C,
    solver="liblinear",
    max_iter=10000,
)
model.fit(train_ohe, target)

# Keep column 1 of the predicted class probabilities for the test rows
test_proba = model.predict_proba(test_ohe)
pred = test_proba[:, 1]

In [14]:
# Cross-validated ROC AUC of the tuned model.
# cross_validate returns a dict of per-fold arrays, so the fold scores are read
# from its "test_score" entry. (cross_val_score returns a bare array of scores
# and cannot be indexed with that key.)
cv_results = cross_validate(model, train_ohe, target, scoring="roc_auc", n_jobs=-1)
score = cv_results["test_score"].mean()
print(score)

0.8036004629447024


In [15]:
# Pair each test id with its predicted probability and write the result
# to disk without the frame's index column
submission = pd.DataFrame({'id': test_id}).assign(target=pred)
submission.to_csv('submission.csv', index=False)