In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

import scipy
import optuna

In [2]:
# Read the raw train and test files.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Pull the submission ids and the label out of their frames; `pop` returns
# the column and removes it from the frame in one step.
test_id = test.pop('id')
target = train.pop('target')

# The training ids are not a feature either, so discard them as well.
del train['id']

In [3]:
# Recode the letter flags in bin_3 and bin_4 as numeric 1/0 values.
bin_dict = dict(T=1, F=0, Y=1, N=0)

# Apply the same lookup to both columns of both frames (train first, then test).
for frame in (train, test):
    for col in ('bin_3', 'bin_4'):
        frame[col] = frame[col].map(bin_dict)

In [4]:
# Ordered categorical dtypes that fix the rank of each ordinal feature's levels
# (lowest level first).
ord_1 = CategoricalDtype(
    ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'],
    ordered=True,
)
ord_2 = CategoricalDtype(
    ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'],
    ordered=True,
)
# Lowercase letters 'a' through 'o'.
ord_3 = CategoricalDtype(list('abcdefghijklmno'), ordered=True)
# Uppercase letters 'A' through 'Z'.
ord_4 = CategoricalDtype(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), ordered=True)

In [5]:
# Cast each ordinal column to its ordered categorical dtype, first in the
# train frame and then in the test frame.
ordinal_dtypes = {
    'ord_1': ord_1,
    'ord_2': ord_2,
    'ord_3': ord_3,
    'ord_4': ord_4,
}
for frame in (train, test):
    for col, dtype in ordinal_dtypes.items():
        frame[col] = frame[col].astype(dtype)

In [6]:
# Replace each ordered categorical column by its integer category codes,
# for the train frame first and then the test frame.
# NOTE: this cell expects the columns to still be categorical, so it cannot
# be re-run without re-running the preceding cast.
for frame in (train, test):
    for col in ('ord_1', 'ord_2', 'ord_3', 'ord_4'):
        frame[col] = frame[col].cat.codes

In [7]:
# Stack train on top of test so that one-hot encoding sees every category
# level from both sets and produces identical columns for each.
all_data = pd.concat([train, test], axis=0)

In [8]:
# One-hot encode every column with pandas.get_dummies (sparse-backed output)
# and convert the result straight to a SciPy CSR matrix, which keeps memory
# usage manageable when the model is trained later.
encoded = (
    pd.get_dummies(all_data, columns=all_data.columns, sparse=True)
    .sparse.to_coo()
    .tocsr()
)

In [9]:
# Split the encoded matrix back into its train and test parts: the first
# len(train) rows came from train, the remainder from test.
n_train_rows = len(train)
train_ohe, test_ohe = encoded[:n_train_rows], encoded[n_train_rows:]

In [10]:
# Baseline: logistic regression with the "liblinear" solver and C fixed at 0.1
# (C is tuned with optuna further down).
model = LogisticRegression(solver="liblinear", C=0.1, max_iter=10000)
model.fit(train_ohe, target)

# Keep only the predicted probability of the positive class (column 1).
pred = model.predict_proba(test_ohe)[:, 1]

In [11]:
# Evaluate with ROC AUC, the metric Kaggle uses to score submissions.
fold_scores = cross_val_score(
    model,
    train_ohe,
    target,
    scoring='roc_auc',
    n_jobs=-1,
)
score = fold_scores.mean()
print(score)

0.8035356471423679


In [None]:
# tune C
def objective(trial):
    """Optuna objective: negated mean cross-validated ROC AUC for a sampled C.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the regularisation parameter 'C' on a log scale.

    Returns
    -------
    float
        Minus the mean ROC AUC over the cross-validation folds, so that
        minimising this value maximises the AUC.
    """
    # Note: the lower bound 10e-10 is the same number as 1e-9.
    c_value = trial.suggest_loguniform('C', 10e-10, 10)
    candidate = LogisticRegression(solver='liblinear', C=c_value, max_iter=10000)
    fold_auc = cross_val_score(
        candidate, train_ohe, target, scoring='roc_auc', n_jobs=-1
    )
    return -fold_auc.mean()

# The objective returns the negated AUC, so the study minimises it
# ('minimize' is also optuna's default direction).
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Best parameters as a dict; the tuned value is read later as tuned_C['C'].
tuned_C = study.best_params

[I 2020-04-28 01:12:49,339] Finished trial#0 with value: -0.6269912557824503 with parameters: {'C': 8.36495811733918e-07}. Best is trial#0 with value: -0.6269912557824503.
[I 2020-04-28 01:12:50,567] Finished trial#1 with value: -0.6374593926169917 with parameters: {'C': 1.474885074500612e-06}. Best is trial#1 with value: -0.6374593926169917.
[I 2020-04-28 01:12:51,499] Finished trial#2 with value: -0.6162433304956878 with parameters: {'C': 2.7350116584933465e-07}. Best is trial#1 with value: -0.6374593926169917.
[I 2020-04-28 01:13:07,086] Finished trial#3 with value: -0.8009083710430895 with parameters: {'C': 0.41448134018812627}. Best is trial#3 with value: -0.8009083710430895.
[I 2020-04-28 01:13:11,302] Finished trial#4 with value: -0.7942049218948997 with parameters: {'C': 0.013012229791217578}. Best is trial#3 with value: -0.8009083710430895.
[I 2020-04-28 01:13:11,905] Finished trial#5 with value: -0.610964589780365 with parameters: {'C': 2.3135971790117303e-08}. Best is trial#

In [None]:
# Refit on the full training matrix with the C value found by the study.
best_c = tuned_C['C']
model = LogisticRegression(solver="liblinear", C=best_c, max_iter=10000)
model.fit(train_ohe, target)

# Keep only the predicted probability of the positive class (column 1).
pred = model.predict_proba(test_ohe)[:, 1]

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validated ROC AUC of the tuned model.
# cross_validate returns a dict of per-fold arrays, so the fold scores are
# read from its "test_score" entry. (cross_val_score, used here previously,
# returns a plain array and cannot be indexed with "test_score".)
cv_results = cross_validate(model, train_ohe, target, scoring="roc_auc")
score = cv_results["test_score"].mean()
print(score)

In [None]:
# Assemble the submission table: one row per test id with the predicted
# probability as 'target', written without the index column.
submission = test_id.to_frame(name='id').assign(target=pred)
submission.to_csv('submission.csv', index=False)