# Kaggle Categorical Feature Encoding Challenge
#### *Binary classification, with every feature a categorical*

In [None]:
import string
import zlib

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Load data

In [None]:
# Read the labelled and unlabelled CSVs from the working directory.
train_raw, test = map(pd.read_csv, ('train.csv', 'test.csv'))

## Combine Datasets during Feature Engineering

In [None]:
# Set the label aside, then stack the train rows on top of the test rows so
# every encoding below is applied to both sets in a single pass.
train_y = train_raw['target']
train_raw = train_raw.drop(columns=['target'])

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. As before, the original row labels
# are kept (no ignore_index), so later positional slicing is unaffected.
train = pd.concat([train_raw, test])

## Feature Engineering

#### Check unique values per feature between train and test sets

In [None]:
# Compare, column by column, how many distinct values appear in the train rows
# versus the test rows of the combined frame.
# The split point comes from train_raw (the same boundary used when the frame
# is split apart again) rather than a hard-coded row count, so the check stays
# correct if the input files change size.
n_train_rows = len(train_raw)

print("column: train unique values - test unique values")
for column in train.columns:
    # dropna=False counts NaN as a value, matching len(Series.unique()).
    n_train_unique = train[column].iloc[:n_train_rows].nunique(dropna=False)
    n_test_unique = train[column].iloc[n_train_rows:].nunique(dropna=False)
    print(f"{column}: {n_train_unique} --> {n_test_unique}")

#### Label Encode to binary features

In [None]:
# Translate the letter flags of bin_3 / bin_4 into 1 and 0.
# Series.map leaves NaN for any value that is not a key of the mapping.
mapping_bins34 = dict(T=1, F=0, Y=1, N=0)

for bin_col in ('bin_3', 'bin_4'):
    train[bin_col] = train[bin_col].map(mapping_bins34)

#### Ordinal encoding for ordinal features with low cardinality

In [None]:
# Print the distinct labels of ord_1 and ord_2 (column name first, then the
# array of values) so an explicit ordering can be written for each.
for col in ('ord_1', 'ord_2'):
    print(col, train[col].unique(), sep='\n')

In [None]:
# Hand-written level orderings; each label's rank is its position in the list.
ord1_levels = ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']
ord2_levels = ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']

ord1_dic = {label: rank for rank, label in enumerate(ord1_levels)}
ord2_dic = {label: rank for rank, label in enumerate(ord2_levels)}

# Replace each label with its rank (labels missing from the dict become NaN).
for ord_col, rank_of in (('ord_1', ord1_dic), ('ord_2', ord2_dic)):
    train[ord_col] = train[ord_col].map(rank_of)

#### For mid cardinality ordinal features, sort then map to new values

In [None]:
# Rank the distinct ord_3 labels by their sorted() order, starting at 1,
# and substitute each label with its rank.
ord3 = sorted(train['ord_3'].unique())
ord3_dic = {label: rank for rank, label in enumerate(ord3, start=1)}

train['ord_3'] = train['ord_3'].map(ord3_dic)

In [None]:
# Same scheme as ord_3: pair the sorted distinct ord_4 labels with the ranks
# 1..N and substitute each label with its rank.
ord4 = sorted(train['ord_4'].unique())
ord4_dic = dict(zip(ord4, range(1, len(ord4) + 1)))

train['ord_4'] = train['ord_4'].map(ord4_dic)

#### For the high-cardinality ordinal feature, sum the alphabet indices of its 2 letters

In [None]:
# Score every ord_5 string as the sum of the 1-based positions of its
# characters inside string.ascii_letters; a character that is not an ASCII
# letter contributes 0. The raw column is dropped afterwards.
letter_rank = {ch: pos for pos, ch in enumerate(string.ascii_letters, start=1)}

train['ord_5_oe_add'] = train['ord_5'].apply(
    lambda text: sum(letter_rank.get(ch, 0) for ch in text)
)
train = train.drop(columns=['ord_5'])

#### One Hot Encoding to low cardinality nominal features

In [None]:
# One-hot encode nom_0..nom_4. Each dummy column is prefixed with its source
# column name, and the first level of every feature is dropped.
low_card_nom = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']

train = pd.get_dummies(
    train,
    columns=low_card_nom,
    prefix=low_card_nom,
    drop_first=True,
)

#### Hashing to high cardinality nominal features

In [None]:
# Bucket each high-cardinality nominal value into one of 5000 integer ids and
# replace the raw column with its hash_<col> counterpart.
#
# The built-in hash() is salted per interpreter process for str objects
# (PYTHONHASHSEED), so it would assign different buckets on every kernel
# restart and make the features irreproducible between runs.
# zlib.crc32 is a fixed function of the bytes, so bucket ids are stable.
high_card_nom = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

for col in high_card_nom:
    train[f'hash_{col}'] = train[col].apply(
        lambda x: zlib.crc32(str(x).encode('utf-8')) % 5000
    )

# Drop the raw columns once all hashed columns exist; the resulting column
# order is the same as dropping them one by one inside the loop.
train = train.drop(columns=high_card_nom)

## Split Train and Test apart after feature engineering

In [None]:
# The combined frame holds the labelled rows first, then the test rows, so
# cutting it at the original train length recovers the two sets by position.
n_labelled = len(train_raw)
train_final = train.iloc[:n_labelled]
test_final = train.iloc[n_labelled:]

## Split Training Set to Train/Test

In [None]:
# Hold out 20% of the labelled rows for validation.
# A fixed random_state makes the shuffle, and therefore every score computed
# on the hold-out rows, reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    train_final, train_y, test_size=0.2, random_state=42
)

## Grid Search

In [None]:
# from sklearn.model_selection import GridSearchCV
# grid={"C":np.logspace(-3,3,7), "penalty":["l2"], "solver":["lbfgs"]} # grid values must be lists; lbfgs supports only l2 (ridge), so l1 (lasso) needs liblinear or saga
# clf=LogisticRegression()
# clf_cv=GridSearchCV(clf,grid,cv=10)
# clf_cv.fit(X_train,y_train)

In [None]:
# print("tuned hyperparameters :(best parameters) ",clf_cv.best_params_)
# print("accuracy :",clf_cv.best_score_)

## Build Final Model

In [None]:
# Logistic regression settings: C=0.1 with the lbfgs solver, up to 5000
# iterations, and n_jobs=-1.
logreg_params = {
    "C": 0.1,
    "solver": "lbfgs",
    "max_iter": 5000,
    "n_jobs": -1,
}
clf = LogisticRegression(**logreg_params)

In [None]:
# Train on the 80% split; left as the cell's final expression so the notebook
# still displays the fitted estimator.
clf.fit(X=X_train, y=y_train)

In [None]:
# Predicted probabilities for the hold-out rows; only column 1 is kept
# (presumably the positive class, assuming the target is coded 0/1 - confirm).
holdout_proba = clf.predict_proba(X_test)
y_pred = holdout_proba[:, 1]

## Final Submission

In [46]:
# Score the competition test rows and keep column 1 of the probability matrix,
# the same column used for the hold-out predictions.
test_proba = clf.predict_proba(test_final)
final_pred = test_proba[:, 1]

In [49]:
# Pair every test id with its predicted probability and write the result to
# submission.csv without the DataFrame index.
submission = pd.DataFrame({"id": test['id'], "target": final_pred})
submission.to_csv("submission.csv", index=False)