In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import KFold
from sklearn import base

In [2]:
df_train=pd.read_csv('train.csv')
df_test=pd.read_csv('test.csv')

In [3]:
X=df_train.drop(['target'],axis=1)
y=df_train['target']

In [6]:
def logistic(X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)
    lr=LogisticRegression(max_iter=10000)
    lr.fit(X_train,y_train)
    y_pre=lr.predict(X_test)
    print('Accuracy : ',accuracy_score(y_test,y_pre))
    print('auc: ', auc(y_test,y_pre))

## 1. Label Encoding ##

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
%%time

train=pd.DataFrame()
label=LabelEncoder()
for c in  X.columns:
    if(X[c].dtype=='object'):
        train[c]=label.fit_transform(X[c])
    else:
        train[c]=X[c]

Wall time: 926 ms


In [7]:
%%time
logistic(train,y)

Accuracy :  0.69065
auc:  0.5072521140446195
Wall time: 2.38 s


## 2. On hot encoding ##

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
%%time 

one=OneHotEncoder()

one.fit(X)
train=one.transform(X)

Wall time: 1.52 s


In [10]:
%%time
logistic(train,y)

Accuracy :  0.7594
auc:  0.6810257645636164
Wall time: 1min 20s


## 3.Feature hashing (a.k.a the hashing trick) ##

In [11]:
from sklearn.feature_extraction import FeatureHasher

In [12]:
%%time

X_train_hash=X.copy()
for c in X.columns:
    X_train_hash[c]=X[c].astype('str')      
hashing=FeatureHasher(input_type='string')
train=hashing.transform(X_train_hash.values)

Wall time: 4.4 s


In [13]:
%%time
logistic(train,y)

Accuracy :  0.7512666666666666
auc:  0.6690547832496434
Wall time: 3min 43s


## 4.Encoding categories with dataset statistics ##

In [14]:
%%time

X_train_stat=X.copy()
for c in X_train_stat.columns:
    if(X_train_stat[c].dtype=='object'):
        X_train_stat[c]=X_train_stat[c].astype('category')
        counts=X_train_stat[c].value_counts()
        counts=counts.sort_index()
        counts=counts.fillna(0)
        counts += np.random.rand(len(counts))/1000
        X_train_stat[c].cat.categories=counts

Wall time: 696 ms


In [15]:
logistic(X_train_stat,y)

Accuracy :  0.6946
auc:  0.49998800297526214


## 5.Encoding cyclic features + On hot encoding on other features ##

In [17]:
%%time

X_train_cyclic=X.copy()
columns=['day','month']
for col in columns:
    X_train_cyclic[col+'_sin']=np.sin((2*np.pi*X_train_cyclic[col])/max(X_train_cyclic[col]))
    X_train_cyclic[col+'_cos']=np.cos((2*np.pi*X_train_cyclic[col])/max(X_train_cyclic[col]))
X_train_cyclic=X_train_cyclic.drop(columns,axis=1)

Wall time: 266 ms


In [18]:
one=OneHotEncoder()

one.fit(X_train_cyclic)
train=one.transform(X_train_cyclic)

In [22]:
logistic(train,y)

Accuracy :  0.7594
auc:  0.6810257645636164


## 6. Target encoding ##

In [23]:
%%time

X_target=df_train.copy()
X_target['day']=X_target['day'].astype('object')
X_target['month']=X_target['month'].astype('object')
for col in X_target.columns:
    if (X_target[col].dtype=='object'):
        target= dict ( X_target.groupby(col)['target'].agg('sum')/X_target.groupby(col)['target'].agg('count'))
        X_target[col]=X_target[col].replace(target).values

Wall time: 2min 38s


In [24]:
logistic(X_target.drop('target',axis=1),y)

Accuracy :  0.6946166666666667
auc:  0.5


In [25]:
X['target']=y
cols=X.drop(['target','id'],axis=1).columns

In [27]:
%%time

X_fold=X.copy()
X_fold[['ord_0','day','month']]=X_fold[['ord_0','day','month']].astype('object')
X_fold[['bin_3','bin_4']]=X_fold[['bin_3','bin_4']].replace({'Y':1,'N':0,'T':1,"F":0})
kf = KFold(n_splits = 5, shuffle = True, random_state=2019)
for train_ind,val_ind in kf.split(X):
    for col in cols:
        if(X_fold[col].dtype=='object'):
            replaced=dict(X.iloc[train_ind][[col,'target']].groupby(col)['target'].mean())
            X_fold.loc[val_ind,col]=X_fold.iloc[val_ind][col].replace(replaced).values

Wall time: 2min 33s
