In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,299995,0,0,0,T,N,Red,Trapezoid,Snake,India,...,e027decef,1,Contributor,Freezing,k,K,dh,3,8,0
299996,299996,0,0,0,F,Y,Green,Trapezoid,Lion,Russia,...,80f1411c8,2,Novice,Freezing,h,W,MO,3,2,0
299997,299997,0,0,0,F,Y,Blue,Star,Axolotl,Russia,...,314dcc15b,3,Novice,Boiling Hot,o,A,Bn,7,9,1
299998,299998,0,1,0,F,Y,Green,Square,Axolotl,Costa Rica,...,ab0ce192b,1,Master,Boiling Hot,h,W,uJ,3,8,1


In [3]:
def split_sets(df, seed=5):
    Y = df.target.values
    X = df.drop(columns=['target'])
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        X, Y, test_size=0.15, random_state=seed)
    x_train_val = x_train_val.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=0.20, random_state=seed)
    return  x_train_val, x_train, x_test, x_val, y_train_val, y_train, y_test, y_val

x_train_val, x_train, x_test, x_val, y_train_val, y_train, y_test, y_val \
= split_sets(df, seed=5)
x_train.shape, x_val.shape, x_train_val.shape, x_test.shape

((204000, 24), (51000, 24), (255000, 24), (45000, 24))

In [4]:
def drop_column(x_train_val, x_train, x_test, x_val, col):
    x_train = x_train.drop(columns=[col])
    x_val = x_val.drop(columns=[col])
    x_train_val = x_train_val.drop(columns=[col])
    x_test = x_test.drop(columns=[col])
    return x_train_val, x_train, x_test, x_val

x_train_val, x_train, x_test, x_val = drop_column(
    x_train_val, x_train, x_test, x_val, 'id')

In [5]:
def change_col(x_train_val, x_train, x_test, x_val, col):
    x_train[col] = x_train[col].astype('category')
    x_val[col] = x_val[col].astype('category')
    x_train_val[col] = x_train_val[col].astype('category')
    x_test[col] = x_test[col].astype('category')
    categories = df[col].unique()
    x_train[col] = x_train[col].cat.rename_categories({categories[0]:1, categories[1]:0})
    x_val[col] = x_val[col].cat.rename_categories({categories[0]:1, categories[1]:0})
    x_train_val[col] = x_train_val[col].cat.rename_categories({categories[0]:1, categories[1]:0})
    x_test[col] = x_test[col].cat.rename_categories({categories[0]:1, categories[1]:0})
    return x_train_val, x_train, x_test, x_val

x_train_val, x_train, x_test, x_val = change_col(x_train_val, x_train, x_test, x_val, 'bin_3')
x_train_val, x_train, x_test, x_val = change_col(x_train_val, x_train, x_test, x_val, 'bin_4')

In [6]:
def label_encoding_with_UNK(col_train, UNK=True):
    """ Returns a label encoding "UNK" values
    """
    le = LabelEncoder()
    uniq = np.unique(col_train)
    if UNK:
        uniq = np.concatenate((np.array(["UNK"]), uniq))
    le.fit(uniq)
    return le


def transform_column(le, index, x_train, x_val, x_train_val, x_test):
    x_train[index] = le.transform(x_train[index])
    val = [x if x in le.classes_ else 'UNK' for x in x_val[index]]
    x_val[index] = le.transform(val)
    train_val = [x if x in le.classes_ else 'UNK'
                 for x in x_train_val[index]]
    x_train_val[index] = le.transform(train_val)
    test = [x if x in le.classes_ else 'UNK' for x in x_test[index]]
    x_test[index] = le.transform(test)


def hashing_trick(col, n_features=3):
    name = col.name
    col = col.astype('string')
    col_names = [name + "_" + str(i+1) for i in range(n_features)]
    h = FeatureHasher(input_type='string', n_features=n_features)
    out = h.transform(col).toarray()
    return pd.DataFrame(out, columns=col_names)

In [8]:
cols_label_encoding = '’nom_0’, ’nom_1’, ’nom_2’, ’nom_3’, ’nom_4’, ’nom_5’, ’nom_6’, \
’nom_7’, ’nom_8’, ’nom_9’, ’ord_1’, ’ord_2’'
cols_feat_hashing = [x.strip(' ') for x in cols_label_encoding.replace('’'," ").split(',')]
cols_label_encoding = ['ord_0', 'ord_3', 'ord_4', 'ord_5']
cols_numerical = ['bin_0', 'bin_1', 'bin_2', 'ord_0', 'day', 'month']
cols_feat_hashing, cols_label_encoding, cols_numerical

(['nom_0',
  'nom_1',
  'nom_2',
  'nom_3',
  'nom_4',
  'nom_5',
  'nom_6',
  'nom_7',
  'nom_8',
  'nom_9',
  'ord_1',
  'ord_2'],
 ['ord_0', 'ord_3', 'ord_4', 'ord_5'],
 ['bin_0', 'bin_1', 'bin_2', 'ord_0', 'day', 'month'])

In [9]:
x_train_hash = x_train[cols_feat_hashing].copy()
x_val_hash = x_val[cols_feat_hashing].copy()
x_test_hash = x_test[cols_feat_hashing].copy()
x_train_hash.head()

Unnamed: 0,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_1,ord_2
158960,Red,Polygon,Lion,Finland,Piano,dc07effb0,d173ac7ca,ec69236eb,1984d519a,3b967a668,Grandmaster,Freezing
40740,Green,Trapezoid,Lion,Costa Rica,Oboe,fd04a970f,2df6f79a2,12b92841d,19a7677f3,8b1f75d90,Novice,Cold
8162,Green,Triangle,Lion,Costa Rica,Piano,2cadfed8e,afebf0803,370b29add,f1de422cc,db8bacb11,Novice,Lava Hot
77267,Blue,Trapezoid,Snake,Russia,Oboe,b97f51ac4,f497b97d7,6e0e3ec45,6da888acf,b6bb569c7,Novice,Freezing
70843,Red,Trapezoid,Lion,Canada,Oboe,a93b89fc9,4daee3baf,479b4bade,06b7e7cb3,02a74a666,Contributor,Hot


In [10]:
cv = KFold(n_splits=5, random_state=13, shuffle=True)
d = [3, 5, 10, 15, 20]
x_train = pd.concat([x_train, x_val])
x_train_hash = x_train[cols_feat_hashing].copy()
x_test_hash = x_test[cols_feat_hashing].copy()
# delete feature hashing columns from the main dataframes
x_train = x_train.drop(columns=cols_feat_hashing)
x_test = x_test.drop(columns=cols_feat_hashing)
x_train.reset_index(drop=True, inplace=True)
x_test.reset_index(drop=True, inplace=True)

for f in cols_label_encoding:
    le = label_encoding_with_UNK(x_train[f].values, UNK=False)
    transform_column(le, f, x_train, x_val, x_train_val, x_test)
x_train.reset_index(drop=True, inplace=True)
x_val.reset_index(drop=True, inplace=True)
x_test.reset_index(drop=True, inplace=True)
y_train_val = np.concatenate([y_train, y_val], axis = None)
y_train_val[:10]

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 0])

In [11]:
avg_auc = {}
for di in d:
    x_train_hashed = pd.concat([hashing_trick(x_train_hash[f], n_features=di)
                                for f in cols_feat_hashing], axis=1)

    x_train_hashed = pd.concat([x_train, x_train_hashed], axis=1)
    
    X = x_train_hashed
    X.reset_index(drop=True, inplace=True)
    Y = y_train_val
    
    scores = []
    for train_index, test_index in cv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        scaler = StandardScaler() # creates the scaler
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        
        clf = LogisticRegression(random_state=13, C=1).fit(X_train, Y_train)
        y_hat = clf.predict_proba(X_test)[:,1]
        auc = roc_auc_score(Y_test, y_hat)
        scores.append(auc)
    avg_auc[di] = sum(scores)/len(scores)
avg_auc  

{3: 0.7568530617854234,
 5: 0.7630020906218318,
 10: 0.7652803337916716,
 15: 0.7651298614297974,
 20: 0.7656782724780753}

# 1.4
The Dimension with the Largest AUC was d=20

In [12]:
best_d = 20
x_final_hashed = pd.concat([hashing_trick(x_train_hash[f],n_features=best_d)
                                for f in cols_feat_hashing], axis=1)
x_final = pd.concat([x_train, x_final_hashed], axis=1)


x_test_hashed = pd.concat([hashing_trick(x_test_hash[f],n_features=best_d)
                                for f in cols_feat_hashing], axis=1)
x_test_final = pd.concat([x_test, x_test_hashed], axis=1)

scaler = StandardScaler() # creates the scaler
scaler.fit(x_final)
x_final = scaler.transform(x_final)
x_test_final = scaler.transform(x_test_final)
clf = LogisticRegression(random_state=13, C=1).fit(x_final, y_train_val)
y_hat = clf.predict_proba(x_test_final)[:,1]
roc_auc_score(y_test, y_hat)

0.7656804381005821

# 1.5
Since we are hashing, there is always a chance at collisions. However since our dimension was the largest of our tested dimensions, I think it has the relatively least amount of collisions since it has the most space and therefore my thoughts on selecting this dimension do not change. 
# 2

In [13]:
def reg_target_encoding(train, col, target):
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
    new_col = f'{col}_mean_enc'
    train[new_col] = np.zeros(train.shape[0])
    global_mean = train.target.mean()
    for train_index, test_index in kfold.split(train, np.zeros(train.shape[0])):
        mean_col = train.loc[train_index].groupby(col).target.mean()
        train[new_col].loc[test_index] = train[col].loc[test_index].map(mean_col)
    train[new_col].fillna(global_mean, inplace=True)
    return pd.DataFrame(train)

In [14]:
def mean_encoding_test(test, train, col, target):
    new_col = f'{col}_mean_enc'
    mean_col = train.groupby(col).target.mean()
    global_mean = train.target.mean()
    train[new_col] = train[col].map(mean_col)
    test[new_col] = train[col].map(mean_col)
    train[new_col].fillna(global_mean, inplace=True)
    test[new_col].fillna(global_mean, inplace=True)
    return pd.DataFrame(test), pd.DataFrame(train)

In [15]:
indicies = np.random.permutation(len(df))
split_point = len(indicies)*0.8
training_last = df.loc[indicies[:int(split_point)]].reset_index(drop=True)
testing_last = df.loc[indicies[int(split_point):]].reset_index(drop=True)
temp11, temp12 = mean_encoding_test(testing_last, training_last, 'ord_2', 'target') # test, train, col, target
temp2 = reg_target_encoding(training_last, 'ord_2', 'target') # train, col, target
temp31, temp32 = mean_encoding_test(testing_last, training_last, 'nom_2', 'target')
temp4 = reg_target_encoding(training_last, 'nom_2', 'target')

In [16]:
temp12.groupby('ord_2').ord_2_mean_enc.value_counts(), temp11.groupby('ord_2').ord_2_mean_enc.value_counts()

(ord_2        ord_2_mean_enc
 Boiling Hot  0.361933           9870
              0.362337           9847
              0.361386           9662
              0.363751           9643
              0.361360           9590
 Cold         0.258673           5445
              0.257777           5428
              0.257036           5424
              0.257394           5423
              0.257413           5343
 Freezing     0.226046          16110
              0.226013          16021
              0.224603          15866
              0.224314          15864
              0.225709          15860
 Hot          0.324862           3647
              0.326029           3575
              0.328170           3546
              0.327147           3535
              0.324897           3510
 Lava Hot     0.403880          10313
              0.402143          10256
              0.404932          10219
              0.404480          10203
              0.403650          10146
 Warm         0.28827

In [17]:
temp2.groupby('ord_2').ord_2_mean_enc.value_counts()

ord_2        ord_2_mean_enc
Boiling Hot  0.361933           9870
             0.362337           9847
             0.361386           9662
             0.363751           9643
             0.361360           9590
Cold         0.258673           5445
             0.257777           5428
             0.257036           5424
             0.257394           5423
             0.257413           5343
Freezing     0.226046          16110
             0.226013          16021
             0.224603          15866
             0.224314          15864
             0.225709          15860
Hot          0.324862           3647
             0.326029           3575
             0.328170           3546
             0.327147           3535
             0.324897           3510
Lava Hot     0.403880          10313
             0.402143          10256
             0.404932          10219
             0.404480          10203
             0.403650          10146
Warm         0.288273           3204
          

In [18]:
temp31.groupby('nom_2').nom_2_mean_enc.value_counts(), temp32.groupby('nom_2').nom_2_mean_enc.value_counts()

(nom_2    nom_2_mean_enc
 Axolotl  0.293489          2320
          0.335958          1211
          0.308423          1091
          0.244738           875
          0.318722           874
          0.361210           746
 Cat      0.293489          3351
          0.335958          1667
          0.308423          1527
          0.318722          1253
          0.244738          1250
          0.361210           990
 Dog      0.293489          2583
          0.335958          1205
          0.308423          1172
          0.244738           937
          0.318722           846
          0.361210           722
 Hamster  0.293489          1973
          0.335958           957
          0.308423           945
          0.244738           724
          0.318722           701
          0.361210           583
 Lion     0.293489          6995
          0.335958          3365
          0.308423          3018
          0.244738          2553
          0.318722          2414
          0.361210

In [19]:
temp4.groupby('nom_2').nom_2_mean_enc.value_counts()

nom_2    nom_2_mean_enc
Axolotl  0.317985           5889
         0.317288           5835
         0.320329           5821
         0.319814           5743
         0.318190           5731
Cat      0.336594           8031
         0.334112           7934
         0.335036           7914
         0.337526           7899
         0.336522           7843
Dog      0.245856           6030
         0.245367           6023
         0.242637           6009
         0.245001           5975
         0.244831           5942
Hamster  0.360801           4818
         0.363627           4766
         0.362939           4700
         0.359560           4692
         0.359138           4628
Lion     0.292341          16378
         0.293996          16276
         0.294352          16215
         0.293904          16140
         0.292852          16023
Snake    0.309028           7492
         0.308507           7394
         0.309153           7300
         0.308387           7295
         0.307045  