In [1]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from math import sin,log,pow,cos
import lightgbm as lgb
import datetime
from sklearn.linear_model import LogisticRegression  
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from scipy.sparse import csr_matrix,coo_matrix, hstack

import gc
pd.options.mode.chained_assignment = None


In [2]:
def choice(train,target,col,label,coo=True):
    if label == 'one_hot':
        X = one_hot_ecoder(train,col)
    if label == 'label':
        X = label_encoder(train,col).reshape((-1,1))
    if label == 'mean_target':
        X = mean_target(train,target,col).reshape((-1,1))
    if label == 'siner':
        X = cyclical_siner_encoder(train,col).reshape((-1,1))
    if label == 'coser':
        X = cyclical_coser_encoder(train,col).reshape((-1,1))
    if coo:
        return(csr_matrix(X))
    else:
        return(X)
    
def one_hot_ecoder(train,col):
    return(pd.get_dummies(train[col],prefix_sep='_',columns=col,drop_first=True))

def label_encoder(train,col):
    lbl_enc = LabelEncoder()
    return(lbl_enc.fit_transform(train[col].values))

def cyclical_coser_encoder(train,col):
    period = train[col].max()
    return((train[col].astype(float)).transform(coser,period=period).values)

def cyclical_siner_encoder(train,col):
    period = train[col].max()
    return((train[col].astype(float)).transform(coser,period=period).values)

def mean_target(train,target,col):
        vector = np.zeros(len(train[col]))
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        for trn_idx, val_idx in folds.split(train.values, target.values):
            trn_f, trn_tgt = train[col].iloc[trn_idx], target.iloc[trn_idx]
            val_f, val_tgt = train[col].iloc[val_idx], target.iloc[val_idx]
            trn_tf, val_tf = target_encode(trn_series=trn_f, 
                                             tst_series=val_f, 
                                             target=trn_tgt, 
                                             min_samples_leaf=200, 
                                             smoothing=20,
                                             noise_level=0)
            vector[val_idx]=val_tf
            vector
        return(vector)

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in tqdm_notebook(df.columns):
        gc.collect()
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def siner(x,period):
    return(sin(2*np.pi*x/period))
def coser(x,period):
    return(cos(2*np.pi*x/period))

    


In [3]:
path = '../input/cat-in-the-dat/'

train = pd.read_csv(f'{path}train.csv')
test = pd.read_csv(f'{path}test.csv')

#find test column with unseen values 
difference_set = [len(set(test[col].values) - set(train[col].values)) for col in test.columns]
difference_set = test.columns[[x>0 and x<1000 for x in difference_set]].tolist()

#create dictionary with value of unseen data
difference_values = {col: {'value': set(test[col].values) - set(train[col].values)} for col in difference_set}

#replace it with most frequent value in training set
for col in difference_values:
    test.loc[test[col].isin(difference_values[col]['value']),col]=train[col].value_counts().idxmax()

drop_col = ['id','target']
target=train['target']

train = train.drop(drop_col, axis=1)
test = test.drop(['id'], axis=1)


In [4]:
first_to_check = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2',
       'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

In [5]:
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True,random_state=10)


In [6]:
category = first_to_check+['day','month']

In [7]:
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True,random_state=0)
best_param={'C':1,'max_iter':100,'solver':'lbfgs','n_jobs':4}

Result_Matrix = pd.DataFrame(np.zeros((len(category),5)),columns = ['one_hot','label','siner','coser','mean_target'],index=category)


for col in tqdm_notebook(first_to_check):
    for transformer in ['one_hot','label','mean_target']:
        score=0
        X = choice(train,target,col,transformer,coo=True)
        for trn_idx, val_idx in folds.split(X, target):
            train_x, train_y = X[trn_idx,:], target[trn_idx]
            valid_x, valid_y = X[val_idx,:], target[val_idx]
            model = LogisticRegression(**best_param, random_state=0)
            model.fit(train_x, train_y)
            score += roc_auc_score(valid_y,model.predict_proba(valid_x)[:,1])/n_fold
            
        Result_Matrix.loc[col,transformer]=score

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [8]:
for col in tqdm_notebook(['day', 'month']):
    for transformer in ['one_hot','label','mean_target','siner','coser']:
        score=0
        X = choice(train,target,col,transformer,coo=True)
        for trn_idx, val_idx in folds.split(X, target):
            train_x, train_y = X[trn_idx,:], target[trn_idx]
            valid_x, valid_y = X[val_idx,:], target[val_idx]
            model = LogisticRegression(**best_param, random_state=0)
            model.fit(train_x, train_y)
            score += roc_auc_score(valid_y,model.predict_proba(valid_x)[:,1])/n_fold
            
        Result_Matrix.loc[col,transformer]=score

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [9]:
Base_Encoding = Result_Matrix.idxmax(axis=1)

In [10]:
train_minification = train[category].copy()
gc.collect()

235

In [11]:
ord_en_list = [col for col in train.columns if col[:3] in ['ord']]
mapper_ord_1 = {'Novice': 1, 'Contributor': 2, 'Expert': 3, 'Master': 4, 'Grandmaster': 5}
mapper_ord_2 = {'Freezing': 1, 'Cold': 2, 'Warm': 3, 'Hot': 4,'Boiling Hot': 5, 'Lava Hot': 6}
mapper_ord_3 = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 
                'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15}
mapper_ord_4 = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 
                'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15,
                'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 
                'W': 23, 'X': 24, 'Y': 25, 'Z': 26}
ordinal_data_train = train[ord_en_list].copy()

for col, mapper in zip(['ord_1', 'ord_2', 'ord_3', 'ord_4'], [mapper_ord_1, mapper_ord_2, mapper_ord_3, mapper_ord_4]):
    ordinal_data_train[col] = ordinal_data_train[col].replace(mapper)

ord_5 = sorted(list(set(ordinal_data_train['ord_5'].values)))
ord_5 = dict(zip(ord_5, range(len(ord_5))))
ordinal_data_train['ord_5'] = ordinal_data_train['ord_5'].apply(lambda x: ord_5[x]).astype(int).values
ordinal_data_train = csr_matrix(ordinal_data_train)

In [12]:
# for col in train_minification.columns:
#     train_minification[col] = label_encoder(train_minification,col)
    
# train_minification = reduce_mem_usage(train_minification)

In [13]:
class encoder_finder:
    
    def __init__(self,X,target,ordinal_data,Base_Encoding,Matrix_transform,fold,n_fold,param):
        self.X=X
        self.target=target
        self.Base_Encoding=Base_Encoding
        self.Matrix_transform=Matrix_transform
        self.fold=fold
        self.param=param
        self.n_fold=n_fold
        self.ordinal_data=ordinal_data

    def matrix_sparse_calculator(self,Encoding_list):
        gc.collect()
        for N in range(Encoding_list.shape[0]):
            gc.collect()
            col = Encoding_list.index[N]
            label = Encoding_list[N]
            if N == 0:
                Matrix = choice(train = self.X,target = self.target,col = col,label = label,coo=True)
            else:
                Matrix = hstack([Matrix,choice(train = self.X,target = self.target,col = col,label = label,coo=True)],format='csr')
        Matrix = hstack([Matrix,self.ordinal_data],format='csr')
        gc.collect()
        return(Matrix)
    
    def cv_score(self,Matrix):
        gc.collect()
        score = 0
        for trn_idx, val_idx in self.fold.split(Matrix, self.target):
            gc.collect()
            train_x, train_y = Matrix[trn_idx,:], self.target[trn_idx]
            valid_x, valid_y = Matrix[val_idx,:], self.target[val_idx]
            model = LogisticRegression(**self.param, random_state=0)
            model.fit(train_x, train_y)
            score += roc_auc_score(valid_y,model.predict_proba(valid_x)[:,1])/n_fold
        return(score)
    
    def encoder_result(self):
        self.Score_Matrix = self.Matrix_transform.copy()
        variable = self.Matrix_transform.index.tolist()
        transformer = self.Matrix_transform.columns
        for col in tqdm_notebook(variable):
            for encoder in tqdm_notebook(transformer):
                gc.collect()
                if self.Matrix_transform.loc[col,encoder]!= (-999): 
                    New_Encoder = self.Base_Encoding.copy()
                    New_Encoder.loc[col] = encoder
                    self.Score_Matrix.loc[col,encoder] = self.cv_score(self.matrix_sparse_calculator(New_Encoder))
                else:
                    self.Score_Matrix.loc[col,encoder] = 0
                print('Variable : {} ; Encoder : {} ; Score : {}'.format(col,encoder,self.Score_Matrix.loc[col,encoder]))


In [14]:
gc.collect()
matrix_transform = pd.DataFrame(np.zeros((len(category),5)),columns = ['one_hot','label','siner','coser','mean_target'],index=category)
matrix_transform.loc[:-2,['siner','coser']] = -999

n_fold = 5
fold = StratifiedKFold(n_splits=n_fold, shuffle=True,random_state=0)
param={'C':1,'max_iter':100,'solver':'lbfgs','n_jobs':4}



In [15]:
encode = encoder_finder(X = train_minification,target = target,ordinal_data=ordinal_data_train, Base_Encoding = Base_Encoding,
                        Matrix_transform = matrix_transform,fold = fold,n_fold = n_fold,param = param)
try1 = Base_Encoding.copy()
try1[try1!='one_hot']='one_hot'
M = encode.matrix_sparse_calculator(try1)

In [16]:
score=0
for trn_idx, val_idx in fold.split(M, target):
    train_x, train_y = M[trn_idx,:], target[trn_idx]
    valid_x, valid_y = M[val_idx,:], target[val_idx]
    model = LogisticRegression(**best_param, random_state=0)
    model.fit(train_x, train_y)
    score += roc_auc_score(valid_y,model.predict_proba(valid_x)[:,1])/n_fold

del M,try1,model
gc.collect()
print('All one_hot score - Ordinal Not one_hot : {}'.format(score))

All one_hot score - Ordinal Not one_hot : 0.7488514943826494


In [17]:
one_hot_train = (pd.get_dummies(train,prefix_sep='_',columns=train.columns,drop_first=True,sparse=True)).sparse.to_coo().tocsr()
score=0
for trn_idx, val_idx in fold.split(one_hot_train, target):
    train_x, train_y = one_hot_train[trn_idx,:], target[trn_idx]
    valid_x, valid_y = one_hot_train[val_idx,:], target[val_idx]
    model = LogisticRegression(**best_param, random_state=0)
    model.fit(train_x, train_y)
    score += roc_auc_score(valid_y,model.predict_proba(valid_x)[:,1])/n_fold

gc.collect()
print('All one_hot score - Ordinal one_hot : {}'.format(score))
del one_hot_train,score,model


All one_hot score - Ordinal one_hot : 0.7974914652290405


In [18]:
ordinal_data_train_one_hot = csr_matrix(one_hot_ecoder(train,ord_en_list))

In [19]:
One_Hot_Encoding = Base_Encoding.copy()
One_Hot_Encoding[One_Hot_Encoding!='one_hot']='one_hot'

In [20]:
%%time

encode = encoder_finder(X = train_minification,target = target,ordinal_data=ordinal_data_train_one_hot,#ordinal_data_train,
                        Base_Encoding = One_Hot_Encoding, Matrix_transform = matrix_transform,fold = fold,n_fold = n_fold,param = param)
encode.encoder_result()

HBox(children=(IntProgress(value=0, max=17), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : bin_0 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : bin_0 ; Encoder : label ; Score : 0.7974921379674105
Variable : bin_0 ; Encoder : siner ; Score : 0.0
Variable : bin_0 ; Encoder : coser ; Score : 0.0
Variable : bin_0 ; Encoder : mean_target ; Score : 0.7978182827562657



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : bin_1 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : bin_1 ; Encoder : label ; Score : 0.7974921379674105
Variable : bin_1 ; Encoder : siner ; Score : 0.0
Variable : bin_1 ; Encoder : coser ; Score : 0.0
Variable : bin_1 ; Encoder : mean_target ; Score : 0.7979068532039078



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : bin_2 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : bin_2 ; Encoder : label ; Score : 0.7974921379674105
Variable : bin_2 ; Encoder : siner ; Score : 0.0
Variable : bin_2 ; Encoder : coser ; Score : 0.0
Variable : bin_2 ; Encoder : mean_target ; Score : 0.7972532305920488



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : bin_3 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : bin_3 ; Encoder : label ; Score : 0.7974921379674105
Variable : bin_3 ; Encoder : siner ; Score : 0.0
Variable : bin_3 ; Encoder : coser ; Score : 0.0
Variable : bin_3 ; Encoder : mean_target ; Score : 0.7973583959352418



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : bin_4 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : bin_4 ; Encoder : label ; Score : 0.7974921379674105
Variable : bin_4 ; Encoder : siner ; Score : 0.0
Variable : bin_4 ; Encoder : coser ; Score : 0.0
Variable : bin_4 ; Encoder : mean_target ; Score : 0.7976982575608349



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : nom_0 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : nom_0 ; Encoder : label ; Score : 0.7965130768850395
Variable : nom_0 ; Encoder : siner ; Score : 0.0
Variable : nom_0 ; Encoder : coser ; Score : 0.0
Variable : nom_0 ; Encoder : mean_target ; Score : 0.7974653025462721



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : nom_1 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : nom_1 ; Encoder : label ; Score : 0.7971204044873147
Variable : nom_1 ; Encoder : siner ; Score : 0.0
Variable : nom_1 ; Encoder : coser ; Score : 0.0
Variable : nom_1 ; Encoder : mean_target ; Score : 0.797282299972031



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : nom_2 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : nom_2 ; Encoder : label ; Score : 0.7964007349114979
Variable : nom_2 ; Encoder : siner ; Score : 0.0
Variable : nom_2 ; Encoder : coser ; Score : 0.0
Variable : nom_2 ; Encoder : mean_target ; Score : 0.7974145787520086



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : nom_3 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : nom_3 ; Encoder : label ; Score : 0.7965206929678864
Variable : nom_3 ; Encoder : siner ; Score : 0.0
Variable : nom_3 ; Encoder : coser ; Score : 0.0
Variable : nom_3 ; Encoder : mean_target ; Score : 0.7971473215845635



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : nom_4 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : nom_4 ; Encoder : label ; Score : 0.7949330818625927
Variable : nom_4 ; Encoder : siner ; Score : 0.0
Variable : nom_4 ; Encoder : coser ; Score : 0.0
Variable : nom_4 ; Encoder : mean_target ; Score : 0.797425695747972



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : nom_5 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : nom_5 ; Encoder : label ; Score : 0.7628095675403195
Variable : nom_5 ; Encoder : siner ; Score : 0.0
Variable : nom_5 ; Encoder : coser ; Score : 0.0
Variable : nom_5 ; Encoder : mean_target ; Score : 0.7971838525706956



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : nom_6 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : nom_6 ; Encoder : label ; Score : 0.7558463817653873
Variable : nom_6 ; Encoder : siner ; Score : 0.0
Variable : nom_6 ; Encoder : coser ; Score : 0.0
Variable : nom_6 ; Encoder : mean_target ; Score : 0.7963704187113076



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : nom_7 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : nom_7 ; Encoder : label ; Score : 0.7534932082697239
Variable : nom_7 ; Encoder : siner ; Score : 0.0
Variable : nom_7 ; Encoder : coser ; Score : 0.0
Variable : nom_7 ; Encoder : mean_target ; Score : 0.7959600810834868



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : nom_8 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : nom_8 ; Encoder : label ; Score : 0.7472031710785982
Variable : nom_8 ; Encoder : siner ; Score : 0.0
Variable : nom_8 ; Encoder : coser ; Score : 0.0
Variable : nom_8 ; Encoder : mean_target ; Score : 0.794389713510551



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : nom_9 ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : nom_9 ; Encoder : label ; Score : 0.7118865222732482
Variable : nom_9 ; Encoder : siner ; Score : 0.0
Variable : nom_9 ; Encoder : coser ; Score : 0.0
Variable : nom_9 ; Encoder : mean_target ; Score : 0.7990541328182172



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : day ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : day ; Encoder : label ; Score : 0.7964059547308286
Variable : day ; Encoder : siner ; Score : 0.797297762946964
Variable : day ; Encoder : coser ; Score : 0.797297762946964
Variable : day ; Encoder : mean_target ; Score : 0.7973883119165577



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Variable : month ; Encoder : one_hot ; Score : 0.7974921379674105
Variable : month ; Encoder : label ; Score : 0.7968991946133119
Variable : month ; Encoder : siner ; Score : 0.7910336004501353
Variable : month ; Encoder : coser ; Score : 0.7910336004501353
Variable : month ; Encoder : mean_target ; Score : 0.7973575440882884


CPU times: user 39min, sys: 2min 10s, total: 41min 10s
Wall time: 1h 7min 20s


In [21]:
print(encode.Score_Matrix)

        one_hot     label     siner     coser  mean_target
bin_0  0.797492  0.797492  0.000000  0.000000     0.797818
bin_1  0.797492  0.797492  0.000000  0.000000     0.797907
bin_2  0.797492  0.797492  0.000000  0.000000     0.797253
bin_3  0.797492  0.797492  0.000000  0.000000     0.797358
bin_4  0.797492  0.797492  0.000000  0.000000     0.797698
nom_0  0.797492  0.796513  0.000000  0.000000     0.797465
nom_1  0.797492  0.797120  0.000000  0.000000     0.797282
nom_2  0.797492  0.796401  0.000000  0.000000     0.797415
nom_3  0.797492  0.796521  0.000000  0.000000     0.797147
nom_4  0.797492  0.794933  0.000000  0.000000     0.797426
nom_5  0.797492  0.762810  0.000000  0.000000     0.797184
nom_6  0.797492  0.755846  0.000000  0.000000     0.796370
nom_7  0.797492  0.753493  0.000000  0.000000     0.795960
nom_8  0.797492  0.747203  0.000000  0.000000     0.794390
nom_9  0.797492  0.711887  0.000000  0.000000     0.799054
day    0.797492  0.796406  0.797298  0.797298     0.7973

In [22]:
%%time

Encode_Sup = encode.Score_Matrix.idxmax(axis=1)

encode2 = encoder_finder(X = train_minification,target = target,ordinal_data=ordinal_data_train_one_hot,
                        Base_Encoding = One_Hot_Encoding, Matrix_transform = matrix_transform,fold = fold,n_fold = n_fold,param = param)
score = encode2.cv_score(encode2.matrix_sparse_calculator(Encode_Sup))
gc.collect()
print('Final - Encoding : {}'.format(score))

Final - Encoding : 0.798940850080226
CPU times: user 18.5 s, sys: 1.02 s, total: 19.6 s
Wall time: 49.2 s


In [23]:
Score_Sup = encode.Score_Matrix.max(axis=1)

In [24]:
cv = 0.7974914652290405

In [25]:
Score_Sup

bin_0    0.797818
bin_1    0.797907
bin_2    0.797492
bin_3    0.797492
bin_4    0.797698
nom_0    0.797492
nom_1    0.797492
nom_2    0.797492
nom_3    0.797492
nom_4    0.797492
nom_5    0.797492
nom_6    0.797492
nom_7    0.797492
nom_8    0.797492
nom_9    0.799054
day      0.797492
month    0.797492
dtype: float64

In [26]:
Encode_Sup

bin_0    mean_target
bin_1    mean_target
bin_2        one_hot
bin_3        one_hot
bin_4    mean_target
nom_0        one_hot
nom_1        one_hot
nom_2        one_hot
nom_3        one_hot
nom_4        one_hot
nom_5        one_hot
nom_6        one_hot
nom_7        one_hot
nom_8        one_hot
nom_9    mean_target
day          one_hot
month        one_hot
dtype: object

In [27]:
Final_Encoding

NameError: name 'Final_Encoding' is not defined

In [28]:
%%time
cv = 0.7974914652290405
encoder = encoder_finder(X = train_minification,target = target,ordinal_data=ordinal_data_train_one_hot,
                        Base_Encoding = One_Hot_Encoding, Matrix_transform = matrix_transform,fold = fold,n_fold = n_fold,param = param)
order = ['nom_9','bin_1','bin_0','bin_4']
Final_Encoding = One_Hot_Encoding.copy()
print('Start testing\n')
for col in order:
    print('Starting column {}\n'.format(col))
    temp_encode = Final_Encoding.copy()
    temp_encode[col] = Encode_Sup[col]
    score = encoder.cv_score(encoder.matrix_sparse_calculator(temp_encode))
    gc.collect()
    if score>cv:
        cv=score
        Final_Encoding[col] = Encode_Sup[col]
        print('Column {} - CV : {}'.format(col,score))

Start testing

Starting column nom_9

Column nom_9 - CV : 0.7990541328182172
Starting column bin_1

Starting column bin_0

Starting column bin_4

Column bin_4 - CV : 0.7990981313897078
CPU times: user 1min 5s, sys: 3.44 s, total: 1min 9s
Wall time: 3min 4s


In [29]:
Final_Encoding.to_csv('results.csv',index=False)

  """Entry point for launching an IPython kernel.
