In [1]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from math import sin,log,pow,cos
import lightgbm as lgb
import datetime
from sklearn.linear_model import LogisticRegression  
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from scipy.sparse import csr_matrix,coo_matrix, hstack

import gc
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None


In [2]:
def choice(train,target,col,label,coo=True):
    """Encode one column of ``train`` with the encoder selected by ``label``.

    Parameters
    ----------
    train : DataFrame containing the column ``col``.
    target : Series of labels; only forwarded to the ``'mean_target'`` encoder.
    col : name of the column to encode.
    label : encoder name, one of ``'one_hot'``, ``'label'``, ``'mean_target'``,
        ``'siner'``, ``'coser'`` or ``'ordinal'``.
    coo : when True (default) the encoded block is wrapped in a scipy
        ``csr_matrix`` (despite the parameter name); when False the raw
        encoder output is returned.

    Returns
    -------
    The encoded block. Every encoder except ``'one_hot'`` is reshaped to a
    single column of shape ``(n, 1)``.

    Raises
    ------
    ValueError
        If ``label`` is not one of the supported encoder names. Previously
        this surfaced as an ``UnboundLocalError`` on ``X``.
    """
    if label == 'one_hot':
        X = one_hot_encoder(train,col)
    elif label == 'label':
        X = label_encoder(train,col).reshape((-1,1))
    elif label == 'mean_target':
        X = mean_target(train,target,col).reshape((-1,1))
    elif label == 'siner':
        X = cyclical_siner_encoder(train,col).reshape((-1,1))
    elif label == 'coser':
        X = cyclical_coser_encoder(train,col).reshape((-1,1))
    elif label == 'ordinal':
        X = ordinal_encoder(train,col).reshape((-1,1))
    else:
        raise ValueError('Unknown encoder label: {!r}'.format(label))
    return csr_matrix(X) if coo else X
    
def one_hot_encoder(train,col):
    """Return indicator (dummy) columns for ``train[col]``.

    The first level is dropped (``drop_first=True``), so a column with k
    distinct values produces k - 1 indicator columns.
    """
    column = train[col]
    dummies = pd.get_dummies(column, prefix_sep='_', columns=col, drop_first=True)
    return dummies

def label_encoder(train,col):
    """Integer-encode ``train[col]`` with a freshly fitted ``LabelEncoder``.

    Returns the 1-D array produced by ``fit_transform`` on the column values.
    """
    raw_values = train[col].values
    return LabelEncoder().fit_transform(raw_values)

def cyclical_coser_encoder(train,col):
    """Cosine half of a cyclical encoding of ``train[col]``.

    Each value ``x`` becomes ``cos(2 * pi * x / period)``, where ``period`` is
    the maximum of the column after casting to float.

    Returns a 1-D float ``ndarray`` with one entry per row of ``train``.
    """
    values = train[col].astype(float)
    period = values.max()
    # Vectorised with NumPy rather than calling math.cos once per element.
    return np.cos(2 * np.pi * values.values / period)

def cyclical_siner_encoder(train,col):
    """Sine half of a cyclical encoding of ``train[col]``.

    Each value ``x`` becomes ``sin(2 * pi * x / period)``, where ``period`` is
    the maximum of the column after casting to float.

    Fix: the previous body applied the cosine helper, so the 'siner' encoding
    was an exact duplicate of the 'coser' encoding.

    Returns a 1-D float ``ndarray`` with one entry per row of ``train``.
    """
    values = train[col].astype(float)
    period = values.max()
    # Vectorised with NumPy rather than calling math.sin once per element.
    return np.sin(2 * np.pi * values.values / period)

def ordinal_encoder(train,col):
    """Return the precomputed ordinal encoding for column ``col``.

    The result is one of the module-level arrays ``order0`` ... ``order5``,
    which must already be defined when this function is called. ``train`` is
    accepted for signature parity with the other encoders but is not read.

    Raises
    ------
    ValueError
        If ``col`` is not one of ``'ord_0'`` ... ``'ord_5'``. Previously the
        function fell through and returned ``None``.
    """
    # Lambdas defer the global lookups, so only the requested array must exist.
    lookup = {
        'ord_0': lambda: order0,
        'ord_1': lambda: order1,
        'ord_2': lambda: order2,
        'ord_3': lambda: order3,
        'ord_4': lambda: order4,
        'ord_5': lambda: order5,
    }
    if col not in lookup:
        raise ValueError('No ordinal encoding available for column {!r}'.format(col))
    return lookup[col]()

def mean_target(train,target,col):
    """Out-of-fold smoothed target-mean encoding of ``train[col]``.

    The rows are split with a 5-fold ``StratifiedKFold`` (shuffled, seed 0).
    For each split, ``target_encode`` is fitted on the training rows and its
    encoding of the held-out rows is written into the result, so every row is
    encoded by a model that did not see it.

    Returns a 1-D float array with one entry per row of ``train``.
    """
    feature = train[col]
    encoded = np.zeros(len(feature))
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for fit_rows, holdout_rows in splitter.split(train.values, target.values):
        # Only the encoding of the held-out rows is kept.
        _, holdout_encoded = target_encode(
            trn_series=feature.iloc[fit_rows],
            tst_series=feature.iloc[holdout_rows],
            target=target.iloc[fit_rows],
            min_samples_leaf=200,
            smoothing=20,
            noise_level=0,
        )
        encoded[holdout_rows] = holdout_encoded
    return encoded

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns try int8, int16, int32 and int64 in that order; other
    numeric columns try float16 and float32, then fall back to float64. A
    dtype is accepted only when the column min and max lie strictly inside its
    range. ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : DataFrame to shrink.
    verbose : when True, print the final memory usage and percentage saved.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in tqdm_notebook(df.columns):
        gc.collect()
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither narrower float type fits.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        saved_pct = 100 * (start_mem - end_mem) / start_mem
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
    return df

def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """Smoothed target-mean encoding fitted on ``trn_series``.

    For every category the encoded value blends the category's target mean
    with the global target mean (the prior). The blend weight is a sigmoid of
    ``(count - min_samples_leaf) / smoothing``, so frequent categories lean on
    their own mean and rare ones on the prior. Categories that are absent from
    ``trn_series`` are encoded with the prior.

    Parameters
    ----------
    trn_series : Series used to fit the per-category statistics.
    tst_series : Series to encode with those statistics; must share
        ``trn_series``'s name.
    target : Series aligned with ``trn_series`` (same length).
    min_samples_leaf : centre of the sigmoid, in samples per category.
    smoothing : width of the sigmoid.
    noise_level : scale of the multiplicative noise applied by ``add_noise``.

    Returns
    -------
    Tuple ``(encoded_trn, encoded_tst)`` of Series named ``<name>_mean`` that
    carry the index of the corresponding input Series.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    feature_name = trn_series.name
    target_name = target.name

    # Per-category target mean and sample count.
    stats = (
        pd.concat([trn_series, target], axis=1)
        .groupby(by=feature_name)[target_name]
        .agg(["mean", "count"])
    )
    # Sigmoid weight: the larger the count, the less the prior matters.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    stats[target_name] = prior * (1 - weight) + stats["mean"] * weight
    stats = stats.drop(["mean", "count"], axis=1)

    def _encode(series):
        # Built per call, like the original, so each merge sees a fresh table.
        lookup = stats.reset_index().rename(columns={'index': target_name, target_name: 'average'})
        merged = pd.merge(series.to_frame(series.name), lookup, on=series.name, how='left')
        encoded = merged['average'].rename(feature_name + '_mean').fillna(prior)
        # pd.merge resets the index, so put the caller's index back.
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

def add_noise(series, noise_level):
    """Scale each element of ``series`` by ``1 + noise_level * N(0, 1)``.

    The noise is drawn from NumPy's global random state; ``noise_level=0``
    leaves the values unchanged.
    """
    jitter = np.random.randn(len(series))
    factor = 1 + noise_level * jitter
    return series * factor

def siner(x,period):
    """Return ``sin(2 * pi * x / period)`` for a scalar ``x``."""
    angle = 2*np.pi*x/period
    return sin(angle)
def coser(x,period):
    """Return ``cos(2 * pi * x / period)`` for a scalar ``x``."""
    angle = 2*np.pi*x/period
    return cos(angle)


In [3]:
# Load the competition data.
path = '../input/cat-in-the-dat/'
train = pd.read_csv(f'{path}train.csv')
test = pd.read_csv(f'{path}test.csv')

# For every test column, collect the values that never occur in train.
unseen_by_col = {col: set(test[col].values) - set(train[col].values) for col in test.columns}

# Keep the columns with at least one, but fewer than 1000, unseen values.
difference_set = [col for col in test.columns if 0 < len(unseen_by_col[col]) < 1000]
difference_values = {col: {'value': unseen_by_col[col]} for col in difference_set}

# Overwrite each unseen test value with the most frequent train value of that column.
for col in difference_values:
    most_frequent = train[col].value_counts().idxmax()
    unseen_rows = test[col].isin(difference_values[col]['value'])
    test.loc[unseen_rows, col] = most_frequent

# Split off the label and drop the non-feature columns.
drop_col = ['id','target']
target = train['target']
train = train.drop(columns=drop_col)
test = test.drop(columns=['id'])


In [4]:
# Cross-validation setup and the ordinal columns under study.
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=10)
category = ['ord_0','ord_1','ord_2','ord_3','ord_4','ord_5']
best_param = dict(C=1, max_iter=100, solver='lbfgs', n_jobs=4)

# All-zero table with one row per ordinal column and one column per encoder.
Result_Matrix = pd.DataFrame(
    0.0,
    index=category,
    columns=['one_hot','label','siner','coser','mean_target','ordinal'],
)

Result_Matrix

Unnamed: 0,one_hot,label,siner,coser,mean_target,ordinal
ord_0,0.0,0.0,0.0,0.0,0.0,0.0
ord_1,0.0,0.0,0.0,0.0,0.0,0.0
ord_2,0.0,0.0,0.0,0.0,0.0,0.0
ord_3,0.0,0.0,0.0,0.0,0.0,0.0
ord_4,0.0,0.0,0.0,0.0,0.0,0.0
ord_5,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Encoder names selected in the previous round: column name (index) -> encoder.
# The file has no header row; its first column becomes the index.
stage2_table = pd.read_csv(
    '../input/transformer-selector-round2/results.csv',
    header=None,
    index_col=0,
)

# Collapse the one-column table into a string Series keyed by column name.
Stage2Encoding = pd.Series(stage2_table.values.flatten(), index=stage2_table.index, dtype='str')

In [6]:
# Copy of the ordinal columns only.
train_minification = train[category].copy()
gc.collect()

ord_en_list = [name for name in train.columns if name[:3] == 'ord']

# Hand-written rank for every level of ord_1 .. ord_4 (ranks start at 1).
mapper_ord_1 = dict(zip(['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'], range(1, 6)))
mapper_ord_2 = dict(zip(['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'], range(1, 7)))
mapper_ord_3 = {letter: rank for rank, letter in enumerate('abcdefghijklmno', start=1)}
mapper_ord_4 = {letter: rank for rank, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', start=1)}

# ord_5 has no hand-written order: rank its distinct values by sorted order, starting at 0.
ord_5 = {level: rank for rank, level in enumerate(sorted(set(train['ord_5'].values)))}

# Precomputed ordinal arrays that ordinal_encoder() returns.
order0 = train['ord_0'].values
order1 = train['ord_1'].replace(mapper_ord_1).values
order2 = train['ord_2'].replace(mapper_ord_2).values
order3 = train['ord_3'].replace(mapper_ord_3).values
order4 = train['ord_4'].replace(mapper_ord_4).values
order5 = train['ord_5'].apply(lambda level: ord_5[level]).astype(int).values

In [7]:
# Encode every column chosen in the previous round with its selected encoder
# and stack the blocks side by side into one CSR matrix.
# Iterating .items() avoids the positional `Stage2Encoding[N]` lookup on a
# label-indexed Series, and a single hstack avoids re-copying the growing
# matrix on every iteration.
encoded_blocks = []
for col, label in Stage2Encoding.items():
    gc.collect()
    encoded_blocks.append(choice(train = train,target = target,col = col,label = label,coo=True))
pre_score_Matrix = hstack(encoded_blocks, format='csr')


In [8]:
class encoder_finder:
    """Score candidate encoders for a set of columns by cross-validated AUC.

    Parameters
    ----------
    X : DataFrame holding the raw columns to encode.
    target : binary labels aligned row-by-row with ``X``.
    toadd : sparse matrix of already-encoded features appended to every
        candidate matrix.
    Base_Encoding : Series mapping column name -> encoder name; the starting
        point that ``encoder_result`` perturbs one column at a time.
    Matrix_transform : DataFrame (rows = columns to test, columns = encoder
        names). A cell equal to -999 marks a (column, encoder) pair to skip.
    fold : splitter exposing ``split(X, y)``.
    n_fold : number of splits, used to average the per-fold AUC.
    param : keyword arguments forwarded to ``LogisticRegression``.
    """

    def __init__(self,X,target,toadd,Base_Encoding,Matrix_transform,fold,n_fold,param):
        self.X=X
        self.target=target
        self.Base_Encoding=Base_Encoding
        self.Matrix_transform=Matrix_transform
        self.fold=fold
        self.param=param
        self.n_fold=n_fold
        self.toadd=toadd

    def matrix_sparse_calculator(self,Encoding_list):
        """Build the CSR design matrix for one encoding assignment.

        ``Encoding_list`` is a Series mapping column name -> encoder name.
        Each column is encoded with ``choice`` and the blocks are stacked
        horizontally, followed by ``self.toadd``. An empty ``Encoding_list``
        yields ``self.toadd`` alone.
        """
        gc.collect()
        blocks = []
        # .items() gives (label, value) pairs, so no positional lookup on a
        # label-indexed Series is needed.
        for col, label in Encoding_list.items():
            gc.collect()
            blocks.append(choice(train = self.X,target = self.target,col = col,label = label,coo=True))
        blocks.append(self.toadd)
        # One hstack instead of re-stacking the growing matrix per column.
        Matrix = hstack(blocks, format='csr')
        gc.collect()
        return Matrix

    def cv_score(self,Matrix):
        """Return the mean validation ROC AUC of a logistic regression on ``Matrix``.

        A fresh ``LogisticRegression(**self.param, random_state=0)`` is fitted
        on every split of ``self.fold``; the per-fold AUCs are averaged over
        ``self.n_fold``.
        """
        gc.collect()
        # Positional indexing on a plain array, independent of target's index.
        labels = np.asarray(self.target)
        score = 0
        for trn_idx, val_idx in self.fold.split(Matrix, labels):
            gc.collect()
            train_x, train_y = Matrix[trn_idx,:], labels[trn_idx]
            valid_x, valid_y = Matrix[val_idx,:], labels[val_idx]
            model = LogisticRegression(**self.param, random_state=0)
            model.fit(train_x, train_y)
            # Use the instance's fold count, not a notebook-level global.
            score += roc_auc_score(valid_y,model.predict_proba(valid_x)[:,1])/self.n_fold
        return score

    def encoder_result(self):
        """Fill ``self.Score_Matrix`` with a CV score per (column, encoder) pair.

        For each pair, the encoder of that single column in ``Base_Encoding``
        is replaced, the design matrix is rebuilt and scored. Pairs flagged
        with -999 in ``Matrix_transform`` are skipped and scored 0. Each
        result is printed as it is computed.
        """
        self.Score_Matrix = self.Matrix_transform.copy()
        variable = self.Matrix_transform.index.tolist()
        transformer = self.Matrix_transform.columns
        for col in tqdm_notebook(variable):
            for encoder in tqdm_notebook(transformer):
                gc.collect()
                if self.Matrix_transform.loc[col,encoder]!= (-999):
                    New_Encoder = self.Base_Encoding.copy()
                    New_Encoder.loc[col] = encoder
                    self.Score_Matrix.loc[col,encoder] = self.cv_score(self.matrix_sparse_calculator(New_Encoder))
                else:
                    self.Score_Matrix.loc[col,encoder] = 0
                print('Variable : {} ; Encoder : {} ; Score : {}'.format(col,encoder,self.Score_Matrix.loc[col,encoder]))


In [9]:
gc.collect()

# Cross-validation and model settings for the encoder search.
n_fold = 5
fold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=0)
param = dict(C=1, max_iter=100, solver='lbfgs', n_jobs=4)

# One row per ordinal column, one column per encoder to try; all zeros means
# no pair carries the -999 skip flag.
matrix_transform = pd.DataFrame(
    0.0,
    index=category,
    columns=['one_hot','label','mean_target','ordinal'],
)

# Every row is all zeros, so idxmax picks the first column ('one_hot') for each.
Base_Encoding = matrix_transform.idxmax(axis=1)
# Base_Encoding['ord_5'] = 'ordinal'


In [10]:
# Reference score noted by the author: 0.7991324056697438
# (presumably the all-'one_hot' baseline; confirm against the output below).
encode = encoder_finder(
    X = train,
    target = target,
    toadd = pre_score_Matrix,
    Base_Encoding = Base_Encoding,
    Matrix_transform = matrix_transform,
    fold = fold,
    n_fold = n_fold,
    param = param,
)
# Scores every (column, encoder) pair and stores the table in encode.Score_Matrix.
encode.encoder_result()


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

Variable : ord_0 ; Encoder : one_hot ; Score : 0.79913257863904
Variable : ord_0 ; Encoder : label ; Score : 0.7991403019296219
Variable : ord_0 ; Encoder : mean_target ; Score : 0.7990168258712724
Variable : ord_0 ; Encoder : ordinal ; Score : 0.7990708004730651



HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

Variable : ord_1 ; Encoder : one_hot ; Score : 0.79913257863904
Variable : ord_1 ; Encoder : label ; Score : 0.7869781802985379
Variable : ord_1 ; Encoder : mean_target ; Score : 0.799139801605109
Variable : ord_1 ; Encoder : ordinal ; Score : 0.7987637404499914



HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

Variable : ord_2 ; Encoder : one_hot ; Score : 0.79913257863904
Variable : ord_2 ; Encoder : label ; Score : 0.7827903929043312
Variable : ord_2 ; Encoder : mean_target ; Score : 0.7991088002946342
Variable : ord_2 ; Encoder : ordinal ; Score : 0.79815870315291



HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

Variable : ord_3 ; Encoder : one_hot ; Score : 0.79913257863904
Variable : ord_3 ; Encoder : label ; Score : 0.793785566080256
Variable : ord_3 ; Encoder : mean_target ; Score : 0.7989349878638392
Variable : ord_3 ; Encoder : ordinal ; Score : 0.793336098252372



HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

Variable : ord_4 ; Encoder : one_hot ; Score : 0.79913257863904
Variable : ord_4 ; Encoder : label ; Score : 0.7912101983785467
Variable : ord_4 ; Encoder : mean_target ; Score : 0.7990530668288942
Variable : ord_4 ; Encoder : ordinal ; Score : 0.7871952862330966



HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

Variable : ord_5 ; Encoder : one_hot ; Score : 0.79913257863904
Variable : ord_5 ; Encoder : label ; Score : 0.7694366138116553
Variable : ord_5 ; Encoder : mean_target ; Score : 0.7991754648093088
Variable : ord_5 ; Encoder : ordinal ; Score : 0.7694366138116553




In [11]:
# Best-scoring encoder for each ordinal column (row-wise argmax of the score table).
Encode_Sup = encode.Score_Matrix.idxmax(axis='columns')
Encode_Sup

ord_0          label
ord_1    mean_target
ord_2        one_hot
ord_3        one_hot
ord_4        one_hot
ord_5    mean_target
dtype: object

In [12]:
%%time

# Score the combination in which every column uses its individually best encoder.
Encode_Sup = encode.Score_Matrix.idxmax(axis='columns')

encode2 = encoder_finder(
    X = train,
    target = target,
    toadd = pre_score_Matrix,
    Base_Encoding = Base_Encoding,
    Matrix_transform = matrix_transform,
    fold = fold,
    n_fold = n_fold,
    param = param,
)
score = encode2.cv_score(encode2.matrix_sparse_calculator(Encode_Sup))
gc.collect()
print('Final - Encoding : {}'.format(score))


Final - Encoding : 0.7991633365459915
CPU times: user 4.8 s, sys: 200 ms, total: 5 s
Wall time: 34.6 s


In [13]:
# Highest CV score reached by any encoder, per ordinal column.
Score_Sup = encode.Score_Matrix.max(axis='columns')
Score_Sup

ord_0    0.799140
ord_1    0.799140
ord_2    0.799133
ord_3    0.799133
ord_4    0.799133
ord_5    0.799175
dtype: float64

In [14]:
# Keep only the columns whose best encoder differs from the base encoding.
encoder_changed = Encode_Sup != Base_Encoding
Score_Sup[encoder_changed]

ord_0    0.799140
ord_1    0.799140
ord_5    0.799175
dtype: float64

In [15]:
# Greedy refinement: try the better encoder for one column at a time, in the
# order below, and keep a change only if it raises the CV score.
# NOTE(review): the starting score is hard-coded and differs from the
# all-'one_hot' score printed by the search above (0.79913...); confirm which
# run it came from.
cv = 0.7990981313897078
encoder = encoder_finder(
    X = train,
    target = target,
    toadd = pre_score_Matrix,
    Base_Encoding = Base_Encoding,
    Matrix_transform = matrix_transform,
    fold = fold,
    n_fold = n_fold,
    param = param,
)
order = ['ord_5','ord_1','ord_0']
Final_Encoding = Base_Encoding.copy()
print('Start testing\n')
for col in order:
    print('Starting column {}\n'.format(col))
    temp_encode = Final_Encoding.copy()
    temp_encode[col] = Encode_Sup[col]
    score = encoder.cv_score(encoder.matrix_sparse_calculator(temp_encode))
    gc.collect()
    if not score > cv:
        # No improvement: leave this column's encoder unchanged.
        continue
    cv = score
    Final_Encoding[col] = Encode_Sup[col]
    print('Column {} - CV : {}'.format(col,score))


Start testing

Starting column ord_5

Column ord_5 - CV : 0.7991754648093088
Starting column ord_1

Column ord_1 - CV : 0.7992055085314815
Starting column ord_0



In [16]:
# Placeholder Series: one empty string per training column, filled in by the next cell.
Final_Encoding_All = pd.Series(data='', index=train.columns)

In [17]:
# Merge the encoders chosen here (ordinal columns) with those from the
# previous round, then copy them into Final_Encoding_All in train-column order.
# pd.concat replaces Series.append, which was removed in pandas 2.0, and .loc
# replaces the positional `Final_Encoding_All[i] = ...` on a label-indexed Series.
temp = pd.concat([Final_Encoding, Stage2Encoding])
for column_name in Final_Encoding_All.index:
    Final_Encoding_All.loc[column_name] = temp.loc[column_name]
# Manual override for nom_8.
Final_Encoding_All.loc['nom_8'] = 'mean_target'

In [18]:
# Write "column,encoder" rows without a header row. header=False is passed
# explicitly because the default for Series.to_csv differs between pandas
# versions, and the previous round's results.csv is read back with header=None.
Final_Encoding_All.to_csv('results.csv', header=False)

  """Entry point for launching an IPython kernel.
