In [1]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from math import sin,log,pow,cos
import lightgbm as lgb
import datetime
from sklearn.linear_model import LogisticRegression  
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

import gc
pd.options.mode.chained_assignment = None

In [2]:
def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

class Importance_calculator:
    def __init__(self,X,y,param_list,num_round=1000,metric=roc_auc_score,cv=5,random_state=0,modeltype='logit',scale=True):
        self.X=X
        self.y=y
        self.cv=cv
        self.scale=scale
        self.param_list=param_list
        self.metric=metric
        self.num_round=num_round
        self.random_state=random_state
        self.modeltype=modeltype
    def scorer(self,y_true,y_pred):
        return(self.metric(y_true,y_pred))
  
    def permutate_column_predict(self,model,valid_x,valid_y):
        perm_pred = []
        np.random.seed(self.random_state)
        for col in tqdm_notebook(valid_x.columns):
            value = valid_x[col].copy()
            valid_x[col] = np.random.permutation(valid_x[col].values)
            perm_pred=perm_pred+[self.scorer(valid_y,self.pred_wrapper(model,valid_x,self.modeltype))] #predict
            valid_x[col] = value
        return(perm_pred)
    
    def pred_wrapper(self,model,x,modeltype='logit'):
        if modeltype is 'logit':
            return(model.predict_proba(x)[:,1])
        else:
            return(model.predict(x))
        
    def scaler(self,train,valid):
        train_mean , train_std = train.mean(axis=0),train.std(axis=0)
        #rescale inside the cycle to not overfit
        train-=train_mean
        valid-=train_mean

        train/=train_std
        valid/=train_std
        return(train,valid)
    
    def cv_score_importance(self):
        N=self.X.shape[1]
        folds = StratifiedKFold(n_splits=self.cv, shuffle=True,random_state=self.random_state)
        print('Inizio train e scoring:\n')
        self.importance_permutation_score=[0]*N
        for trn_idx, val_idx in tqdm_notebook(folds.split(self.X, self.y)):
                train_x, train_y = self.X.iloc[trn_idx], self.y.iloc[trn_idx]
                valid_x, valid_y = self.X.iloc[val_idx], self.y.iloc[val_idx]
                if self.scale is True:
                    train_x,valid_x=self.scaler(train_x,valid_x)
                    
                print('Inizio train.\n')
#                 model = lgb.train(self.param_list,lgb.Dataset(train_x, label=train_y),self.num_round)
                model = LogisticRegression(**self.param_list).fit(train_x, train_y) 
                print('inizio calcolo permutation.\n')
                perm_pred = self.permutate_column_predict(model,valid_x,valid_y)

                base_pred = [self.scorer(valid_y,self.pred_wrapper(model,valid_x,self.modeltype))] * N
                tmp_diff=[base_pred[i]-perm_pred[i] for i in range(N)]
                self.importance_permutation_score=[self.importance_permutation_score[i]+tmp_diff[i] for i in range(N)]
        return([self.importance_permutation_score[i]/np.float(self.cv) for i in range(N)])
    
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def siner(x,period):
    return(sin(2*np.pi*x/period))
def coser(x,period):
    return(cos(2*np.pi*x/period))

def transformer():
    path = '../input/cat-in-the-dat/'

    train = pd.read_csv(f'{path}train.csv')
    test = pd.read_csv(f'{path}test.csv')
    sample_submission = pd.read_csv(f'{path}sample_submission.csv')

    #find test column with unseen values 
    difference_set = [len(set(test[col].values) - set(train[col].values)) for col in test.columns]
    difference_set = test.columns[[x>0 and x<1000 for x in difference_set]].tolist()
    
    #create dictionary with value of unseen data
    difference_values = {col: {'value': set(test[col].values) - set(train[col].values)} for col in difference_set}
    
    #replace it with most frequent value in training set
    for col in difference_values:
        test.loc[test[col].isin(difference_values[col]['value']),col]=train[col].value_counts().idxmax()
    
    drop_col = ['id','target']
    target=train['target']
    
    train = train.drop(drop_col, axis=1)
    test = test.drop(['id'], axis=1)
    
    original_columns=train.columns
    
    #divide feature in 4 different list , for future transformation
    tar_en_list = [col for col in train.columns if col[:3] in ['bin','nom','ord','day','mon']]
    one_hot_list = [col for col in train.columns if (col[:3] in ['bin','nom','ord','day','mon']) and train[col].nunique()<20] #feature with too many values will have only target encoding
    ord_en_list = [col for col in train.columns if col[:3] in ['ord']]
    cyclical_list = [col for col in train.columns if col[:3] in ['day','mon']]
    
    Colonne = list(set(tar_en_list+one_hot_list+ord_en_list+cyclical_list))

    train[Colonne] = train[Colonne].astype(object)
    test[Colonne] = test[Colonne].astype(object)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    print('Starting Mean Target Encoding\n')
    gc.collect()

    for f in tqdm_notebook(tar_en_list):
        gc.collect()
        train[f+'_mean_target']=0
        test[f+'_mean_target']=0
        for trn_idx, val_idx in folds.split(train.values, target.values):
            gc.collect()
            trn_f, trn_tgt = train[f].iloc[trn_idx], target.iloc[trn_idx]
            val_f, val_tgt = train[f].iloc[val_idx], target.iloc[val_idx]
            trn_tf, val_tf = target_encode(trn_series=trn_f, 
                                             tst_series=val_f, 
                                             target=trn_tgt, 
                                             min_samples_leaf=50, 
                                             smoothing=5,
                                             noise_level=0)
            train.loc[val_idx,f+'_mean_target']=val_tf
        gc.collect()
        trn_tf, val_tf = target_encode(trn_series=train[f], 
                                     tst_series=test[f], 
                                     target=target, 
                                     min_samples_leaf=50, 
                                     smoothing=5,
                                     noise_level=0)
        test[f+'_mean_target']=val_tf
    
    print('Starting One Hot Encoding\n')
    train = pd.concat([train,pd.get_dummies(train[one_hot_list],prefix_sep='_',columns=one_hot_list)],axis=1)
    test = pd.concat([test,pd.get_dummies(test[one_hot_list],prefix_sep='_',columns=one_hot_list)],axis=1)

    gc.collect()
    
    print('Starting Ordinal Encoding\n')

    mapper_ord_1 = {'Novice': 1, 'Contributor': 2, 'Expert': 3, 'Master': 4, 'Grandmaster': 5}
    mapper_ord_2 = {'Freezing': 1, 'Cold': 2, 'Warm': 3, 'Hot': 4,'Boiling Hot': 5, 'Lava Hot': 6}
    mapper_ord_3 = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 
                    'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15}
    mapper_ord_4 = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 
                    'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15,
                    'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 
                    'W': 23, 'X': 24, 'Y': 25, 'Z': 26}
    for col, mapper in zip(['ord_1', 'ord_2', 'ord_3', 'ord_4'], [mapper_ord_1, mapper_ord_2, mapper_ord_3, mapper_ord_4]):
        train[col+'_oe'] = train[col].replace(mapper)
        test[col+'_oe'] = test[col].replace(mapper)
        
    ord_5 = sorted(list(set(train['ord_5'].values)))
    ord_5 = dict(zip(ord_5, range(len(ord_5))))
    train['ord_5_oe'] = train['ord_5'].apply(lambda x: ord_5[x]).astype(int).values
    test['ord_5_oe'] = test['ord_5'].apply(lambda x: ord_5[x]).astype(int).values
    
    print('Starting Cyclical Encoding\n')

    train[cyclical_list] = train[cyclical_list].astype(float)
    for col in cyclical_list:
        period=train[col].max()
        train[col+'_cos']=train[col].transform(coser,period=period)
        train[col+'_cos']=train[col].transform(siner,period=period)
        test[col+'_cos']=test[col].transform(coser,period=period)
        test[col+'_cos']=test[col].transform(siner,period=period)

    Colonne_drop = [x for x in original_columns if train[x].nunique()>100]
    Colonne = [x for x in original_columns if train[x].nunique()<=100]
    
    train = train.drop(Colonne_drop, axis=1)
    test = test.drop(Colonne_drop, axis=1)

    train[Colonne] = train[Colonne].astype('category')
    test[Colonne] = train[Colonne].astype('category')
    original_columns=Colonne

#     gc.collect()
    return([train,test,target,original_columns,sample_submission])

In [3]:
train,test, y,columns , sub=transformer()

Starting Mean Target Encoding



HBox(children=(IntProgress(value=0, max=23), HTML(value='')))


Starting One Hot Encoding

Starting Ordinal Encoding

Starting Cyclical Encoding



In [4]:
train = train.drop(columns, axis=1)
test = test.drop(columns, axis=1)


In [5]:
class Negative_Sampler:
  
  def __init__(self,vector,prob_pos=1,prob_neg=.5,seed=0):
    self.prob_pos=prob_pos
    self.prob_neg=prob_neg
    self.vector=vector
    self.seed=0
    
  def negative_sample(self):
    np.random.seed(self.seed)
    Positive = np.where(self.vector==1)[0]
    Negative = np.where(self.vector==0)[0]
    Positive_sample= np.random.choice(Positive,np.int(np.round((1-self.prob_pos)*len(Positive))),replace=False).tolist()
    Negative_sample= np.random.choice(Negative,np.int(np.round((1-self.prob_neg)*len(Negative))),replace=False).tolist()
    result=np.sort(Positive_sample+Negative_sample)
    return(result)
indexer=Negative_Sampler(vector=y,prob_pos=.2,prob_neg=.2)
index_neg=indexer.negative_sample()

In [6]:
gc.collect()
train_new , y_new = train.iloc[index_neg,:] , y.iloc[index_neg]

In [7]:
# n_fold=5
# seed=1
# grid1 = {'penalty':['l1'],'C':2**np.arange(-4,0,1,dtype=float),'max_iter':[1000],'solver':['saga'],'n_jobs':[4]}
# grid2 = {'penalty':['elasticnet'],'C':2**np.arange(-4,0,1,dtype=float),'max_iter':[1000],'solver':['saga'],'n_jobs':[4],
#          'l1_ratio':np.linspace(0,1,num=5,endpoint=False)[1:]}
# grid3 = {'penalty':['l2'],'C':2**np.arange(-4,0,1,dtype=float),'max_iter':[1000],'solver':['lbfgs'],'n_jobs':[4]}
# parameter_grid =(list(ParameterGrid(grid1))+list(ParameterGrid(grid2))+list(ParameterGrid(grid3)))

# error_mean=list()
# error_std=list()

# def grid_cv_error(param,train,y,folds,scale=True):
#     score = []
#     print('Starting cv - training\n')
#     for trn_idx, val_idx in tqdm_notebook(folds.split(train, y)):
#         train_x, train_y = train.iloc[trn_idx,:], y.iloc[trn_idx]
#         valid_x, valid_y = train.iloc[val_idx,:], y.iloc[val_idx]
#         if scale:
#             train_mean , train_std = train_x.mean(axis=0),train_x.std(axis=0)
#             #rescale inside the cycle to not overfit
#             train_x-=train_mean
#             valid_x-=train_mean

#             train_x/=train_std
#             valid_x/=train_std
#         #sparsing data to mae it faster
#         train_x,valid_x = csr_matrix(train_x),csr_matrix(valid_x)
        
#         model = LogisticRegression(**param).fit(train_x,train_y)
#         score = score+[roc_auc_score(valid_y,model.predict_proba(valid_x)[:,1])]
#     return(np.mean(score),np.std(score))

# for param in tqdm_notebook(parameter_grid):
#     folds = StratifiedKFold(n_splits=n_fold, shuffle=True,random_state=seed)
#     cv_mean,cv_std=grid_cv_error(param,train_new,y_new,folds)
#     print("AUC-ROC : {} {} {}\n".format(cv_mean,u"\u00B1", cv_std))
#     error_mean=error_mean+[cv_mean]
#     error_std=error_std+[cv_std]

# del train_new,y_new
# gc.collect()

# best_param = parameter_grid[np.argmax([error_mean[x]-error_std[x] for x in range(len(error_mean))])]
# best_param['max_iter']=10000
# cv_mean,cv_std=grid_cv_error(best_param,train,y,folds)

# print("tuned hpyerparameters : {} ".format(best_param))
# print("Final AUC-ROC : {} {} {}".format(cv_mean,u"\u00B1", cv_std))
best_param={'C':1,'max_iter':100,'solver':'lbfgs','n_jobs':4}


In [8]:
gc.collect()
seed=1
ensemble=pd.DataFrame({'Start':sub['target']})
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True,random_state=seed)
score=[]
iteration=0
for trn_idx, val_idx in tqdm_notebook(folds.split(train, y)):
    test_copy=test.copy()
    gc.collect()
    train_x, train_y = train.iloc[trn_idx,:], y.iloc[trn_idx]
    valid_x, valid_y = train.iloc[val_idx,:], y.iloc[val_idx]
    
    train_mean , train_std = train_x.mean(axis=0),train_x.std(axis=0)
    
    #rescale inside the cycle to not overfit
    train_x-=train_mean
    valid_x-=train_mean
    test_copy-=train_mean

    train_x/=train_std
    valid_x/=train_std
    test_copy/=train_std
    
    model = LogisticRegression(**best_param)
    model.fit(train_x, train_y)
    
    Pred=model.predict_proba(valid_x)[:,1]
    ensemble['Iteration_'+str(iteration)]=model.predict_proba(test_copy)[:,1]
    score = score+[roc_auc_score(valid_y,Pred)]
    
    print('Cv-Error : {}'.format(roc_auc_score(valid_y,Pred)))
    
    del Pred
    gc.collect()
    iteration+=1

Pred=ensemble.iloc[:,1:]
sub['target']=Pred.mean(axis=1)
sub.to_csv('submission_ensemble.csv', index=False)



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Cv-Error : 0.7989458355423107
Cv-Error : 0.7987498291859807
Cv-Error : 0.8033767528095088
Cv-Error : 0.8006005975887474
Cv-Error : 0.7986251049292128

