In [1]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from math import sin,log,pow,cos
import lightgbm as lgb
import datetime
from sklearn.linear_model import LogisticRegression  
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from scipy.sparse import csr_matrix,coo_matrix, hstack
from bayes_opt import BayesianOptimization
import warnings

import gc
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None


In [2]:
def choice(train,target,col,label,min_samples_leaf=20,smoothing=1,coo=True):
    """Encode one column of ``train`` with the encoder selected by ``label``.

    Parameters
    ----------
    train : DataFrame containing ``col``.
    target : target Series; only forwarded to ``mean_target``.
    col : name of the column to encode.
    label : encoder name, one of ``'one_hot'``, ``'label'``, ``'mean_target'``,
        ``'siner'``, ``'coser'`` or ``'ordinal'``.
    min_samples_leaf, smoothing : forwarded to ``mean_target`` only; every
        other encoder ignores them.
    coo : when truthy the encoded data is wrapped in ``scipy.sparse.csr_matrix``
        (despite the parameter name the result is CSR, not COO); otherwise the
        raw encoder output is returned.

    Returns
    -------
    A ``csr_matrix`` or the raw encoder output. All encoders except
    ``'one_hot'`` are reshaped to a single column ``(-1, 1)``.

    Raises
    ------
    ValueError
        If ``label`` is not one of the supported encoder names.
    """
    if label == 'one_hot':
        X = one_hot_encoder(train,col)
    elif label == 'label':
        X = label_encoder(train,col).reshape((-1,1))
    elif label == 'mean_target':
        X = mean_target(train,target,col,min_samples_leaf,smoothing).reshape((-1,1))
    elif label == 'siner':
        X = cyclical_siner_encoder(train,col).reshape((-1,1))
    elif label == 'coser':
        X = cyclical_coser_encoder(train,col).reshape((-1,1))
    elif label == 'ordinal':
        X = ordinal_encoder(train,col).reshape((-1,1))
    else:
        # Previously an unknown label left X unbound and surfaced later as an
        # UnboundLocalError; fail early with a message naming the bad value.
        raise ValueError('Unknown encoder label: {!r}'.format(label))
    if coo:
        return csr_matrix(X)
    return X
    
def one_hot_encoder(train,col):
    """Return dummy (one-hot) columns for ``train[col]``.

    The first category level is dropped (``drop_first=True``), so a column
    with k distinct values yields k-1 indicator columns.
    """
    dummies = pd.get_dummies(
        train[col],
        columns=col,
        prefix_sep='_',
        drop_first=True,
    )
    return dummies

def label_encoder(train,col):
    """Fit a fresh ``LabelEncoder`` on ``train[col]`` and return the encoded values."""
    raw_values = train[col].values
    return LabelEncoder().fit_transform(raw_values)

def cyclical_coser_encoder(train,col):
    """Cosine component of a cyclical encoding of ``train[col]``.

    The period is taken to be the column's maximum value, so each entry ``x``
    becomes ``cos(2*pi*x / max)``.

    Returns a 1-D float ndarray with one entry per row of ``train``.
    """
    period = train[col].max()
    values = train[col].astype(float).values
    # Vectorised: one numpy call instead of a Python-level callback per row.
    return np.cos(2 * np.pi * values / period)

def cyclical_siner_encoder(train,col):
    """Sine component of a cyclical encoding of ``train[col]``.

    The period is taken to be the column's maximum value, so each entry ``x``
    becomes ``sin(2*pi*x / max)``.

    Returns a 1-D float ndarray with one entry per row of ``train``.
    """
    period = train[col].max()
    values = train[col].astype(float).values
    # The earlier version applied the cosine helper here, which made the
    # 'siner' feature an exact duplicate of the 'coser' one.
    return np.sin(2 * np.pi * values / period)

def ordinal_encoder(train,col):
    """Return the precomputed ordinal encoding that belongs to ``col``.

    ``col`` is matched against ``'ord_0'`` .. ``'ord_5'`` and the module-level
    object ``order0`` .. ``order5`` with the same number is returned.
    ``train`` is not used. Any other ``col`` yields ``None``.

    NOTE(review): ``order0`` .. ``order5`` are not defined in the visible part
    of the notebook; they must exist in the kernel before this is called.
    """
    # Lookups are wrapped in lambdas so that only the requested global is
    # touched (a missing, unrelated orderN must not raise).
    lookups = (
        ('ord_0', lambda: order0),
        ('ord_1', lambda: order1),
        ('ord_2', lambda: order2),
        ('ord_3', lambda: order3),
        ('ord_4', lambda: order4),
        ('ord_5', lambda: order5),
    )
    for name, fetch in lookups:
        if col == name:
            return fetch()
    return None

def mean_target(train,target,col,min_samples_leaf,smoothing):
    """Smoothed mean-target encoding of ``train[col]``.

    Relies on the module-level ``train_index``: rows ``[:train_index]`` of
    ``train`` are treated as labelled (aligned with ``target``), the remaining
    rows as unlabelled.

    * Labelled rows are encoded out-of-fold with a 5-fold ``StratifiedKFold``
      (shuffled, ``random_state=0``): each fold is encoded from statistics
      fitted on the other four.
    * Unlabelled rows are encoded from statistics fitted on all labelled rows.

    Returns a 1-D float ndarray with one value per row of ``train``.
    """
    encoded = np.zeros(len(train[col]))
    labelled = train.iloc[:train_index, :].copy().reset_index()
    unlabelled = train.iloc[train_index:, :].copy().reset_index()

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for fit_rows, holdout_rows in splitter.split(labelled.values, target.values):
        _, holdout_encoded = target_encode(
            trn_series=labelled[col].iloc[fit_rows],
            tst_series=labelled[col].iloc[holdout_rows],
            target=target.iloc[fit_rows],
            min_samples_leaf=min_samples_leaf,
            smoothing=smoothing,
            noise_level=0,
        )
        encoded[holdout_rows] = holdout_encoded

    _, unlabelled_encoded = target_encode(
        trn_series=labelled[col],
        tst_series=unlabelled[col],
        target=target,
        min_samples_leaf=min_samples_leaf,
        smoothing=smoothing,
        noise_level=0,
    )
    encoded[train_index:] = unlabelled_encoded
    return encoded

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Integer columns (int16/int32/int64) are cast to the first of
    int8/int16/int32/int64 whose range contains the column's min and max.
    Float columns are cast to float16 or float32 when the range fits, otherwise
    float64. Columns of any other dtype are left untouched.

    NOTE: the float check looks at the value range only; float16/float32 keep
    fewer significant digits, so downcast float columns can lose precision.

    Parameters
    ----------
    df : DataFrame, modified in place.
    verbose : when true, print the final memory footprint and the reduction.

    Returns
    -------
    The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in tqdm_notebook(df.columns):
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's own min/max
                # is representable and must not force the next wider type.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)
    # One collection after all columns were replaced instead of one per column.
    gc.collect()
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """Smoothed mean-target encoding of a categorical series.

    Per category of ``trn_series`` the target mean and count are computed.
    The encoded value blends the category mean with the global target mean
    (the prior) using the weight
    ``1 / (1 + exp(-(count - min_samples_leaf) / smoothing))``, so categories
    with larger counts lean more on their own mean.

    Parameters
    ----------
    trn_series : Series used to fit the statistics; same length as ``target``.
    tst_series : Series to encode with those statistics; must share
        ``trn_series``'s name.
    target : target Series.
    min_samples_leaf, smoothing : shape the blending weight (see above).
    noise_level : forwarded to ``add_noise`` for both outputs.

    Returns
    -------
    ``(encoded_trn, encoded_tst)``: two Series named ``<name>_mean`` that keep
    the indexes of their inputs. Categories without statistics get the prior.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    key = trn_series.name
    # Per-category target mean and number of observations.
    stats = (
        pd.concat([trn_series, target], axis=1)
        .groupby(by=key)[target.name]
        .agg(["mean", "count"])
    )
    # Sigmoid weight given to the category mean.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight
    stats = stats.drop(columns=["mean", "count"])
    lookup = stats.reset_index().rename(columns={'index': target.name, target.name: 'average'})

    def _apply(series):
        # Left-merge keeps the row order of `series`; the merge resets the
        # index, so the original one is put back afterwards.
        merged = pd.merge(series.to_frame(series.name), lookup, on=series.name, how='left')
        encoded = merged['average'].rename(key + '_mean').fillna(prior)
        encoded.index = series.index
        return encoded

    ft_trn_series = _apply(trn_series)
    ft_tst_series = _apply(tst_series)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

def add_noise(series, noise_level):
    """Scale every element of ``series`` by ``1 + noise_level * N(0, 1)``.

    One standard-normal draw per element comes from numpy's global random
    state; ``noise_level=0`` leaves the values unchanged.
    """
    jitter = np.random.randn(len(series))
    factor = 1 + noise_level * jitter
    return series * factor

def siner(x,period):
    """Sine of the position of scalar ``x`` on a cycle of length ``period``."""
    angle = 2*np.pi*x/period
    return sin(angle)
def coser(x,period):
    """Cosine of the position of scalar ``x`` on a cycle of length ``period``."""
    angle = 2*np.pi*x/period
    return cos(angle)


In [3]:
# Encoder choice per feature, read from an earlier selection run
# (header-less CSV whose first column becomes the index).
Encoding = pd.read_csv('../input/transformer-selector-ord/results.csv',
                       index_col=0, header=None)

# Flatten the frame's values into a string Series that keeps the same index.
Encoding = pd.Series(Encoding.values.flatten(), dtype='str', index=Encoding.index)
# Override every stored choice: all features end up one-hot encoded.
Encoding.loc[Encoding != 'one_hot'] = 'one_hot'
Encoding

0
bin_0    one_hot
bin_1    one_hot
bin_2    one_hot
bin_3    one_hot
bin_4    one_hot
nom_0    one_hot
nom_1    one_hot
nom_2    one_hot
nom_3    one_hot
nom_4    one_hot
nom_5    one_hot
nom_6    one_hot
nom_7    one_hot
nom_8    one_hot
nom_9    one_hot
ord_0    one_hot
ord_1    one_hot
ord_2    one_hot
ord_3    one_hot
ord_4    one_hot
ord_5    one_hot
day      one_hot
month    one_hot
dtype: object

In [4]:
path = '../input/cat-in-the-dat/'

train = pd.read_csv(f'{path}train.csv')
test = pd.read_csv(f'{path}test.csv')
sub = pd.read_csv(f'{path}sample_submission.csv')

# Per test column: how many distinct values never occur in train.
difference_set = [len(set(test[col].values) - set(train[col].values)) for col in test.columns]
# Keep the columns with at least one but fewer than 1000 unseen values.
difference_set = test.columns[[0 < x < 1000 for x in difference_set]].tolist()

# Column name -> {'value': set of values present in test but not in train}.
difference_values = {}
for col in difference_set:
    difference_values[col] = {'value': set(test[col].values) - set(train[col].values)}

# Overwrite the unseen test values with the most frequent train value.
for col in difference_values:
    test.loc[test[col].isin(difference_values[col]['value']),col]=train[col].value_counts().idxmax()

target=train['target']
drop_col = ['id','target']

# Only feature columns remain in train / test from here on.
train = train.drop(columns=drop_col)
test = test.drop(columns=['id'])


In [5]:
# Number of training rows; rows of `trainall` from this position on come from test.
train_index = len(train)
# Train stacked on top of test so that both are encoded together.
trainall = pd.concat([train, test])

In [6]:
# Hard-coded hyper-parameters: 'C' is used as-is for the logistic regression;
# every min_* / smoot_* value is truncated to an integer.
params = {'C': 0.16750619378151424}
params.update({
    name: int(value)
    for name, value in {
        'min_bin_4': 999.278226929541,
        'min_nom_2': 998.2264797985416,
        'min_nom_8': 478.6201714813315,
        'min_nom_9': 932.4114630159186,
        'min_ord_1': 998.7815846789609,
        'min_ord_5': 929.8586375729484,
        'smoot_bin_4': 853.6768869921463,
        'smoot_nom_2': 955.674930086958,
        'smoot_nom_8': 818.8237255821915,
        'smoot_nom_9': 627.6620227252247,
        'smoot_ord_1': 857.6148881185935,
        'smoot_ord_5': 192.23171998520255,
    }.items()
})

In [7]:
# Columns for which `params` holds tuned min_samples_leaf / smoothing values.
# The earlier version compared `label` (the encoder name) against these column
# names, so the tuned values could never be selected; the test must be on `col`.
# `choice` forwards them to the 'mean_target' encoder only, so they have no
# effect while every label is 'one_hot'.
# NOTE(review): `params` also defines min_nom_8 / smoot_nom_8, but nom_8 was
# never wired in here; confirm whether it should be added to this tuple.
tuned_cols = ('bin_4', 'nom_2', 'nom_9', 'ord_1', 'ord_5')

encoded_blocks = []
# Series.items() yields (column name, encoder label) pairs and avoids the
# deprecated integer-as-position lookup `Encoding[N]` on a string index.
for col, label in Encoding.items():
    gc.collect()
    encoder_kwargs = {}
    if col in tuned_cols:
        encoder_kwargs = {'min_samples_leaf': params['min_' + col],
                          'smoothing': params['smoot_' + col]}
    encoded_blocks.append(choice(train = trainall,target = target,col = col,label = label,
                                 coo=True,**encoder_kwargs))

# One hstack over all blocks instead of re-copying the growing matrix per column.
Matrix = hstack(encoded_blocks,format='csr')

# Split the jointly encoded matrix back into its train and test rows.
Matrix_train = Matrix[:train_index,:].copy()
Matrix_test = Matrix[train_index:,:].copy()

# 5-fold stratified CV of a logistic regression on the encoded matrix.
# `pred` accumulates the average of the per-fold test predictions and
# `score` the average of the per-fold validation AUCs.
param={'C':params['C'],'max_iter':10000,'solver':'lbfgs','n_jobs':4}
score=0
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True,random_state=0)
pred = np.zeros(sub.shape[0])
# `total` lets the progress bar show real progress for the generator.
for fold_ , (trn_idx, val_idx) in tqdm_notebook(enumerate(folds.split(Matrix_train, target)), total=n_fold):
    # The splitter yields positions, so index the target positionally.
    train_x, train_y = Matrix_train[trn_idx,:], target.iloc[trn_idx]
    valid_x, valid_y = Matrix_train[val_idx,:], target.iloc[val_idx]
    model = LogisticRegression(**param, random_state=0)
    model.fit(train_x, train_y)
    pred += (model.predict_proba(Matrix_test)[:,1])/n_fold
    # Score the validation fold once and reuse it for the sum and the log line.
    fold_auc = roc_auc_score(valid_y,model.predict_proba(valid_x)[:,1])
    score += fold_auc/n_fold
    print('Fold: {}; AUC-ROC: {:.3f}\n'.format(fold_,fold_auc))
print('Final CV Score: {:.3f}'.format(score))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Fold: 0; AUC-ROC: 0.803

Fold: 1; AUC-ROC: 0.803

Fold: 2; AUC-ROC: 0.804

Fold: 3; AUC-ROC: 0.804

Fold: 4; AUC-ROC: 0.802


Final CV Score: 0.803


In [8]:
# Put the fold-averaged test predictions into the submission frame and
# write it to disk without the index column.
sub['target']=pred
sub.to_csv('submission_ensemble.csv', index=False)