In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np
from scipy.sparse.linalg import svds
from functools import partial
import xgboost as xgb
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, roc_auc_score, log_loss
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
import catboost as cb
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

import scikitplot as skplt

pd.set_option('display.max_columns', None)

In [2]:
import pandas
import numpy as np
import catboost as cb
from sklearn.model_selection import KFold
from paramsearch import paramsearch
from itertools import product,chain

# Approximate SVD on data with missing values via expectation-maximization

In [3]:
def emsvd(Y, k=None, tol=1E-3, maxiter=None):
    """
    Approximate SVD on data with missing values via expectation-maximization.

    Missing entries are first filled with their column means; each EM step
    then runs an SVD on the centered, filled-in matrix, re-imputes the missing
    entries from the reconstruction and updates the column means.

    Inputs:
    -----------
    Y:          (nobs, ndim) data matrix, missing values denoted by NaN/Inf
                (the input is not modified)
    k:          number of singular values/vectors to find (default: k=ndim)
    tol:        convergence tolerance on the relative change in trace norm
    maxiter:    maximum number of EM steps to perform (default: no limit)

    Returns:
    -----------
    Y_hat:      (nobs, ndim) reconstructed data matrix
    mu_hat:     (1, ndim) estimated column means for reconstructed data
    U, s, Vt:   singular values and vectors (see np.linalg.svd and
                scipy.sparse.linalg.svds for details)
    """

    if k is None:
        svdmethod = partial(np.linalg.svd, full_matrices=False)
    else:
        svdmethod = partial(svds, k=k)
    if maxiter is None:
        maxiter = np.inf

    Y = np.asarray(Y, dtype=float)
    valid = np.isfinite(Y)

    # initialize the missing values to their respective column means; Inf is
    # masked as well so that it cannot leak into the means
    mu_hat = np.nanmean(np.where(valid, Y, np.nan), axis=0, keepdims=True)
    Y_hat = np.where(valid, Y, mu_hat)

    ii = 1
    v_prev = 0.0

    while True:

        # SVD on filled-in data
        U, s, Vt = svdmethod(Y_hat - mu_hat)

        # impute missing values (U * s scales the columns, same as U @ diag(s))
        Y_hat[~valid] = ((U * s).dot(Vt) + mu_hat)[~valid]

        # update bias parameter
        mu_hat = Y_hat.mean(axis=0, keepdims=True)

        # test convergence using the magnitude of the relative change in trace
        # norm; a signed comparison would stop on any decrease, and a zero
        # previous norm must not be divided by
        v = s.sum()
        if v_prev == 0:
            converged = (v == 0)
        else:
            converged = abs(v - v_prev) / v_prev < tol

        if ii >= maxiter or converged:
            break
        ii += 1
        v_prev = v

    return Y_hat, mu_hat, U, s, Vt

# Function block

In [4]:
def read_data():
    """Load the competition CSV files and normalise the student column names.

    Returns:
        (train, test, waves, submission) as DataFrames. `train` and `test`
        have their date columns parsed and their columns renamed to
        snake_case; `waves` has its `date` column parsed.
    """
    def _load_students(path):
        # Student tables share the same date columns and the same renames.
        frame = pd.read_csv(path,
                            parse_dates=['Birth date', 'Contract termination date'])
        return frame.rename(columns={
            "Birth date": "birth_date",
            "Native city": "native_city",
            "Wave id": "wave_id",
            "Heard about school from": "heard_from",
            "Life status": "life_status",
            "Contract termination date": "contract_end",
            "Memory entrance game": "memory_game",
            "Logic entrance game": "logic_game",
        })

    train = _load_students('school-21-student-expulsion-prediction/train.csv')
    test = _load_students('school-21-student-expulsion-prediction/test.csv')
    waves = pd.read_csv('school-21-student-expulsion-prediction/waves_dates.csv',
                        parse_dates=['date'])
    submission = pd.read_csv('school-21-student-expulsion-prediction/sample_submission.csv')

    print('Data has been read')
    return train, test, waves, submission

def age(df):
    """Replace `birth_date` with an integer `age` column (whole 365-day years).

    The age is measured against the current time, so results depend on when
    the function is run. The computation is vectorised, works with any index
    and leaves the caller's frame untouched.

    Args:
        df: DataFrame with a datetime `birth_date` column.

    Returns:
        A new DataFrame without `birth_date` and with `age` appended as the
        last column (NaN where the birth date is missing).
    """
    now = dt.datetime.now()
    elapsed = now - df['birth_date']
    out = df.drop('birth_date', axis=1)
    # floor-divide days by 365, matching the original per-row computation
    out['age'] = elapsed.dt.days // 365
    return out

def moscow(df):
    """Replace `native_city` with a 0/1 `moscow` indicator column.

    City names are stripped of surrounding whitespace before matching (the
    stripped values are actually used here), and a row is flagged when the
    name equals 'Москва' or 'Moscow'. Missing cities map to 0.

    Args:
        df: DataFrame with a string `native_city` column.

    Returns:
        A new DataFrame without `native_city` and with `moscow` appended as
        the last column; the caller's frame is not modified.
    """
    cities = df['native_city'].str.strip()
    out = df.drop('native_city', axis=1)
    out['moscow'] = cities.isin(['Москва', 'Moscow']).astype(int)
    return out

def gender(df):
    """Encode the `Gender` column in place: 1 for 'male', 0 for anything else.

    Returns the same DataFrame object that was passed in.
    """
    def _is_male(value):
        if value == 'male':
            return 1
        return 0

    df['Gender'] = df['Gender'].map(_is_male)
    return df

def svd(df):
    """Fill missing values in the score columns using `emsvd`.

    The listed columns are passed to `emsvd` as one matrix; the reconstructed
    matrix is truncated to integers and written back into `df` in place.

    Returns the same DataFrame object that was passed in.
    """
    score_columns = [
        'day_00', 'day_01', 'day_02', 'day_03', 'day_04', 'day_05', 'day_06',
        'day_07', 'day_08', 'day_09', 'day_10', 'day_11', 'day_12', 'day_13',
        'evalexpr', 'match_n_match', 'bsq',
        'rush_00', 'rush_01', 'rush_02',
        'exam_00', 'exam_01', 'exam_02', 'exam_final',
    ]
    observed = df[score_columns].to_numpy()
    # only the reconstructed matrix is needed; means and factors are discarded
    reconstructed = emsvd(observed)[0]
    df[score_columns] = reconstructed.astype(int)
    return df

def days_study(df):
    """Replace `date` and `contract_end` with a `days_study` column.

    `days_study` is the number of whole days between `date` and
    `contract_end`; it is NaN where either date is missing. The computation
    is vectorised, works with any index and leaves the caller's frame
    untouched.

    Args:
        df: DataFrame with datetime `date` and `contract_end` columns.

    Returns:
        A new DataFrame without `date` / `contract_end` and with `days_study`
        appended as the last column.
    """
    elapsed = df['contract_end'] - df['date']
    out = df.drop(['date', 'contract_end'], axis=1)
    out['days_study'] = elapsed.dt.days
    return out

def merge_waves(df, waves):
    """Join each row of `df` to its wave record and drop the wave's own id.

    Rows are matched on `df.wave_id == waves.id` (default inner join). The
    `id_y` column produced by the merge is removed from the result.
    """
    joined = df.merge(waves, left_on='wave_id', right_on='id')
    return joined.drop(columns='id_y')

def f(x):
    """Map any negative value to -1 and return every other value unchanged."""
    return -1 if x < 0 else x

def pipeline(df, train=False):
    """Run the full feature-engineering chain on a raw student frame.

    Steps, in order: age, moscow, gender, svd, merge with the module-level
    `waves` frame, days_study, one-hot encoding of `wave_id`, `heard_from`
    and `life_status`, and finally `f` applied to every value of every column.

    Args:
        df: raw student DataFrame as returned by `read_data`.
        train: accepted for the caller's convenience; the same encoding is
            applied whether it is True or False.

    Returns:
        The transformed DataFrame.
    """
    for step in (age, moscow, gender, svd):
        df = step(df)

    # `waves` is read from the enclosing (notebook) namespace
    df = merge_waves(df, waves)
    df = days_study(df)

    df = pd.get_dummies(df, columns=['wave_id', 'heard_from', 'life_status'])

    for column in df.columns:
        df[column] = df[column].apply(f)
    return df

# Data preparation

In [None]:
# Load the raw frames; `waves` is also read as a global by `pipeline`.
train, test, waves, submission = read_data()

Data has been read


In [None]:
# Build the model-ready frames. For the training frame the first column is
# dropped by position (presumably the id column kept by the waves merge — TODO confirm).
train_df = pipeline(train, True).iloc[:, 1:]
# The test frame keeps its first column here; it is sliced off when X_TEST is built.
test_df = pipeline(test, False)
train_df

Unnamed: 0,Gender,Level,day_00,day_01,day_02,day_03,day_04,day_05,day_06,day_07,day_08,day_09,day_10,day_11,day_12,day_13,evalexpr,match_n_match,bsq,rush_00,rush_01,rush_02,exam_00,exam_01,exam_02,exam_final,memory_game,logic_game,contract_status,age,moscow,days_study,wave_id_1,wave_id_2,wave_id_3,wave_id_4,heard_from_facebook_ads,heard_from_from_21_student,heard_from_from_friends,heard_from_from_parents,heard_from_from_teachers,heard_from_google_ads,heard_from_other,heard_from_vk_ads,heard_from_yandex_ads,life_status_other,life_status_study,life_status_work
0,1,7.463235,5,12,0,80,0,1,0,15,10,0,50,21,20,55,81,0,0,0,0,0,75,43,100,44,9,11,1,30,0,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
1,1,4.272727,0,0,15,5,85,3,50,40,0,12,40,5,0,0,0,60,0,120,0,0,80,38,0,45,7,12,1,51,0,,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
2,1,4.256684,0,0,0,0,0,0,0,0,0,26,0,14,0,30,0,0,0,0,42,0,70,16,33,36,10,10,1,20,1,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,6.470833,70,70,50,50,30,5,100,60,40,21,40,58,0,40,0,60,0,120,10,0,60,48,80,45,8,18,1,33,1,,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1
4,1,3.551515,10,10,35,30,5,1,70,40,0,9,20,0,0,0,0,0,50,10,0,0,50,32,33,18,8,11,0,24,0,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1055,0,0.477876,0,0,15,17,20,0,0,0,0,0,25,5,0,30,0,36,0,51,10,0,0,11,0,22,8,6,1,21,0,,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
1056,1,3.369697,7,12,0,30,50,0,70,10,15,0,50,19,20,0,0,0,0,100,0,0,60,38,59,31,8,10,1,31,0,,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
1057,0,3.563636,0,0,35,10,85,1,0,0,0,0,50,5,0,25,0,0,0,100,0,0,65,59,64,35,9,10,1,18,0,,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
1058,1,0.265487,10,0,0,10,30,0,20,0,0,3,0,0,0,0,0,0,0,10,0,0,15,16,0,9,8,6,1,28,0,,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1


In [None]:
# Test features: every column of test_df except the first (positional slice).
X_TEST = test_df.iloc[:, 1:]
X_TEST

Unnamed: 0,Gender,Level,day_00,day_01,day_02,day_03,day_04,day_05,day_06,day_07,day_08,day_09,day_10,day_11,day_12,day_13,evalexpr,match_n_match,bsq,rush_00,rush_01,rush_02,exam_00,exam_01,exam_02,exam_final,memory_game,logic_game,age,moscow,days_study,wave_id_1,wave_id_2,wave_id_3,wave_id_4,heard_from_facebook_ads,heard_from_from_21_student,heard_from_from_friends,heard_from_from_parents,heard_from_from_teachers,heard_from_google_ads,heard_from_other,heard_from_vk_ads,heard_from_yandex_ads,life_status_other,life_status_study,life_status_work
0,1,8.834416,10,12,80,40,30,5,70,10,40,0,85,21,0,20,100,0,100,120,0,0,75,43,64,45,10,12,27,0,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
1,1,4.310160,5,0,0,5,10,-1,35,20,5,9,35,21,10,10,0,60,0,0,0,75,0,6,33,9,7,10,31,0,,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
2,1,4.791444,0,70,15,30,5,0,0,0,0,9,0,17,0,0,0,0,0,100,0,0,0,16,38,36,8,13,27,0,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,5.476415,5,12,0,5,0,2,100,0,10,18,0,5,27,30,100,100,0,0,0,0,95,0,75,31,7,11,21,0,,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0
4,1,4.657754,30,60,15,7,0,0,0,0,0,11,0,0,0,0,0,0,0,0,7,0,0,22,0,13,8,6,31,0,,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
701,1,4.208556,10,12,0,0,10,5,20,40,0,0,10,5,0,0,0,66,100,5,0,0,0,32,22,22,8,13,26,0,,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
702,0,4.786096,20,15,10,40,20,10,100,20,0,0,50,21,35,25,0,100,0,120,0,0,0,27,80,40,8,7,30,0,,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
703,0,1.703125,50,0,15,10,0,10,35,0,0,24,40,0,0,0,0,37,0,0,7,75,60,16,16,18,7,8,23,0,,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0
704,1,2.475862,40,55,0,0,0,10,0,0,0,19,30,5,0,0,0,60,0,49,0,0,0,27,27,22,7,12,21,1,,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0


In [None]:
# Split the training frame into a feature matrix and the target vector,
# both converted to NumPy arrays.
X = train_df.drop('contract_status', axis=1).to_numpy()
y = train_df['contract_status'].to_numpy()

# XGBoost classification with cross-validation

In [None]:
# Repeated hold-out evaluation of a default XGBoost classifier: 10 stratified
# 85/15 splits, each with 5-fold ROC-AUC cross-validation on the training part
# and log-loss on the held-out part.
xgb_losses = []
boosters = []  # not populated by this cell
for i in range(10):
    # Seed every split with the repeat index so the reported losses are
    # reproducible across kernel restarts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, stratify=y, test_size=0.15, random_state=i)
    booster = xgb.XGBClassifier()

    scores_boost = cross_val_score(booster, X_train, y_train, cv=5, scoring='roc_auc')
    print('XGB scores =', scores_boost)
    booster.fit(X_train, y_train)
    preds = booster.predict_proba(X_test)
    # compute the hold-out loss once and reuse it for printing and averaging
    loss = log_loss(y_test, preds)
    print('Logloss= ', loss)
    xgb_losses.append(loss)
print('Mean logloss =', np.mean(xgb_losses))
# ROC curve for the last split only (y_test / preds from the final repeat)
skplt.metrics.plot_roc(y_test, preds, figsize=(10,10))

XGB scores = [0.93769231 0.93883547 0.95192308 0.9644765  0.93189103]
Logloss=  0.2065783666889014
XGB scores = [0.94282051 0.9241453  0.94684829 0.97061966 0.91346154]
Logloss=  0.08724478608194161
XGB scores = [0.92435897 0.95779915 0.95245726 0.97622863 0.88434829]
Logloss=  0.2126812226452905
XGB scores = [0.95871795 0.96047009 0.97622863 0.95940171 0.93910256]
Logloss=  0.2867242362134212
XGB scores = [0.97205128 0.9329594  0.94764957 0.98637821 0.9420406 ]
Logloss=  0.28056118660723706
XGB scores = [0.92641026 0.90090812 0.95112179 0.97863248 0.96233974]
Logloss=  0.25268909324487315
XGB scores = [0.94461538 0.98183761 0.96741453 0.90651709 0.96768162]
Logloss=  0.19861584458783954
XGB scores = [0.95153846 0.92788462 0.96901709 0.91666667 0.95779915]
Logloss=  0.18307574559095208
XGB scores = [0.9325641  0.94524573 0.97195513 0.9599359  0.93482906]
Logloss=  0.10832806573450038
XGB scores = [0.93871795 0.95619658 0.93162393 0.95646368 0.94791667]
Logloss=  0.1324547760231549
Mean

<matplotlib.axes._subplots.AxesSubplot at 0x7f0c0b2a8438>

# CatBoost classification with cross-validation

In [None]:
# Repeated hold-out evaluation of CatBoost with fixed hyper-parameters:
# 10 stratified 85/15 splits, each with 5-fold ROC-AUC cross-validation on the
# training part and log-loss on the held-out part.
cat_losses = []

bestparams = {'depth': 1,
 'iterations': 100,
 'learning_rate': 0.3,
 'l2_leaf_reg': 3,
 'border_count': 200,
 'thread_count': 4,
 'verbose': False}

for i in range(10):
    # Seed every split with the repeat index so the reported losses are
    # reproducible across kernel restarts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, stratify=y, test_size=0.15, random_state=i)
    cat = CatBoostClassifier(**bestparams)
    scores_cat = cross_val_score(cat, X_train, y_train, cv=5, scoring='roc_auc')
    print('Catboost scores =', scores_cat)
    cat.fit(X_train, y_train, verbose=False)
    preds = cat.predict_proba(X_test)
    # compute the hold-out loss once and reuse it for printing and averaging
    loss = log_loss(y_test, preds)
    print('Logloss= ', loss)
    cat_losses.append(loss)
print('Mean logloss =', np.mean(cat_losses))
# ROC curve for the last split only (y_test / preds from the final repeat)
skplt.metrics.plot_roc(y_test, preds, figsize=(10,10))

# Ensembling (soft voting)

In [None]:
# Final model: a soft-voting ensemble of a default XGBoost classifier and the
# CatBoost classifier configured with `bestparams`, fitted on the full training
# set (X, y). The commented-out lines are an optional hold-out evaluation of
# the same ensemble.
# X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y, test_size=0.15)
# lr = LogisticRegression()
booster = xgb.XGBClassifier()
cat = CatBoostClassifier(**bestparams)
eclf = VotingClassifier(estimators=[
         ('xgb', booster), ('catboost', cat)], voting='soft')
eclf = eclf.fit(X, y)
# eclf = eclf.fit(X_train, y_train)
# preds = eclf.predict_proba(X_test)
# print('Logloss= ', log_loss(y_test, preds))
# skplt.metrics.plot_roc(y_test, preds, figsize=(10,10))

In [None]:
# Class probabilities from the fitted ensemble for the test features; this
# rebinds `preds`, which the evaluation cells above also assign.
preds = eclf.predict_proba(X_TEST.to_numpy())

In [None]:
# Store the second probability column (index 1) as the prediction. Bracket
# assignment always writes a DataFrame column; attribute-style assignment would
# only set a plain Python attribute if the column did not already exist, and
# the exported CSV would then lack the predictions.
submission['contract_status'] = preds[:, 1]
submission

In [None]:
# Write the submission file without the DataFrame index.
submission.to_csv('sub_3.csv', index=False)

# How to tune the parameters for CatBoost

In [None]:
# Candidate values per CatBoost hyper-parameter, handed to `paramsearch`.
# NOTE(review): the first entry of each list looks like the starting/default
# value used while other parameters are searched — confirm against paramsearch.
# `thread_count` is a single value rather than a list, i.e. it is not searched.
params = {'depth':[1,2,3,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200],
#           'ctr_border_count':[50,5,10,20,100,200],
          'thread_count':4}

def crossvaltest(params,train_set,train_label,cat_dims,n_splits=3):
    """Mean K-fold accuracy of a CatBoost classifier built from `params`.

    Args:
        params: keyword arguments for `cb.CatBoostClassifier`.
        train_set: feature table; a pandas DataFrame or any array-like
            (e.g. the NumPy matrix `X` used elsewhere in this notebook).
        train_label: labels aligned with `train_set`; pandas object or
            array-like.
        cat_dims: forwarded to `fit` as `cat_features` (may be None).
        n_splits: number of shuffled folds.

    Returns:
        The accuracy averaged over the folds.
    """
    def _rows(data, idx):
        # Positional row selection that works for pandas objects and for
        # plain arrays, which have no `.iloc`.
        if hasattr(data, 'iloc'):
            return data.iloc[idx]
        return np.asarray(data)[idx]

    kf = KFold(n_splits=n_splits,shuffle=True)
    res = []
    for train_index, test_index in kf.split(train_set):
        train = _rows(train_set, train_index)
        test = _rows(train_set, test_index)

        labels = _rows(train_label, train_index)
        test_labels = _rows(train_label, test_index)

        clf = cb.CatBoostClassifier(**params)
        clf.fit(train, np.ravel(labels), cat_features=cat_dims)

        # Flatten both sides: a (n, 1) prediction array compared with (n,)
        # labels would otherwise broadcast to an (n, n) matrix.
        predicted = np.ravel(clf.predict(test))
        res.append(np.mean(predicted == np.ravel(test_labels)))
    return np.mean(res)

def catboost_param_tune(params,train_set,train_label,cat_dims=None,n_splits=3):
    """Search CatBoost hyper-parameters with `paramsearch` and cross-validation.

    Each candidate produced by the chained `ps.grid_search(...)` calls is
    scored with `crossvaltest`, the score is registered on the search object,
    and a progress line is printed. The grids are visited in this order:
    `border_count`, `l2_leaf_reg`, (`iterations`, `learning_rate`) jointly,
    then `depth`.

    Args:
        params: search space passed to `paramsearch`.
        train_set, train_label, cat_dims, n_splits: forwarded to `crossvaltest`.

    Returns:
        Whatever `ps.bestparam()` reports after all candidates were scored
        (presumably the best-scoring parameter dict — confirm against paramsearch).
    """
    ps = paramsearch(params)
    # search 'border_count', 'l2_leaf_reg' etc. individually 
    #   but 'iterations','learning_rate' together
    for prms in chain(ps.grid_search(['border_count']),
#                       ps.grid_search(['ctr_border_count']),
                      ps.grid_search(['l2_leaf_reg']),
                      ps.grid_search(['iterations','learning_rate']),
                      ps.grid_search(['depth'])):
        res = crossvaltest(prms,train_set,train_label,cat_dims,n_splits)
        # save the crossvalidation result so that future iterations can reuse the best parameters
        ps.register_result(res,prms)
        print(res,prms,'best:',ps.bestscore(),ps.bestparam())
    return ps.bestparam()

# Run the search on the training arrays. This rebinds `bestparams`, replacing
# the hand-written dict defined in the CatBoost evaluation cell.
bestparams = catboost_param_tune(params, X, y)