In [None]:
## imports

import numpy as np
import pandas as pd

from fastai.structured import *
from fastai.column_data import *
# Display settings: show up to 100 DataFrame columns, and truncate numpy
# arrays longer than 50 elements to 20 items at each edge.
pd.set_option('display.max_columns', 100)
np.set_printoptions(edgeitems=20, threshold=50)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.preprocessing import LabelEncoder
seed =45
% matplotlib inline
plt.style.use('fivethirtyeight')

In [None]:
### Read data
# Folder holding the competition files, and the file names inside it.
PATH='F:/AV/WNS'
train_csv = 'train_LZdllcl.csv'
test_csv = 'test_2umaH9m.csv'
submit_csv = 'sample_submission_M0L0uXE.csv'

### Load train, test and sample-submission files in one pass
train, test, submission = (
    pd.read_csv(f'{PATH}/{file_name}')
    for file_name in (train_csv, test_csv, submit_csv)
)

# Report the (rows, columns) shape of each frame.
shape_report = ('train', train.shape, 'test', test.shape, 'submission', submission.shape)
print("Shape of {}:{} {}:{} {}:{}".format(*shape_report))

In [None]:
### Inspect data: preview the first five rows of the training frame

train.head()

In [None]:
### Class balance: row counts per value of the target column

train.is_promoted.value_counts()

## Author's observation: approx 10% of past employees have been promoted

In [None]:
### Check whether any employee appears more than once

train['employee_id'].is_unique

### True means every employee_id occurs exactly once

In [None]:
## Null values: count missing entries for each column that has at least one

null_mask = train.isnull()
null_columns = train.columns[null_mask.any()]
null_mask[null_columns].sum()

## Author's observation: 2 columns have null values

In [None]:
### Inspect the rows with missing values
edu_missing = train['education'].isnull()
rating_missing = train['previous_year_rating'].isnull()

# a: rows with no education; _: rows missing education OR previous_year_rating.
# NOTE: `_` shadows IPython's "last output" variable; the name is kept because
# later cells read it.
a = train[edu_missing]
_ = train[edu_missing | rating_missing]

print(a.shape,_.shape)

In [None]:
# Does a missing education imply a missing previous_year_rating?
# `_` was built with OR, so it always contains every row of `a`; intersecting
# their indexes is True by construction and proves nothing. Test the
# implication directly on the rows where education is null instead.
train.loc[train['education'].isnull(), 'previous_year_rating'].isnull().all()

### True  -> every row without education also lacks a previous year rating
### False -> at least one row has a rating but no education

In [None]:
# Preview the first ten rows of `_` (rows missing education or previous_year_rating).
_.head(10)

In [None]:
### check avg % of people promoted with NA in previous year rating vs without NA

# Split on previous_year_rating itself. The earlier version compared the
# "education OR rating missing" rows against rows with education present,
# which is not the comparison described above.
rating_missing = train['previous_year_rating'].isnull()
print(np.mean(train.loc[rating_missing, 'is_promoted']),
      np.mean(train.loc[~rating_missing, 'is_promoted']))

In [None]:
### Promotion rate for rows with missing education vs rows with education present

rate_edu_missing = a['is_promoted'].mean()
rate_edu_present = train.loc[train['education'].notnull(), 'is_promoted'].mean()
print(rate_edu_missing, rate_edu_present)

In [None]:
## Missing education becomes the category 'unknown'; missing previous year
## rating becomes the sentinel 9999. The same fill is applied to train and test.
# NOTE(review): previous_year_rating stays numeric, so 9999 enters the linear
# model and the KNN distances as a real magnitude — confirm this is intended.
for frame in (train, test):
    frame['education'] = frame['education'].fillna('unknown')
    frame['previous_year_rating'] = frame['previous_year_rating'].fillna(9999)

In [None]:
# Preview the first twelve rows of train after the missing values were filled.
train.head(12)

In [None]:
### Merge train and test so that encoding is applied to both consistently
# Pull the target out of train (removing the column) and keep it as an array.
Y = train.pop('is_promoted').values

# Tag each row with its origin so the merged frame can be split again later.
train['train'] = 'train'
test['train'] = 'test'

# Row labels are not reset, so train and test index values repeat in `merged`.
merged = pd.concat([train, test])
merged.shape

In [None]:
# Categorical columns: every object-dtype column plus the two flag columns.
# The list still contains the 'train' origin marker at this point.
object_cols = merged.select_dtypes(include=['object']).columns.tolist()
cat_cols = object_cols + ['KPIs_met >80%','awards_won?']
cat_cols

In [None]:
# 'train' is the train/test origin marker, not a feature to encode.
# Filtering (rather than list.remove) keeps this cell safe to re-run:
# remove() raises ValueError once the entry is already gone.
cat_cols = [col for col in cat_cols if col != 'train']
print(cat_cols)

In [None]:
def pre_process(df,cat_cols):
    """One-hot encode `cat_cols` and split the frame back into train and test.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined frame holding a 'train' column whose values are 'train' or
        'test'. It is not modified.
    cat_cols : list of str
        Columns handed to ``pd.get_dummies``. With its default settings only
        object/category columns are expanded; numeric columns in the list are
        passed through unchanged.

    Returns
    -------
    (new_tr, new_tst) : tuple of pandas.DataFrame
        Rows marked 'train' and rows marked 'test', each without the 'train'
        marker column.
    """
    encoded = pd.get_dummies(df[cat_cols])
    # Drop on a copy so the caller's frame keeps its columns, and pass `axis`
    # by keyword (positional axis in pd.concat is not accepted by pandas 2).
    combined = pd.concat([df.drop(cat_cols, axis=1), encoded], axis=1)
    origin = combined['train']
    # Non-inplace drops return new frames, avoiding SettingWithCopyWarning.
    new_tr = combined[origin == 'train'].drop('train', axis=1)
    new_tst = combined[origin == 'test'].drop('train', axis=1)
    return new_tr, new_tst

In [None]:
# Encode the categorical columns and split back into train / test frames.
train_OHE, test_OHE = pre_process(df=merged, cat_cols=cat_cols)

In [None]:
### nearest neighbour features
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

def make_unsupervised_knn_HC(N_NEIGHBORS):
    """Create columns with the distance from each point to its k-th neighbour.

    Reads the module-level ``train_OHE`` / ``test_OHE`` frames, fits a
    NearestNeighbors model on the training rows and returns copies of both
    frames extended with ``dist_1_neigh`` ... ``dist_{N_NEIGHBORS-1}_neigh``.
    The global frames themselves are not modified; use the returned frames.

    Parameters
    ----------
    N_NEIGHBORS : int
        One more than the number of distance columns to create (the original
        column naming, dist_1 .. dist_{N_NEIGHBORS-1}, is preserved).

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    n_dist_cols = N_NEIGHBORS - 1
    if n_dist_cols < 1:
        # Nothing to add (the original loop body never ran in this case).
        return train_OHE, test_OHE

    # employee_id is an identifier, not a feature: leaving it in would let the
    # ID gap between two employees contribute to (and dominate) the distance.
    feature_cols = [col for col in train_OHE.columns if col != 'employee_id']

    # ball tree works faster, for better results use 'auto'
    model = NearestNeighbors(n_neighbors=n_dist_cols, algorithm='ball_tree', n_jobs=-1)
    model.fit(train_OHE[feature_cols])

    # kneighbors() without X queries the fitted points and does not count a
    # point as its own neighbour, so column j is the (j+1)-th nearest *other*
    # training row. For test rows column j is the (j+1)-th nearest training
    # row, so dist_k now means the same thing in both frames (previously the
    # test frame silently skipped its true nearest neighbour).
    k_distances, _indices = model.kneighbors()
    k_distances_test, _indices_test = model.kneighbors(test_OHE[feature_cols])

    dist_cols = ["dist_{}_neigh".format(k) for k in range(1, N_NEIGHBORS)]
    # Attach all distance columns in one concat instead of inserting them one
    # at a time into frames that may be slices of another frame.
    train_dist = pd.DataFrame(k_distances, index=train_OHE.index, columns=dist_cols)
    test_dist = pd.DataFrame(k_distances_test, index=test_OHE.index, columns=dist_cols)
    train_knn = pd.concat([train_OHE, train_dist], axis=1)
    test_knn = pd.concat([test_OHE, test_dist], axis=1)
    return train_knn, test_knn

In [None]:
# Add the nearest-neighbour distance features (dist_1_neigh .. dist_499_neigh).
train_OHE, test_OHE = make_unsupervised_knn_HC(N_NEIGHBORS=500)

In [None]:
# (rows, columns) of the training frame after the KNN distance columns were added.
train_OHE.shape

In [None]:
### Feature columns: every column of train_OHE except the employee identifier
X_cols = list(train_OHE.columns)
X_cols.remove('employee_id')

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Grid search over the inverse regularisation strength C of a class-balanced
# logistic regression, scored by F1 with 10-fold cross-validation.
param = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
logreg = LogisticRegression(class_weight='balanced')
clf = GridSearchCV(
    estimator=logreg,
    param_grid=param,
    scoring='f1',
    refit=True,
    cv=10,
    verbose=1,
)
clf.fit(train_OHE[X_cols], Y)
print('Best F1: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_))

In [None]:
# Keep the C value selected by the grid search for the final model.
best_c = clf.best_params_['C']
print(best_c)

In [None]:
# train_OHE = train_OHE.reset_index()
# test_OHE = test_OHE.reset_index()

In [None]:
def cutoff_predict_own(pred,cutoff):
    """Turn scores into 0/1 labels: 1 where the score is strictly above `cutoff`.

    Parameters
    ----------
    pred : array-like of float
        Scores or probabilities; lists are accepted as well as numpy arrays.
    cutoff : float
        Decision threshold.

    Returns
    -------
    numpy.ndarray of int
    """
    # np.asarray makes the comparison element-wise for plain lists too; a bare
    # `list > float` raises TypeError unless `cutoff` is a numpy scalar.
    return (np.asarray(pred) > cutoff).astype(int)

def cutoff_predict(clf,X,cutoff):
    """Label rows of `X` as 1 when the classifier's positive-class probability
    (second column of ``predict_proba``) is strictly above `cutoff`, else 0."""
    positive_proba = clf.predict_proba(X)[:, 1]
    is_positive = positive_proba > cutoff
    return is_positive.astype(int)

def custom_f1(y,pred,cutoff):
    """F1 score of the labels obtained by thresholding `pred` at `cutoff`.

    Parameters
    ----------
    y : array-like
        True 0/1 labels.
    pred : array-like of float
        Predicted scores or probabilities.
    cutoff : float
        Scores strictly above this value are labelled 1.

    Returns
    -------
    float
    """
    # Convert up front so list inputs are thresholded element-wise.
    ypred = cutoff_predict_own(np.asarray(pred), cutoff)
    # Use the directly imported f1_score; the module name `sklearn` is never
    # imported explicitly in this notebook.
    return f1_score(y, ypred)

In [None]:
# 5-fold stratified CV of the final logistic regression.
# Collects out-of-fold probabilities (holdout) with their labels (holdy),
# accumulates test-set probabilities across folds (pred_test_full holds the
# SUM, not the mean), then scores a range of decision cutoffs on the pooled
# out-of-fold predictions.
kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=seed)
pred_test_full = 0
cv_score = []
X = train_OHE[X_cols]
y = Y
x_test = test_OHE[X_cols]
custom = []
holdy = []
holdout = []
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('{} of KFold {}'.format(i,kf.n_splits))
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y[train_index], y[test_index]

    # model
    # Pass best_c through unchanged: int() truncated 0.001 / 0.01 / 0.1 to 0,
    # which is not a valid value for C.
    # NOTE(review): the grid search tuned C with class_weight='balanced' but
    # this model is unweighted — confirm that is intended.
    lr = LogisticRegression(C=best_c)
    lr.fit(xtr, ytr)
    val_labels = lr.predict(xvl)
    holdout.extend(lr.predict_proba(xvl)[:, 1])
    holdy.extend(yvl)

    ## score (labels at the model's default decision rule)
    score = f1_score(yvl, val_labels)
    print('F1:',score)
    cv_score.append(score)
    pred_test_full += lr.predict_proba(x_test)[:, 1]

# Threshold search on the pooled out-of-fold predictions. It runs after the
# loop, so it no longer depends on a hard-coded fold number.
print("Running finetune for threshold")
for cutoff in np.arange(0.2,0.5,0.05):
    custom_scr = custom_f1(np.asarray(holdy), np.asarray(holdout), cutoff)
    custom.append({'fld':kf.n_splits,'cutoff':np.round(cutoff,2),'custom':np.round(custom_scr,4)})

In [None]:
# Out-of-fold F1 ('custom') for each decision cutoff tried in the CV cell.
custom

In [None]:
# Turn the per-fold sum of test probabilities into a mean. Divide by the
# actual number of folds rather than a hard-coded 5.
# NOTE: in-place division — run this cell exactly once per CV run.
pred_test_full /= kf.n_splits

In [None]:
from datetime import datetime

In [None]:
def write_submit(predictions,modelname='LogReg_knn_v0',out_dir='F:/AV/WNS/submission'):
    """Write `predictions` into the submission frame and save it as a CSV.

    The file is named ``<out_dir>/<modelname>_<YYYYmmddHHMMSS>.csv`` and its
    path is printed.

    Parameters
    ----------
    predictions : array-like
        Values for the 'is_promoted' column, one per row of the module-level
        ``submission`` frame (which is updated in place).
    modelname : str
        Prefix of the output file name.
    out_dir : str
        Destination directory; created if it does not exist.
    """
    import os  # local import keeps this cell self-contained

    submission['is_promoted'] = predictions
    stamp = datetime.now().strftime('%Y%m%d%H%M%S')
    # to_csv fails when the target directory is missing, so create it first.
    os.makedirs(out_dir, exist_ok=True)
    Fname = str(out_dir)+'/'+str(modelname)+'_'+str(stamp)+'.csv'
    print(Fname)
    submission.to_csv(Fname,index=False)

In [None]:
# is_promoted is a 0/1 label and the notebook optimises F1, so submit labels
# rather than raw averaged probabilities: threshold at the cutoff with the
# best out-of-fold F1 recorded in `custom`.
best_cutoff = max(custom, key=lambda row: row['custom'])['cutoff']
write_submit(cutoff_predict_own(pred_test_full, best_cutoff),modelname='LogReg_knn_v0')