In [9]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import auc,roc_auc_score, accuracy_score, confusion_matrix, f1_score, precision_score, \
recall_score, matthews_corrcoef, precision_recall_curve
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import PolynomialFeatures,LabelEncoder
from sklearn.feature_selection import VarianceThreshold
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, f1_score, precision_score, \
recall_score, matthews_corrcoef, precision_recall_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
import lightgbm as lgb

In [2]:
#like jitter function of Tableau
def add_noise(series, noise_level):
    """Multiply every element of `series` by a random factor close to 1.

    Parameters
    ----------
    series : sized, broadcastable container (e.g. pandas Series)
        Values to perturb.
    noise_level : float
        Noise expressed as a proportion (0-1); 0.01 means 1%.
        A value of 0 returns the input values unchanged.

    Returns
    -------
    `series * (1 + noise_level * z)` where `z` is one standard-normal draw
    per element, taken from numpy's global random state.
    """
    standard_normal_draws = np.random.randn(len(series))
    multiplier = 1 + noise_level * standard_normal_draws
    return series * multiplier

In [5]:
def target_encode(trn_series=None,
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=100,
                  smoothing=10,
                  noise_level=0):
    """
    Smoothed target (mean) encoding of a single categorical variable.

    For every level seen in `trn_series` the encoding is a blend of the
    level's target mean and the global target mean (the prior):

        weight   = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))
        encoding = prior * (1 - weight) + level_mean * weight

    Parameters
    ----------
    trn_series : pandas Series
        Training values of the variable; the encoding is learnt from it.
    val_series, tst_series : pandas Series or None
        Validation / test values of the same variable (same `.name` as
        `trn_series`). Either may be None, in which case None is returned
        in the corresponding position.
    target : pandas Series
        Target values, same length as `trn_series`.
    min_samples_leaf : number
        Level count at which the level mean and the prior are weighted
        equally (the inflexion point of the sigmoid above).
    smoothing : number
        Controls the slope of the sigmoid around the inflexion point.
    noise_level : float
        Proportion passed to `add_noise` for every returned series.

    Returns
    -------
    (encoded_trn, encoded_val, encoded_tst), each named `<name>_mean` and
    carrying the index of the series it was built from.

    Notes
    -----
    Levels without a learnt encoding receive the prior. This covers levels
    unseen in training and, with pandas' default groupby behaviour (NaN keys
    are dropped), missing values as well.
    """
    assert len(trn_series) == len(target), "trn_series and target must have the same length"
    # All provided series must describe the same variable.
    for other_series in (val_series, tst_series):
        if other_series is not None:
            assert other_series.name == trn_series.name, "series names must match"

    temp = pd.concat([trn_series, target], axis=1)
    # Per-level target mean and number of observations.
    level_stats = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Sigmoid weight: the bigger the count, the less the prior is taken into account.
    # Kept in its own variable so the `smoothing` parameter is not shadowed.
    weight = 1 / (1 + np.exp(-(level_stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    encoding = (prior * (1 - weight) + level_stats["mean"] * weight).rename('average')
    # Two-column lookup table: [<variable name>, 'average'].
    lookup = encoding.reset_index()

    def _encode(series):
        # Map one series through the lookup table, falling back to the prior.
        if series is None:
            return None
        encoded = pd.merge(
            series.to_frame(series.name),
            lookup,
            on=series.name,
            how='left')['average'].rename(series.name + '_mean').fillna(prior)
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return add_noise(encoded, noise_level)

    # Order (trn, val, tst) is kept so random draws are consumed as before.
    return _encode(trn_series), _encode(val_series), _encode(tst_series)

In [None]:
def printScore(pred, label):
    """Collect several classification metrics into a dictionary.

    Despite the name, nothing is printed; the dictionary is returned.

    Parameters
    ----------
    pred : array-like
        Predictions; passed unchanged as the second argument of every metric.
        NOTE(review): the same array feeds both label-based metrics
        (accuracy, precision, recall, f1) and curve-based ones; presumably
        it holds hard class predictions. Confirm with callers.
    label : array-like
        Ground-truth labels; passed as the first argument of every metric.

    Returns
    -------
    dict with keys 'auc', 'accuracy', 'precision', 'recall', 'f1' and
    'pr_auc_score' (area under the precision-recall curve).
    """
    metric_functions = (
        ('auc', roc_auc_score),
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    scores = {}
    for metric_name, compute in metric_functions:
        scores[metric_name] = compute(label, pred)

    # Area under the precision-recall curve (recall on x, precision on y).
    curve_precision, curve_recall, _ = precision_recall_curve(label, pred)
    scores['pr_auc_score'] = auc(curve_recall, curve_precision)
    return scores

In [5]:
def frequency_encode(trn_df,
                     val_df,
                     tst_df,
                     col_names):
    """Frequency-encode categorical columns using counts learnt on the train set.

    For every column `c` in `col_names`, the number of occurrences of each
    level in `trn_df[c]` is computed and looked up for the rows of the three
    frames. The result column is named `c + '_freq'`.

    Parameters
    ----------
    trn_df, val_df, tst_df : pandas DataFrame
        Train / validation / test frames; each must contain `col_names`.
    col_names : iterable of str
        Columns to encode.

    Returns
    -------
    (train_freq, val_freq, test_freq) : DataFrames holding only the
    `*_freq` columns, in `col_names` order, each with a fresh 0..n-1 index.
    With empty `col_names`, three empty DataFrames are returned.

    Notes
    -----
    Values without a train count get frequency 0 in all three frames. This
    covers new levels in val/test and missing values, which `value_counts`
    does not count. Train rows used to keep NaN for missing values while
    val/test rows got 0; the splits are now treated the same way.
    """
    train_parts, val_parts, test_parts = [], [], []

    for col in col_names:
        freq_col = col + '_freq'
        # Lookup table [col, col_freq] learnt from the train set only.
        freq = pd.DataFrame(trn_df[col].value_counts()).reset_index()
        freq.columns = [col, freq_col]

        for source_df, parts in ((trn_df, train_parts),
                                 (val_df, val_parts),
                                 (tst_df, test_parts)):
            keys = source_df[[col]].reset_index(drop=True)
            # A left join keeps one output row per input row, in input order.
            looked_up = pd.merge(keys, freq, how='left', on=[col])
            parts.append(looked_up[[freq_col]].fillna(0))

    if not train_parts:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Concatenate once per split. This also avoids relying on a "result is
    # still empty" check, which misfires when the train frame has no rows.
    return (pd.concat(train_parts, axis=1),
            pd.concat(val_parts, axis=1),
            pd.concat(test_parts, axis=1))

In [72]:
def binary_encode(trn_df,val_df,tst_df,col_name):
    """Binary-encode one categorical column consistently across three frames.

    The column is first label-encoded with a `LabelEncoder` fitted on the
    values of all three frames together, then every integer label is written
    in base 2, one bit per output column.

    Parameters
    ----------
    trn_df, val_df, tst_df : pandas DataFrame
        Train / validation / test frames; each must contain `col_name`.
    col_name : str
        Column to encode.

    Returns
    -------
    (train_bits, val_bits, test_bits) : DataFrames of dtype uint8 holding
    only the bit columns, named `<col_name>BE_bin_<k>` with k = 0 the most
    significant bit. Each has a fresh 0..n-1 index and as many rows as the
    corresponding input frame.

    Notes
    -----
    The bit matrix is built directly from the label array instead of joining
    on the original index, so input frames with duplicate index labels no
    longer produce extra rows. The input frames are never written to.
    """
    lbl = LabelEncoder()
    lbl.fit(list(trn_df[col_name].values)
            + list(val_df[col_name].values)
            + list(tst_df[col_name].values))

    # The largest label is len(classes_) - 1 because the encoder was fitted on
    # the union of the three splits; this also works when a split is empty.
    largest_label = max(len(lbl.classes_) - 1, 0)
    # Number of digits required to represent the largest label in binary.
    n_bits = len("{0:b}".format(largest_label))
    # Shift amounts, most significant bit first.
    shifts = np.arange(n_bits - 1, -1, -1)
    bit_columns = [col_name+'BE' + '_bin_' + str(position) for position in range(n_bits)]

    def _to_bits(frame):
        # Label-encode one split and expand the labels into a 0/1 matrix.
        labels = np.asarray(lbl.transform(list(frame[col_name].values)), dtype=np.int64)
        bits = ((labels[:, None] >> shifts) & 1).astype(np.uint8)
        return pd.DataFrame(bits, columns=bit_columns)

    return _to_bits(trn_df), _to_bits(val_df), _to_bits(tst_df)


In [73]:
def probability_to_rank(prediction, scaler=1):
    """Convert predicted scores into normalised ranks.

    Each value is replaced by its rank among all predictions (pandas'
    default ranking: ascending, ties share the average rank), divided by
    the number of predictions and multiplied by `scaler`.

    Parameters
    ----------
    prediction : one-dimensional sequence of numbers
        Predicted scores.
    scaler : number, default 1
        Factor applied to the normalised ranks.

    Returns
    -------
    numpy array with one scaled rank per prediction, in input order.
    """
    n_predictions = len(prediction)
    ranks = pd.Series(prediction, name='probability').rank()
    scaled_ranks = ranks / n_predictions * scaler
    return scaled_ranks.values