In [1]:
#79############################################################################
#72#####################################################################
import numpy as np
import pandas as pd
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.utils import shuffle
from datetime import datetime

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [2]:
# Folder that holds the competition CSV files. The trailing slash is part of
# the value because file names are appended to it by plain concatenation.
# NOTE(review): absolute, machine-specific path -- adjust before running on
# another machine.
filepath = ''.join([
    'C:/Users/andre/OneDrive/Desktop/GitHub/Health-AI/',
    'icr-identify-age-related-conditions/',
])

In [3]:
# Read the four CSV files that sit directly under `filepath`, in this order:
# training data, test data, the "greeks" file and the sample submission.
train, test, greeks, example = [
    pd.read_csv(filepath + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'greeks.csv',
                     'sample_submission.csv')
]


def solve_whitespace(df):
    """Strip leading/trailing whitespace from every column label of ``df``.

    The rename is applied in place, so the frame that was passed in is
    modified; that same object is returned so the call can also be used on
    the right-hand side of an assignment.
    """
    stripped_labels = {label: label.strip() for label in df.columns}
    df.rename(columns=stripped_labels, inplace=True)
    return df
# Clean the column labels of both modelling frames.
train, test = (solve_whitespace(frame) for frame in (train, test))

In [4]:
# Exploring missing values

# One row per column of `train`: its dtype and the number of nulls it holds.
train_summary = train.dtypes.to_frame(name='data type')
train_summary['missing'] = train.isnull().sum().to_numpy()
# Descriptive statistics, transposed so that each column of `train` is a row.
desc = train.describe(include='all').T
# Cell output: only the columns that contain at least one missing value.
train_summary.loc[train_summary['missing'] != 0]

Unnamed: 0,data type,missing
BQ,float64,60
CB,float64,2
CC,float64,3
DU,float64,1
EL,float64,60
FC,float64,1
FL,float64,1
FS,float64,2
GL,float64,1


In [5]:
def prep_df(df, reference=None):
    """Encode, re-index and impute one data frame so it is ready for modelling.

    Steps, in order:
      1. Move the categorical column ``EJ`` to position 1 (right after
         ``Id``) and encode it as ``'A' -> 0``, ``'B' -> 1``.
      2. Make ``Id`` the index.
      3. Add the indicator columns ``BQnull`` and ``ELnull`` (1 where ``BQ`` /
         ``EL`` was null, else 0).
      4. Fill the remaining nulls: ``EJ`` with the mode and every other
         column with the mean, both taken from ``reference``.

    Rationale (original author's note): 7 columns have 1, 2, or 3 null
    values. Trying to find insights from those would certainly overfit, so
    they are simply imputed. Because of the tiny dataset only the mean/mode is
    used; fancier methods risk more overfitting. Columns BQ and EL each have
    60 nulls (53 of those overlap). For these columns new columns mark the
    rows that had nulls, and then the missing values are imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Id``, ``EJ``, ``BQ`` and ``EL``.
        It is not modified; a new frame is returned.
    reference : pandas.DataFrame, optional
        Frame the imputation statistics are computed from. When omitted, the
        notebook-level ``train`` frame is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``Id`` with ``EJ`` encoded, the two indicator
        columns appended and the nulls filled.
    """
    ej_codes = {'A': 0, 'B': 1}
    if reference is None:
        # Backward-compatible default: the notebook-level training frame.
        reference = train

    # Move categorical col to beginning of df. Work on a copy so the frame
    # owned by the caller is left untouched.
    cols = list(df.columns)
    cols.remove('EJ')
    cols.insert(1, 'EJ')
    df = df[cols].copy(deep=True)
    df['EJ'] = df['EJ'].replace(ej_codes)
    df = df.set_index('Id')

    # Remember which rows had BQ / EL missing before they get imputed.
    df['BQnull'] = df['BQ'].isnull().astype('int64')
    df['ELnull'] = df['EL'].isnull().astype('int64')

    # Impute values with mean and mode.
    not_imputed = {'Class', 'Id', 'BQnull', 'ELnull'}
    for col in df.columns:
        if col in not_imputed:
            continue
        if col == 'EJ':
            # Encode first so the mode is 0/1 even if `reference` still
            # holds the raw 'A'/'B' labels.
            fill_value = reference[col].replace(ej_codes).mode()[0]
        else:
            fill_value = reference[col].mean()
        # Assign the result back instead of calling fillna(inplace=True) on
        # the selected column: that chained in-place form is deprecated and
        # does not modify `df` when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(fill_value)

    return df
# Order matters: prep_df reads the notebook-level `train` for its imputation
# statistics, so `train` is prepared (and rebound) first and `test` is then
# filled using the already-prepared `train`.
train=prep_df(train)
test=prep_df(test)

In [6]:
def balanced_log_loss(y_true, y_pred):
    """Class-weighted binary log loss.

    Each class contributes the sum of its negative log-likelihoods scaled by
    ``w_c / n_c``, where ``n_c`` is the number of samples of class ``c`` and
    ``w_c = N / n_c``; the total is divided by ``w_0 + w_1``.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0 / 1). Float labels such as ``0.0`` / ``1.0`` are
        accepted and cast to integers.
    y_pred : array-like
        Predicted probability of class 1 for every sample.

    Returns
    -------
    float
        The balanced log loss.

    Raises
    ------
    IndexError
        If no sample of class 1 is present in ``y_true``.
    """
    # np.bincount only accepts integer input, so float labels (for example a
    # float label array handed over by the training library) must be cast.
    y_true = np.asarray(y_true).astype(int)
    # Work in float64: in float32 the upper bound 1 - 1e-15 rounds to 1.0, so
    # the clip would not keep log(1 - p) away from log(0).
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)

    n_samples = y_true.shape[0]
    nc = np.bincount(y_true)
    w0, w1 = 1 / (nc[0] / n_samples), 1 / (nc[1] / n_samples)

    # Summed negative log-likelihood of each class.
    loss0 = -np.sum(np.where(y_true == 0, 1, 0) * np.log(1 - y_pred))
    loss1 = -np.sum(np.where(y_true != 0, 1, 0) * np.log(y_pred))
    return (w0 / nc[0] * loss0 + w1 / nc[1] * loss1) / (w0 + w1)

def xgb_metric(y_pred, y_true):
    """Adapter that returns the balanced log loss as a ``(name, value)`` pair.

    ``y_true`` must expose ``get_label()``; the labels it returns are scored
    against ``y_pred`` with ``balanced_log_loss``.
    """
    labels = y_true.get_label()
    score = balanced_log_loss(labels, y_pred)
    return 'balanced_log_loss', score

In [None]:
foldCount=3

# Shuffle into a temporary name instead of rebinding and deleting `train`:
# the cell can then be re-run on the same kernel, and prep_df (which reads the
# notebook-level `train`) keeps working afterwards.
shuffled_train = shuffle(train, random_state=1)
# Separate into input variables and target variable.
x = shuffled_train.drop(columns='Class').copy(deep=True)
y = shuffled_train['Class'].copy(deep=True)
del shuffled_train

# Contiguous folds over the shuffled rows; the last fold also takes the
# remainder rows when len(x) is not divisible by foldCount.
rowsPerFold=len(x)//foldCount
BonusRowsInLastFold=len(x)%foldCount

resultslist=[]   # one array of test-set class-1 probabilities per fold
oof_results=[]   # out-of-fold class-1 probabilities, in the row order of x

for iteration in range(foldCount):
    print('starting iteration',iteration)

    # Rows [firstRow, rowJustPastFold) are held out; the rest is trained on.
    firstRow=iteration*rowsPerFold
    rowJustPastFold=firstRow+rowsPerFold
    if iteration+1==foldCount:
        rowJustPastFold+=BonusRowsInLastFold
    currentx = pd.concat([x.iloc[:firstRow],x.iloc[rowJustPastFold:]])
    currenty = pd.concat([y.iloc[:firstRow],y.iloc[rowJustPastFold:]])

    # Build Model
    # NOTE(review): `eta` is documented by XGBoost as an alias of
    # `learning_rate`, yet both are passed with different values (0.03 and
    # 0.005). Left unchanged to keep the current model behaviour -- decide
    # which one is intended and drop the other.
    model = XGBClassifier(objective='binary:logistic',
                          eval_metric='logloss',
                          random_state=1,
                          verbosity=1,
                          n_jobs=-1,
                          learning_rate= 0.005,
                          max_depth=4,
                          colsample_bytree= 0.50,
                          subsample= 0.80,
                          eta= 0.03,
                          gamma= 1.5)
    model.fit(currentx, currenty, verbose=1)

    # Score Out of fold (oof).
    # predict_proba returns a 2 dimensional array; only the probabilities of
    # class_1 are kept, hence [:,1].
    oof_results.extend(
        model.predict_proba(x.iloc[firstRow:rowJustPastFold])[:,1])

    # Score Test Dataset
    resultslist.append(model.predict_proba(test)[:,1])


# Average the predictions on the test dataset and store it: one mean per test
# row, taken across the per-fold prediction arrays.
results = [sum(fold_preds)/foldCount for fold_preds in zip(*resultslist)]

In [None]:
# Two-column submission frame on the index of `test`: class_1 is the averaged
# prediction, class_0 its complement.
submission = pd.DataFrame(
    {'class_0': [1 - p for p in results], 'class_1': results},
    index=test.index,
)

# Save the out of fold predictions on the train dataset: attach them to the
# (shuffled) feature frame, keep only that column and restore index order.
x['Class'] = oof_results
x = x['Class'].sort_index()

In [None]:
# Set to True to write the submission and the out-of-fold predictions to CSV.
save_output = False

if save_output:
    # File name prefix: 'v2', a space, then the first 22 characters of the
    # current timestamp with ':' swapped for '.'.
    outputFileName = 'v2' + ' ' + str(datetime.today())[:22].replace(':', '.')
    x.to_csv(outputFileName + ' ooftrain.csv')
    submission.to_csv(outputFileName + '.csv')

# Cell output: show the submission frame.
submission