In [1]:
# Write your code here
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# helper functions

In [15]:
def getageweightgroups():
    """Build the lookup tables that turn age/weight bracket labels into codes.

    Returns:
        dict: ``{'age': {...}, 'weight': {...}}``. The age table maps the ten
        labels ``'[0-10)'`` .. ``'[90-100)'`` to 0..9 and the weight table maps
        the eight labels ``'[0-25)'`` .. ``'[175-200)'`` to 0..7. The codes are
        NumPy integers (they come from ``np.arange``).
    """

    def _bracket_codes(stop, width):
        # Labels look like '[lo-hi)'; a label's code is its position in the
        # ascending sequence of brackets.
        labels = ['[' + str(lo) + '-' + str(lo + width) + ')'
                  for lo in np.arange(0, stop, width)]
        return dict(zip(labels, np.arange(len(labels))))

    return {
        'age': _bracket_codes(100, 10),
        'weight': _bracket_codes(200, 25),
    }

# function to cleanup / replace bad data and change data types of known columns

def standardize(df):
    """Return a cleaned copy of ``df`` with placeholders and known columns normalised.

    The input frame is not modified. The steps, in order:

    1. every ``'?'`` cell becomes NaN;
    2. every object-dtype column is stripped of surrounding whitespace and
       upper-cased;
    3. ``tel_9``, ``tel_10`` and ``tel_11`` cells containing any non-digit
       character become NaN, then the columns are cast to float;
    4. ``weight`` and ``age`` bracket labels are replaced by the codes from
       ``getageweightgroups()``; labels not in the table become missing.

    Args:
        df (pandas.DataFrame): raw data; must contain the ``tel_9``,
            ``tel_10``, ``tel_11``, ``weight`` and ``age`` columns.

    Returns:
        pandas.DataFrame: the standardised copy.

    Raises:
        KeyError: if one of the columns listed above is absent.
    """
    df1 = df.copy()

    # '?' marks missing data. Use np.nan: the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    df1 = df1.replace('?', np.nan)

    # make all char/obj type columns upper and strip spaces if any
    for catcol in df1.select_dtypes(include=['object']).columns:
        df1[catcol] = df1[catcol].str.strip().str.upper()

    # remove values which have chars in tel9 / tel10 / tel11
    # (raw string so that \D is a regex class, not an invalid str escape)
    for col in ['tel_9', 'tel_10', 'tel_11']:
        df1[col] = df1[col].replace(to_replace=r'[\D]', value=np.nan, regex=True)
        df1[col] = df1[col].astype(float)

    # Map bracket labels to their codes; dict.get yields None for any
    # value (including NaN) that is not a known label.
    ageweightgroups = getageweightgroups()
    for col in ['weight', 'age']:
        df1[col] = df1[col].apply(ageweightgroups[col].get)

    return df1

# function to predict a missing value from other records with non-null values
# the variable y_col is predicted based on other records of the columns set x_cols
def fillna_with_predictions(df, x_cols, y_col):
    """Fill the missing values of ``y_col`` with random-forest predictions.

    A ``RandomForestClassifier`` is trained on the rows where ``y_col`` is
    present, using the one-hot encoded ``x_cols`` as features, and is then
    used to predict ``y_col`` for the rows where it is missing.

    Args:
        df (pandas.DataFrame): data containing ``x_cols`` and ``y_col``.
        x_cols (sequence of str): predictor columns (one-hot encoded here).
        y_col (str): column whose missing values are to be filled.

    Returns:
        pandas.Series: ``y_col`` with the originally-missing entries replaced
        by predictions. ``df`` itself is not modified.
    """
    x_cols = list(x_cols)

    # create one hot encoded features for the X columns; y_col is carried
    # along unchanged
    dum = pd.get_dummies(df[x_cols + [y_col]], columns=x_cols)

    # rows which have a non-null value of the target var y_col
    masknotnull = dum[y_col].notnull()
    if masknotnull.all():
        # Nothing to impute. Return early: predicting on zero rows would
        # make scikit-learn raise.
        return dum[y_col]

    features = dum.drop(columns=[y_col])

    # fit a RF classifier on the rows with a known target
    rf = RandomForestClassifier(random_state=1, n_estimators=200)
    rf.fit(features[masknotnull], dum.loc[masknotnull, y_col])

    # predict the rows which are missing y_col and write the predictions
    # back (a 1-D array matches the 1-D selection; no reshape needed)
    dum.loc[~masknotnull, y_col] = rf.predict(features[~masknotnull])

    return dum[y_col]


# function to do a fillna with a specific type of strategy
def fillna_with_values(df, cols, how):
    """Return a copy of ``df`` with missing values in ``cols`` filled.

    Args:
        df (pandas.DataFrame): input data; not modified.
        cols (list of str): columns to fill.
        how (str): fill strategy:
            ``'UNKNOWN'`` - fill with the string ``'UNKNOWN'``;
            ``'MEDIAN'``  - fill each column with its own median;
            ``'MODE'``    - fill each column with its most frequent value.
            Any other value leaves the data unchanged.

    Returns:
        pandas.DataFrame: the filled copy.
    """
    df1 = df.copy()

    # impute with string "UNKNOWN"
    if how == 'UNKNOWN':
        df1[cols] = df1[cols].fillna('UNKNOWN')
    # impute with median of the column
    elif how == 'MEDIAN':
        for c in cols:
            df1[c] = df1[c].fillna(df1[c].median())
    # impute with mode (most frequent value) of the column
    elif how == 'MODE':
        for c in cols:
            modes = df1[c].mode()
            # mode() is empty when the column has no non-null value; there
            # is then nothing to fill with, so leave the column as it is
            # (as MEDIAN does) instead of failing with IndexError.
            if not modes.empty:
                df1[c] = df1[c].fillna(modes.values[0])
    return df1


# function to impute missing data 
# calls the above two functions based on specific case of the column
def impute_data(df):
    """Return a copy of ``df`` with the known missing-data columns imputed.

    ``race`` is filled with the literal ``'UNKNOWN'``; ``tel_9``, ``tel_10``
    and ``tel_11`` are filled with their column medians; ``weight``,
    ``tel_1`` and ``tel_2`` are filled by ``fillna_with_predictions``.

    Args:
        df (pandas.DataFrame): standardised data.

    Returns:
        pandas.DataFrame: the imputed copy.
    """
    imputed = fillna_with_values(df.copy(), ['race'], 'UNKNOWN')
    imputed = fillna_with_values(imputed, ['tel_9', 'tel_10', 'tel_11'], 'MEDIAN')

    # weight is predicted from the patient's race / gender / age
    imputed['weight'] = fillna_with_predictions(
        df=imputed,
        x_cols=['race', 'gender', 'age'],
        y_col='weight',
    )

    # tel_1 and tel_2 are both predicted from the same admission columns;
    # tel_1 is filled first, exactly as the targets are listed
    for target in ('tel_1', 'tel_2'):
        imputed[target] = fillna_with_predictions(
            df=imputed,
            x_cols=['admission_type_id', 'discharge_disposition_id', 'admission_source_id'],
            y_col=target,
        )

    return imputed


# function to drop unwanted columns and change data types after the standardizations are done
def cleanup_cols(df):
    """Return a copy of ``df`` without the id and constant-valued columns.

    Args:
        df (pandas.DataFrame): raw data; not modified.

    Returns:
        pandas.DataFrame: copy of ``df`` with the listed columns removed.

    Raises:
        KeyError: if any of the listed columns is absent from ``df``.
    """
    # id fields first, then the columns noted as holding a single value
    id_columns = ['encounter_id', 'patient_id']
    constant_columns = ['tel_18', 'tel_20', 'tel_23', 'tel_28', 'tel_29',
                        'tel_30', 'tel_41', 'tel_45', 'tel_46', 'tel_47']

    trimmed = df.copy()
    return trimmed.drop(columns=id_columns + constant_columns)

# Custom encoder class that maps new, unseen values to the 'UNKNOWN' category.
# It wraps an sklearn LabelEncoder instance rather than subclassing it.
class MyLabelEncoder:
    """Label encoder that maps values unseen during ``fit`` to ``'UNKNOWN'``.

    Wraps an sklearn ``LabelEncoder`` (composition, not inheritance).
    ``fit`` always registers an extra ``'UNKNOWN'`` class, and ``transform``
    rewrites any value that is not a known class to ``'UNKNOWN'`` before
    delegating to the wrapped encoder.

    Attributes set by ``fit``:
        classes_: the wrapped encoder's ``classes_`` (includes 'UNKNOWN').
    """

    def __init__(self):
        # the wrapped sklearn encoder that does the actual label -> code work
        self.encoder = LabelEncoder()

    def fit(self, values):
        """Fit on ``values`` plus the extra ``'UNKNOWN'`` category.

        Args:
            values: iterable of labels.

        Returns:
            MyLabelEncoder: ``self``, so calls can be chained.
        """
        # 'UNKNOWN' is the category used when an "unseen" value shows up
        self.encoder = self.encoder.fit(list(values) + ['UNKNOWN'])
        self.classes_ = self.encoder.classes_
        return self

    def transform(self, values):
        """Encode ``values``, treating labels unseen at fit time as 'UNKNOWN'.

        Args:
            values: iterable of labels.

        Returns:
            Whatever the wrapped encoder's ``transform`` returns for the
            cleaned labels.
        """
        values = list(values)
        if not values:
            # nothing to rewrite; hand the empty input straight through
            return self.encoder.transform(values)

        # One vectorised membership pass instead of rebuilding the whole
        # list once per unseen value.
        labels = np.asarray(values)
        unseen = ~np.isin(labels, self.classes_)
        # np.where builds a new array wide enough to hold 'UNKNOWN'
        # (in-place assignment into a short fixed-width string array would
        # truncate it).
        return self.encoder.transform(np.where(unseen, 'UNKNOWN', labels))

# Function to build encoder list for columns
# Generate a dictionary of encoders to be used for test set
def generate_encoder(df):
    """Fit one ``MyLabelEncoder`` per object-dtype column of ``df``.

    Args:
        df (pandas.DataFrame): data to fit the encoders on.

    Returns:
        dict: column name -> fitted ``MyLabelEncoder``; only object-dtype
        columns get an entry.
    """
    categorical_cols = df.select_dtypes(include=['object']).columns

    encdict = {}
    for col in categorical_cols:
        encdict[col] = MyLabelEncoder()
        encdict[col].fit(df[col])
    return encdict


# Encode the data using the passed in encoders

def encode_data(df, encoders):
    """Return a copy of ``df`` with every object-dtype column encoded.

    Args:
        df (pandas.DataFrame): data to encode; not modified.
        encoders (dict): column name -> object exposing ``transform``; must
            have an entry for every object-dtype column of ``df``.

    Returns:
        pandas.DataFrame: copy of ``df`` in which each object-dtype column is
        replaced by the output of its encoder's ``transform``.

    Raises:
        KeyError: if an object-dtype column has no entry in ``encoders``.
    """
    encoded = df.copy()

    # compute every replacement column first, then write them into the copy
    replacements = {
        name: encoders[name].transform(encoded[name])
        for name in encoded.select_dtypes(include=['object']).columns
    }
    for name, codes in replacements.items():
        encoded[name] = codes

    return encoded

# function to call all the above function and prepare data for modeling
def clean_data(df):
    """Run the cleaning pipeline: drop columns, standardise, impute.

    After each stage the row count is checked against the input's to make
    sure no rows were lost.

    Args:
        df (pandas.DataFrame): raw data.

    Returns:
        pandas.DataFrame: the cleaned data, same number of rows as ``df``.

    Raises:
        AssertionError: if a stage changes the number of rows.
    """
    expected_rows = len(df)

    # (stage function, message used if the stage changes the row count)
    stages = (
        (cleanup_cols, "data loss observed after cleanupcols"),
        (standardize, "data loss observed after standardization"),
        (impute_data, "data loss observed after impute"),
    )

    df1 = df
    for stage, failure_message in stages:
        df1 = stage(df1)
        assert len(df1) == expected_rows, failure_message

    return df1

In [None]:
def build_train_model(df):
    """Fit encoders on ``df``, encode it, and check no missing data remains.

    Returns nothing.
    NOTE(review): looks unfinished. The same frame is encoded twice (once as
    "train", once as "test") and no model is built or returned. Confirm the
    intended behaviour before relying on this function.

    Args:
        df (pandas.DataFrame): cleaned data.

    Raises:
        AssertionError: if the encoded data still contains nulls.
    """
    frame = df.copy()
    encoders = generate_encoder(frame)

    failure_messages = (
        "missing data still exists in train",
        "missing data still exists in test",
    )
    # both encodings are produced before either check runs
    encoded_frames = [encode_data(frame, encoders) for _ in failure_messages]
    for encoded, failure_message in zip(encoded_frames, failure_messages):
        assert encoded.isnull().sum().sum() == 0, failure_message

In [None]:
def predict(df):
    """Encode ``df`` with the module-level ``encoders`` and check for nulls.

    Returns nothing.
    NOTE(review): looks unfinished. The same frame is encoded twice (once as
    "train", once as "test") and no prediction is made. Confirm the intended
    behaviour before relying on this function.

    Args:
        df (pandas.DataFrame): cleaned data.

    Raises:
        NameError: if the global ``encoders`` has not been defined yet.
        AssertionError: if the encoded data still contains nulls.
    """
    frame = df.copy()

    failure_messages = (
        "missing data still exists in train",
        "missing data still exists in test",
    )
    # both encodings are produced before either check runs
    encoded_frames = [encode_data(frame, encoders) for _ in failure_messages]
    for encoded, failure_message in zip(encoded_frames, failure_messages):
        assert encoded.isnull().sum().sum() == 0, failure_message

In [16]:
#####################################################
# starting main section
#####################################################

# Read the train and test files and run the shared cleaning pipeline.
# Forward slashes keep the relative paths valid on POSIX systems as well as
# on Windows (the previous "\\" separator only worked on Windows).
train = pd.read_csv("patientdata/train.csv", encoding = 'utf-8')
train_clean = clean_data(train)
# 14696

test = pd.read_csv("patientdata/test.csv", encoding = 'utf-8')
test_clean = clean_data(test)
# 7970

print("Number of rows in train set = {} and test set = {} ".format(len(train), len(test)))
print("Number of rows in clean train set = {} and test set = {} ".format(len(train_clean), len(test_clean)))

# Encoders are fitted on the training data only and reused for the test set,
# so values that appear only in test are encoded as 'UNKNOWN'.
encoders = generate_encoder(train_clean)
train_clean = encode_data(train_clean,encoders)
test_clean = encode_data(test_clean,encoders)

assert train_clean.isnull().sum().sum() == 0, "missing data still exists in train"
assert test_clean.isnull().sum().sum() == 0, "missing data still exists in test"

print("Number of rows in encoded train set = {} and test set = {} ".format(len(train_clean), len(test_clean)))

assert len(train_clean.select_dtypes(include = ['object']).columns) == 0, "non-numerical data exists"

# #############################################################################

# Get model with train data
# generate features

# Disabled experiment (presumably down-sampling the positive class):
# pos = train_clean[train_clean['diabetesMed'] == 1].sample(5000)
# neg = train_clean[train_clean['diabetesMed'] == 0]
# train_clean = pd.concat([pos,neg])

# Features are every column except the target 'diabetesMed'.
X = train_clean.loc[:,~train_clean.columns.isin(['diabetesMed'])]
y = train_clean['diabetesMed']

# Split Train Test Sets, ensure stratify to include positive and negative examples
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25, random_state = 10, stratify = y)

# Hyperparameter tuning is done as a separate exercise:
# on this platform we don't have enough memory to run RandomizedSearchCV
# and do a hyperparameter grid search
rfc = RandomForestClassifier(random_state = 1, n_estimators = 500)
rfc.fit(X_train,y_train)

y_test_pred = rfc.predict(X_test)

print("Model test score :{}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix:\n\n")
print(confusion_matrix(y_test,y_test_pred))
print("Classification report:\n\n")
print(classification_report(y_test,y_test_pred))


#############################################################################

# # Predict with test data

# Select the training feature columns by name so the model is given exactly
# the features it was fitted on, in the same order (a missing column fails
# loudly with KeyError instead of being silently misaligned).
X = test_clean[X_train.columns]
pred = rfc.predict(X)
# predict() already returns a 1-D array, which is what a column assignment expects
test['diabetesMed'] = pred

# get cols required for submission
submission = test[['encounter_id', 'diabetesMed']]

print(submission['diabetesMed'].value_counts())
# Submit to csv
submission.to_csv('submission.csv', index = False)

Number of rows in train set = 14696 and test set = 7970 
Number of rows in clean train set = 14696 and test set = 7970 
Number of rows in encoded train set = 14696 and test set = 7970 
Model test score :0.9989112683723462
Confusion Matrix:


[[ 371    1]
 [   3 3299]]
Classification report:


              precision    recall  f1-score   support

           0       0.99      1.00      0.99       372
           1       1.00      1.00      1.00      3302

    accuracy                           1.00      3674
   macro avg       1.00      1.00      1.00      3674
weighted avg       1.00      1.00      1.00      3674

1    4555
0    3415
Name: diabetesMed, dtype: int64
