# CCTS 40500: ML Midterm
### Abdallah Aboelela

In [None]:
import pandas as pd 
import numpy as np
import os
import csv

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Summary
#### Exercise A.
#### Exercise B.
#### Exercise C.
#### Exercise D.
#### Exercise E.
#### Exercise F.

# Questions A/B
#### Predict POS for Males and Females using infectious disease history

Here, we will attempt to predict POS using four models (Decision Tree, Random Forest, Naive Bayes, and Linear Regression, and based on the AUC and ROC curve create a model that randomly chooses between them.

In [7]:
# Contains cols sex, diagnosis, state, county, proportion of time, function in appendix
infectious, le = read_file('data/Infectious')

In [8]:
#  Using label encoder values printed above to filter by gender
infectious_m = infectious[infectious.sex == le.index('M')]
infectious_f = infectious[infectious.sex == le.index('F')]

mX_train, mX_test, my_train, my_test = train_test_split(infectious_m.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        infectious_m.diagnosis, random_state = 42)

fX_train, fX_test, fy_train, fy_test = train_test_split(infectious_f.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        infectious_f.diagnosis, random_state = 42)

In [9]:
nb = GaussianNB()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators = 100)
reg = LinearRegression()

classifiers = [nb, dt, rf, reg]
names = ['NB', 'DT', 'RF', 'LinReg']

cols = ['clf','M', 'F']
rows = []

for i, clf in enumerate(classifiers):
    clf.fit(mX_train, my_train)
    y_pred = clf.predict(mX_test)
    m_auc = roc_auc_score(my_test, y_pred)
    
    clf.fit(fX_train, fy_train)
    y_pred = clf.predict(fX_test)
    f_auc = roc_auc_score(fy_test, y_pred)
    
    rows.append([names[i], m_auc, f_auc])

infectious_clfs = pd.DataFrame(rows, columns = cols).set_index('clf')

In [10]:
infectious_clfs

Unnamed: 0_level_0,M,F
clf,Unnamed: 1_level_1,Unnamed: 2_level_1
NB,0.582018,0.592048
DT,0.601548,0.675832
RF,0.602041,0.696429
LinReg,0.680163,0.71329


Across the board, we can see that it is easier to predict autism diagnosis for women than it is for men, and that a linear regression is particularly good, and crosses the 65% AUC boundry in both cases. Decision Tree and Random Forest also work well for women.

# Questions C/D/E
#### Predict POS for Males/Females using any/all combination of diseases using weeks up to 100, 150, and 200 and identify the most predictive diseases

Strategy: Run the four predictive methods separately by gender, disease, and number of weeks considered, to identify which are best. Then, we will combine the most useful diseases into one merged df on c_id, and create a combined classifier that chooses between the best classifiers.

There are two outcomes of the below code:
1. A dataframe with information on the AUC of each of four models applied to each disease, each number of weeks considered, and each gender  
2. A dictionary with the true and predicted values for each of the above

In [42]:
weeks = [100, 150, 200]
cols = ['disease', 'clf', 'weeks', 'm_auc', 'f_auc']
names = ['NB', 'DT', 'RF', 'LinReg']
rows = []
disease_preds = {} # For checking which combination of models/diseases would be best later on with ROC curves

# This takes a few minutes to run
# Try except blocks for when the code fails due to y_pred or y_true only have one value due to the train/test split
for fname in os.listdir('data'):
    # Skipping Hepatic, Urinary because of very few POS values result in code failing/poor predictions
    if fname not in ['Positive', 'Negative'] and '.' not in fname and '_' not in fname:
        disease, le = read_file('data/' + fname)
        disease_m = disease[disease.sex == le.index('M')]
        disease_f = disease[disease.sex == le.index('F')]
        
        mX_train, mX_test, my_train, my_test = train_test_split(disease_m.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        disease_m.diagnosis, random_state = 42)

        fX_train, fX_test, fy_train, fy_test = train_test_split(disease_f.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        disease_f.diagnosis, random_state = 42)
        clf_preds = {}
        for i, clf in enumerate(classifiers):
            week_preds = {}
            for num_weeks in weeks:
                gender_preds = {}
                pred_and_true = {}
                
                clf.fit(mX_train.iloc[:, 1:num_weeks + 2], my_train)
                y_pred = clf.predict(mX_test.iloc[:, 1:num_weeks + 2])
                gender_preds['M_pred'] = y_pred
                gender_preds['M_true'] = my_test
                
                try:
                    m_auc = roc_auc_score(my_test, y_pred)
                except Exception as e:
                    m_auc = None

                clf.fit(fX_train.iloc[:, 1:num_weeks + 2], fy_train)
                y_pred = clf.predict(fX_test.iloc[:, 1:num_weeks + 2])
                gender_preds['F_pred'] = y_pred
                gender_preds['F_true'] = fy_test
                
                try:
                    f_auc = roc_auc_score(fy_test, y_pred)
                except Exception as e:
                    f_auc = None
                
                week_preds[num_weeks] = gender_preds
                    
                new_row = [fname, names[i], num_weeks, m_auc, f_auc]
                print(new_row) # Hidden in submission - this is just to keep track as it runs
                rows.append(new_row)
            clf_preds[names[i]] = week_preds
    disease_preds[fname] = clf_preds

['Musculoskeletal', 'NB', 100, 0.5944175760079714, 0.5140382317801673]
['Musculoskeletal', 'NB', 150, 0.5991080759229105, 0.5385304659498208]
['Musculoskeletal', 'NB', 200, 0.6205677206114663, 0.48805256869772995]
['Musculoskeletal', 'DT', 100, 0.5139135295404281, 0.5827359617682198]
['Musculoskeletal', 'DT', 150, 0.5587892191411281, 0.5821385902031064]
['Musculoskeletal', 'DT', 200, 0.5339757454978491, 0.5824372759856631]
['Musculoskeletal', 'RF', 100, 0.5217391304347826, 0.5833333333333334]
['Musculoskeletal', 'RF', 150, 0.5217391304347826, 0.5833333333333334]
['Musculoskeletal', 'RF', 200, 0.5217391304347826, 0.5833333333333334]
['Musculoskeletal', 'LinReg', 100, 0.6311517243055387, 0.6814516129032258]
['Musculoskeletal', 'LinReg', 150, 0.5606484069312466, 0.5457487056949423]
['Musculoskeletal', 'LinReg', 200, 0.643072398959827, 0.5632218239745121]
['Development', 'NB', 100, 0.49324549447220967, 0.5809165771571786]
['Development', 'NB', 150, 0.47946388005452073, 0.6215538847117794]


['Cardiovascular', 'NB', 100, 0.5442488262910798, 0.4779796311588219]
['Cardiovascular', 'NB', 150, 0.44624413145539904, 0.6237269474263694]
['Cardiovascular', 'NB', 200, 0.5715962441314554, 0.6295072942471787]
['Cardiovascular', 'DT', 100, 0.5578638497652582, 0.49614643545279385]
['Cardiovascular', 'DT', 150, 0.5157276995305164, 0.49710982658959535]
['Cardiovascular', 'DT', 200, 0.6481807511737089, 0.49614643545279385]
['Cardiovascular', 'RF', 100, 0.5333333333333333, 0.5]
['Cardiovascular', 'RF', 150, 0.5315727699530517, 0.5]
['Cardiovascular', 'RF', 200, 0.5666666666666667, 0.5]
['Cardiovascular', 'LinReg', 100, 0.5021126760563381, 0.535645472061657]
['Cardiovascular', 'LinReg', 150, 0.45469483568075125, 0.5298651252408477]
['Cardiovascular', 'LinReg', 200, 0.49530516431924887, 0.3170933113129645]
['Infectious', 'NB', 100, 0.4740597702809611, 0.551306678049706]
['Infectious', 'NB', 150, 0.5185912228813031, 0.5858708025042686]
['Infectious', 'NB', 200, 0.5227183200769451, 0.583878770

In [43]:
disease_week_gender_clfs = pd.DataFrame(rows, columns = cols)
disease_week_gender_clfs

Unnamed: 0,disease,clf,weeks,m_auc,f_auc
0,Musculoskeletal,NB,100,0.594418,0.514038
1,Musculoskeletal,NB,150,0.599108,0.538530
2,Musculoskeletal,NB,200,0.620568,0.488053
3,Musculoskeletal,DT,100,0.513914,0.582736
4,Musculoskeletal,DT,150,0.558789,0.582139
5,Musculoskeletal,DT,200,0.533976,0.582437
6,Musculoskeletal,RF,100,0.521739,0.583333
7,Musculoskeletal,RF,150,0.521739,0.583333
8,Musculoskeletal,RF,200,0.521739,0.583333
9,Musculoskeletal,LinReg,100,0.631152,0.681452


# Appendix

In [6]:
'''
Takes a disease file, and reads into a pandas dataframe.
Creates from values: gender, diagnosis, state, and county
'''
def read_file(fname, new_features = False):
    with open(fname) as f:
        content = f.readlines()
    
    content = [x.strip().split(' ') for x in content]

    max_cols = 0
    for i, line in enumerate(content):
        max_cols = max(max_cols, len(line))

    name = fname[5:]

    columns = ['p_id', 'c_id'] + [name + str(i) for i in range(max_cols - 2)]

    df = pd.read_csv(fname, names = columns, engine = 'python', 
        delim_whitespace = True, index_col = 'c_id')

    df['sex'] = df.p_id.apply(lambda x: x[0])
    df['diagnosis'] = df.p_id.apply(lambda x: x[1:4])
    df['state'] = df.p_id.apply(lambda x: x[4:6])
    df['county'] = df.p_id.apply(lambda x: x[6:9])
    
    end_cols = 0
    if new_features:
        # Using sum overweights other diseases but generally proportional
        df['total'] = df.iloc[:, 1:df.shape[1]-4].sum(axis = 1)
        df['6mos'] = df.iloc[:, 1:28].sum(axis = 1)
        df['1y'] = df.iloc[:, 1:53].sum(axis = 1)
        df['2y'] = df.iloc[:, 1:105].sum(axis = 1)
        
        end_cols = 4
    
    df = df.replace(np.nan, -1)

    for col in df.columns[1:df.shape[1]-end_cols]:
        le = LabelEncoder()
        le = le.fit(df[col])
        
        df[col] = le.transform(df[col])
        
        if col == 'sex':
            final_le = list(le.classes_)
        
    return df, final_le