# CCTS 40500: ML Midterm
### Abdallah Aboelela

In [98]:
import pandas as pd 
import numpy as np
import os
import csv
import random

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder

# Summary
#### Exercise A.
#### Exercise B.
#### Exercise C.
#### Exercise D.
#### Exercise E.
#### Exercise F.

# Questions A/B
#### Predict POS for Males and Females using infectious disease history

Here, we will attempt to predict POS using four models (Decision Tree, Random Forest, Naive Bayes, and Linear Regression, and based on the AUC and ROC curve create a model that randomly chooses between them.

In [None]:
# Contains cols sex, diagnosis, state, county, functions in appendix
infectious, le = encode(read_file('data/Infectious'))

In [None]:
#  Using label encoder values printed above to filter by gender
infectious_m = infectious[infectious.sex == le.index('M')]
infectious_f = infectious[infectious.sex == le.index('F')]

mX_train, mX_test, my_train, my_test = train_test_split(infectious_m.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        infectious_m.diagnosis, random_state = 42)

fX_train, fX_test, fy_train, fy_test = train_test_split(infectious_f.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        infectious_f.diagnosis, random_state = 42)

In [10]:
nb = GaussianNB()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators = 100)
reg = LinearRegression()

classifiers = [nb, dt, rf, reg]
names = ['NB', 'DT', 'RF', 'LinReg']

cols = ['clf','M', 'F']
rows = []

for i, clf in enumerate(classifiers):
    clf.fit(mX_train, my_train)
    y_pred = clf.predict(mX_test)
    m_auc = roc_auc_score(my_test, y_pred)
    
    clf.fit(fX_train, fy_train)
    y_pred = clf.predict(fX_test)
    f_auc = roc_auc_score(fy_test, y_pred)
    
    rows.append([names[i], m_auc, f_auc])

infectious_clfs = pd.DataFrame(rows, columns = cols).set_index('clf')

In [11]:
infectious_clfs

Unnamed: 0_level_0,M,F
clf,Unnamed: 1_level_1,Unnamed: 2_level_1
NB,0.582018,0.592048
DT,0.610466,0.675998
RF,0.581633,0.696429
LinReg,0.680163,0.71329


Across the board, we can see that it is easier to predict autism diagnosis for women than it is for men, and that a linear regression is particularly good, and crosses the 65% AUC boundry in both cases. Decision Tree and Random Forest also work well for women.

# Questions C/D/E
#### Predict POS for Males/Females using any/all combination of diseases using weeks up to 100, 150, and 200 and identify the most predictive diseases

Strategy: Run the four predictive methods separately by gender, disease, and number of weeks considered, to identify which are best. Then, we will combine the most useful diseases into one merged df on c_id, and create a combined classifier that chooses between the best classifiers.

There are two outcomes of the below code:
1. A dataframe with information on the AUC of each of four models applied to each disease, each number of weeks considered, and each gender  
2. A dictionary with the true and predicted values for each of the above

In [12]:
weeks = [100, 150, 200]
cols = ['disease', 'clf', 'weeks', 'm_auc', 'f_auc']
names = ['NB', 'DT', 'RF', 'LinReg']
rows = []

# This takes a few minutes to run
# Try except blocks for when the code fails due to y_pred or y_true only have one value due to the train/test split
for fname in os.listdir('data'):
    # Skipping Hepatic, Urinary because of very few POS values result in code failing/poor predictions
    if fname not in ['Positive', 'Negative'] and '.' not in fname and '_' not in fname:
        disease, le = read_file('data/' + fname)
        disease_m = disease[disease.sex == le.index('M')]
        disease_f = disease[disease.sex == le.index('F')]
        
        mX_train, mX_test, my_train, my_test = train_test_split(disease_m.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        disease_m.diagnosis, random_state = 42)

        fX_train, fX_test, fy_train, fy_test = train_test_split(disease_f.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        disease_f.diagnosis, random_state = 42)
        
        for i, clf in enumerate(classifiers):
            for num_weeks in weeks:             
                clf.fit(mX_train.iloc[:, 1:num_weeks + 2], my_train)
                y_pred = clf.predict(mX_test.iloc[:, 1:num_weeks + 2])
                
                try:
                    m_auc = roc_auc_score(my_test, y_pred)
                except Exception as e:
                    m_auc = None

                clf.fit(fX_train.iloc[:, 1:num_weeks + 2], fy_train)
                y_pred = clf.predict(fX_test.iloc[:, 1:num_weeks + 2])
                
                try:
                    f_auc = roc_auc_score(fy_test, y_pred)
                except Exception as e:
                    f_auc = None
                                    
                new_row = [fname, names[i], num_weeks, m_auc, f_auc]
                print(new_row) # Hidden in submission - this is just to keep track as it runs
                rows.append(new_row)

['Musculoskeletal', 'NB', 100, 0.5944175760079714, 0.5140382317801673]
['Musculoskeletal', 'NB', 150, 0.5991080759229105, 0.5385304659498208]
['Musculoskeletal', 'NB', 200, 0.6205677206114663, 0.48805256869772995]
['Musculoskeletal', 'DT', 100, 0.5164289012564707, 0.5830346475507765]
['Musculoskeletal', 'DT', 150, 0.5573917904099933, 0.5818399044205496]
['Musculoskeletal', 'DT', 200, 0.5370500887063455, 0.5818399044205496]
['Musculoskeletal', 'RF', 100, 0.5217391304347826, 0.5833333333333334]
['Musculoskeletal', 'RF', 150, 0.5217391304347826, 0.5833333333333334]
['Musculoskeletal', 'RF', 200, 0.5217391304347826, 0.5833333333333334]
['Musculoskeletal', 'LinReg', 100, 0.6311517243055387, 0.6814516129032258]
['Musculoskeletal', 'LinReg', 150, 0.5606484069312466, 0.5457487056949423]
['Musculoskeletal', 'LinReg', 200, 0.643072398959827, 0.5632218239745121]
['Development', 'NB', 100, 0.49324549447220967, 0.5809165771571786]
['Development', 'NB', 150, 0.47946388005452073, 0.6215538847117794]


['Cardiovascular', 'NB', 100, 0.5442488262910798, 0.4779796311588219]
['Cardiovascular', 'NB', 150, 0.44624413145539904, 0.6237269474263694]
['Cardiovascular', 'NB', 200, 0.5715962441314554, 0.6295072942471787]
['Cardiovascular', 'DT', 100, 0.5561032863849765, 0.49614643545279385]
['Cardiovascular', 'DT', 150, 0.5192488262910798, 0.49710982658959535]
['Cardiovascular', 'DT', 200, 0.5832746478873239, 0.49710982658959535]
['Cardiovascular', 'RF', 100, 0.5333333333333333, 0.5]
['Cardiovascular', 'RF', 150, 0.5315727699530517, 0.5]
['Cardiovascular', 'RF', 200, 0.5666666666666667, 0.5]
['Cardiovascular', 'LinReg', 100, 0.5021126760563381, 0.535645472061657]
['Cardiovascular', 'LinReg', 150, 0.45469483568075125, 0.5298651252408477]
['Cardiovascular', 'LinReg', 200, 0.49530516431924887, 0.3170933113129645]
['Infectious', 'NB', 100, 0.4740597702809611, 0.551306678049706]
['Infectious', 'NB', 150, 0.5185912228813031, 0.5858708025042686]
['Infectious', 'NB', 200, 0.5227183200769451, 0.583878770

In [50]:
disease_week_gender_clfs = pd.DataFrame(rows, columns = cols)
combos_m = disease_week_gender_clfs.drop('f_auc', axis = 1)
combos_f = disease_week_gender_clfs.drop('m_auc', axis = 1)

#### To identify the most predictive diseases, weeks considered, and to eventually decide what is the best combination of diseases to use, we look at the above dataframe and groupby various columns.

In [40]:
print(combos_m.groupby('weeks').mean().sort_values('m_auc'))
print(combos_m.groupby('weeks').max().sort_values('m_auc'))
print(combos_f.groupby('weeks').mean().sort_values('f_auc'))
print(combos_f.groupby('weeks').max().sort_values('f_auc'))

         m_auc
weeks         
150    0.54421
100    0.54566
200    0.56696
       disease clf     m_auc
weeks                       
100    Urinary  RF  0.780488
150    Urinary  RF  0.780488
200    Urinary  RF  0.926829
          f_auc
weeks          
100    0.566018
200    0.581936
150    0.590454
       disease clf     f_auc
weeks                       
200    Urinary  RF  0.780878
100    Urinary  RF  0.872210
150    Urinary  RF  0.889509


#### When grouping by weeks and gender, it appears that when considering male disease history, it is best to consider all 200 weeks - in terms of both mean and maximum AUC). For female disease history, it appears that it is better to use only the first 150 weeks.

#### Now we do the same but consider each disease separately, after dropping values that don't consider all 200 weeks for men or 150 weeks for women. This leaves us with 76 rows for each gender.

In [57]:
combos_m = combos_m[combos_m.weeks == 200]
combos_f = combos_f[combos_f.weeks == 150]

In [58]:
print(combos_m.groupby('disease').mean().sort_values('m_auc', ascending = False))
print(combos_m.groupby('disease').max().sort_values('m_auc', ascending = False))

              weeks     m_auc
disease                      
Neoplastic      200  0.717885
Reproductive    200  0.622051
Respiratory     200  0.616608
Hepatic         200  0.579268
             clf  weeks     m_auc
disease                          
Hepatic       RF    200  0.926829
Neoplastic    RF    200  0.750000
Respiratory   RF    200  0.732294
Reproductive  RF    200  0.656232


#### For men: Interestingly, Neoplastic, Reproductive, and Respiratory disease history appear in the top 5 for both mean and maximum AUCs when grouped by disease history. We will also consider Hepatic because of the very high max AUC. As such, we distill the df one more time to reduce the number of combinations considered manually. We do the same for women afterwards.

In [62]:
best_m_diseases = ['Neoplastic', 'Reproductive', 'Respiratory', 'Hepatic']
combos_m = combos_m[combos_m.disease.isin(best_m_diseases)]

In [63]:
print(combos_f.groupby('disease').mean().sort_values('f_auc', ascending = False))
print(combos_f.groupby('disease').max().sort_values('f_auc', ascending = False))

              weeks     f_auc
disease                      
Reproductive    150  0.792039
Otic            150  0.664080
Development     150  0.630385
             clf  weeks     f_auc
disease                          
Reproductive  RF    150  0.889509
Otic          RF    150  0.733352
Development   RF    150  0.683136


#### For women, Reproductive, OTIC, and Development diseases seem to work well. We will consider these in our final models.

In [64]:
best_f_diseases = ['Reproductive', 'Otic', 'Development']
combos_f = combos_f[combos_f.disease.isin(best_f_diseases)]

#### For both men, and women, we drop the algorithm with the lowest mean AUC for the disease histories we are considering

In [68]:
print(combos_m.groupby('clf').mean().sort_values('m_auc'))
print(combos_f.groupby('clf').mean().sort_values('f_auc'))

        weeks     m_auc
clf                    
NB        200  0.564743
RF        200  0.605811
DT        200  0.623355
LinReg    200  0.741904
        weeks     f_auc
clf                    
NB        150  0.664493
DT        150  0.673100
RF        150  0.675747
LinReg    150  0.768666


#### In both cases, Naive Bayes does worse than the others when considering the AUCs below. Now, we create models for each of the genders.

In [138]:
final_f_combined = combine(best_f_diseases, 'F')
print('done with female')
final_m_combined = combine(best_m_diseases, 'M')

### THIS IS TAKING TOO LONG, SWITCH TO ONLY LOOKING AT FIRST X WEEKS.

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




ValueError: Shape of passed values is (453, 22882), indices imply (453, 21942)

# Appendix

In [87]:
'''
Takes a disease file, and reads into a pandas dataframe.
Creates from values: gender, diagnosis, state, and county
'''
def read_file(fname):
    with open(fname) as f:
        content = f.readlines()
    
    content = [x.strip().split(' ') for x in content]

    max_cols = 0
    for i, line in enumerate(content):
        max_cols = max(max_cols, len(line))

    name = fname[5:]

    columns = ['p_id', 'c_id'] + [name + str(i) for i in range(max_cols - 2)]

    df = pd.read_csv(fname, names = columns, engine = 'python', 
        delim_whitespace = True, index_col = 'c_id')

    df['sex'] = df.p_id.apply(lambda x: x[0])
    df['diagnosis'] = df.p_id.apply(lambda x: x[1:4])
    df['state'] = df.p_id.apply(lambda x: x[4:6])
    df['county'] = df.p_id.apply(lambda x: x[6:9])
    
    return df

In [None]:
def encode(df):
    df = df.replace(np.nan, -1)

    for col in df.columns[1:]:
        le = LabelEncoder()
        le = le.fit(df[col])
        
        df[col] = le.transform(df[col])
        
        if col == 'sex':
            final_le = list(le.classes_)
            
    return df, final_le

In [99]:
def run_on_combined(combined):
    dt = DecisionTreeClassifier()
    rf = RandomForestClassifier(n_estimators = 100)
    rg = LinearRegression()
    
    X_train, X_test, y_train, y_test = train_test_split(combined.drop('diagnosis', axis = 1), 
                                                        combined.diagnosis, random_state = 42)
    
    dt.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    rg.fit(X_train, y_train)
    
    dt_pred = dt.predict(X_test)
    rf_pred = rf.predict(X_test)
    rg_pred = rg.predict(X_test)
    
    preds = pd.concat([dt_pred, rf_pred, rg_pred], axis = 1)
    
    y_pred = preds.apply(lambda row: row[random.uniform(0, 3) // 1])
    
    return roc_auc_score(y_test, y_pred)

In [137]:
def combine(list_of_diseases, gender, new_features = False):
    dfs = [] 
    
    colnum = 152 if gender == 'F' else 202
    
    for fname in list_of_diseases:
        with open('data/' + fname) as f:
            content = f.readlines()
            
        content = [x.strip().split(' ')[:colnum] for x in content]
        cols = ['p_id', 'c_id'] + [fname + str(i) for i in range(colnum - 2)]
        
        df = pd.DataFrame(content, columns = cols)
        df = df.set_index('c_id')
        dfs.append(df)
        
    combined = pd.concat(dfs, axis = 1, join = 'outer')
    
    combined['sex'] = df.p_id.apply(lambda x: x[0])
    combined['diagnosis'] = df.p_id.apply(lambda x: x[1:4])
    combined['state'] = df.p_id.apply(lambda x: x[4:6])
    combined['county'] = df.p_id.apply(lambda x: x[6:9])
    
    combined, le = encode(combined)
        
    return combined[combined.sex == le.index(gender)]

In [None]:
def temporary():
    end_cols = 0
    if new_features:
        # Using sum overweights other diseases but generally proportional
        df['total'] = df.iloc[:, 1:df.shape[1]-4].sum(axis = 1)
        df['6mos'] = df.iloc[:, 1:28].sum(axis = 1)
        df['1y'] = df.iloc[:, 1:53].sum(axis = 1)
        df['2y'] = df.iloc[:, 1:105].sum(axis = 1)
        
        end_cols = 4

In [129]:
s1 = pd.Series(['a', 'b'])
s2 = pd.Series(['c', 'd'])