# CCTS 40500: ML Midterm
### Abdallah Aboelela

In [184]:
import pandas as pd 
import numpy as np
import os
import csv
import random
from functools import reduce

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder

# Questions A/B
#### Predict POS for Males and Females using infectious disease history

Here, we will attempt to predict POS using four models (Decision Tree, Random Forest, Naive Bayes, and Linear Regression, and based on the AUC and ROC curve create a model that randomly chooses between them.

In [185]:
# Contains cols sex, diagnosis, state, county, functions in appendix
infectious, le = encode(read_file('data/Infectious'))

KeyboardInterrupt: 

In [None]:
#  Using label encoder values printed above to filter by gender
infectious_m = infectious[infectious.sex == le.index('M')]
infectious_f = infectious[infectious.sex == le.index('F')]

mX_train, mX_test, my_train, my_test = train_test_split(infectious_m.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        infectious_m.diagnosis, random_state = 42)

fX_train, fX_test, fy_train, fy_test = train_test_split(infectious_f.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        infectious_f.diagnosis, random_state = 42)

In [None]:
nb = GaussianNB()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators = 100)
reg = LinearRegression()

classifiers = [nb, dt, rf, reg]
names = ['NB', 'DT', 'RF', 'LinReg']

cols = ['clf','M', 'F']
rows = []

for i, clf in enumerate(classifiers):
    clf.fit(mX_train, my_train)
    y_pred = clf.predict(mX_test)
    m_auc = roc_auc_score(my_test, y_pred)
    
    clf.fit(fX_train, fy_train)
    y_pred = clf.predict(fX_test)
    f_auc = roc_auc_score(fy_test, y_pred)
    
    rows.append([names[i], m_auc, f_auc])

infectious_clfs = pd.DataFrame(rows, columns = cols).set_index('clf')

In [None]:
infectious_clfs

Across the board, we can see that it is easier to predict autism diagnosis for women than it is for men, and that a linear regression is particularly good, and crosses the 65% AUC boundry in both cases. Decision Tree and Random Forest also work well for women.

# Questions C/D/E
#### Predict POS for Males/Females using any/all combination of diseases using weeks up to 100, 150, and 200 and identify the most predictive diseases

Strategy: Run the four predictive methods separately by gender, disease, and number of weeks considered, to identify which are best. Then, we will combine the most useful diseases into one merged df on c_id, and create a combined classifier that chooses between the best classifiers.

In [None]:
weeks = [100, 150, 200]
cols = ['disease', 'clf', 'weeks', 'm_auc', 'f_auc']
names = ['NB', 'DT', 'RF', 'LinReg']
rows = []

# This takes a few minutes to run
# Try except blocks for when the code fails due to y_pred or y_true only have one value due to the train/test split
for fname in os.listdir('data'):
    if fname not in ['Positive', 'Negative'] and '.' not in fname and '_' not in fname:
        disease, le = encode(read_file('data/' + fname))
        disease_m = disease[disease.sex == le.index('M')]
        disease_f = disease[disease.sex == le.index('F')]
        
        mX_train, mX_test, my_train, my_test = train_test_split(disease_m.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        disease_m.diagnosis, random_state = 42)

        fX_train, fX_test, fy_train, fy_test = train_test_split(disease_f.iloc[:, 1:].drop('diagnosis', axis = 1), 
                                                        disease_f.diagnosis, random_state = 42)
        
        for i, clf in enumerate(classifiers):
            for num_weeks in weeks:             
                clf.fit(mX_train.iloc[:, 1:num_weeks + 2], my_train)
                y_pred = clf.predict(mX_test.iloc[:, 1:num_weeks + 2])
                
                try:
                    m_auc = roc_auc_score(my_test, y_pred)
                except Exception as e:
                    m_auc = None

                clf.fit(fX_train.iloc[:, 1:num_weeks + 2], fy_train)
                y_pred = clf.predict(fX_test.iloc[:, 1:num_weeks + 2])
                
                try:
                    f_auc = roc_auc_score(fy_test, y_pred)
                except Exception as e:
                    f_auc = None
                                    
                new_row = [fname, names[i], num_weeks, m_auc, f_auc]
                print(new_row) # Hidden in submission - this is just to keep track as it runs
                rows.append(new_row)

In [None]:
disease_week_gender_clfs = pd.DataFrame(rows, columns = cols)
combos_m = disease_week_gender_clfs.drop('f_auc', axis = 1)
combos_f = disease_week_gender_clfs.drop('m_auc', axis = 1)

#### To identify the most predictive diseases, weeks considered, and to eventually decide what is the best combination of diseases to use, we look at the above dataframe and groupby various columns.

In [None]:
print(combos_m.groupby('weeks').mean().sort_values('m_auc'))
print(combos_m.groupby('weeks').max().sort_values('m_auc'))
print(combos_f.groupby('weeks').mean().sort_values('f_auc'))
print(combos_f.groupby('weeks').max().sort_values('f_auc'))

#### When grouping by weeks and gender, it appears that when considering male disease history, it is best to consider all 200 weeks - in terms of both mean and maximum AUC). For female disease history, it appears that it is better to use only the first 150 weeks.

#### Now we do the same but consider each disease separately, after dropping values that don't consider all 200 weeks for men or 150 weeks for women. This leaves us with 76 rows for each gender.

In [None]:
combos_m = combos_m[combos_m.weeks == 200]
combos_f = combos_f[combos_f.weeks == 150]

In [None]:
print(combos_m.groupby('disease').mean().sort_values('m_auc', ascending = False))
print(combos_m.groupby('disease').max().sort_values('m_auc', ascending = False))

#### For men: Interestingly, Neoplastic, Reproductive, and Respiratory disease history appear in the top 5 for both mean and maximum AUCs when grouped by disease history. We will also consider Hepatic because of the very high max AUC. As such, we distill the df one more time to reduce the number of combinations considered manually. We do the same for women afterwards.

In [None]:
best_m_diseases = ['Neoplastic', 'Reproductive', 'Respiratory', 'Hepatic']
combos_m = combos_m[combos_m.disease.isin(best_m_diseases)]

In [None]:
print(combos_f.groupby('disease').mean().sort_values('f_auc', ascending = False))
print(combos_f.groupby('disease').max().sort_values('f_auc', ascending = False))

#### For women, Reproductive, OTIC, and Development diseases seem to work well. We will consider these in our final models.

In [None]:
best_f_diseases = ['Reproductive', 'Otic', 'Development']
combos_f = combos_f[combos_f.disease.isin(best_f_diseases)]

#### For both men, and women, we drop the algorithm with the lowest mean AUC for the disease histories we are considering

In [None]:
print(combos_m.groupby('clf').mean().sort_values('m_auc'))
print(combos_f.groupby('clf').mean().sort_values('f_auc'))

#### In both cases, Naive Bayes does worse than the others when considering the AUCs below. Now, we create models for each of the genders.

In [None]:
print('Men: ', run_on_combined(combine(best_m_diseases, 'M')))
print('Women: ', run_on_combined(combine(best_f_diseases, 'F')))

#### Unfortunately, while these diseases worked fine on their own, they don't seem to be working very well when combined together (esp for Men). Through separate testing, I found that the below diseases seem to work better than expected. For men, using Neoplastic alone ended up doing a lot better than any combination of diseases that I tried 

In [None]:
# Men
run_on_combined(combine(['Neoplastic'], 'M'))

In [None]:
# Women
run_on_combined(combine(['PNS', 'Urinary', 'Endocrine'], 'F'))

In [None]:
# Both using all four diseases + Reproductive and 150 weeks
run_on_combined(combine(['PNS', 'Urinary', 'Endocrine', 'Neoplastic', 'Reproductive']))
# Not very helpful

# Question F
#### We run the same functions as above with new features set as 'True' (by default it is False). I chose the features sickness as a proportion of total time, 6 mos, 1 year, and two years - as well as a sum of the total time sick.

In [None]:
print('Men: ', run_on_combined(combine(['Neoplastic'], gender = 'M', new_features = True)))
print('Women: ', run_on_combined(combine(['PNS', 'Urinary', 'Endocrine'], gender = 'F', new_features = True)))

In [None]:
run_on_combined(combine(['Neoplastic', 'PNS', 'Urinary', 'Endocrine', 'Reproductive'], new_features = True))

# Appendix

In [None]:
'''
Takes a disease file, and reads into a pandas dataframe.
Creates from values: gender, diagnosis, state, and county
'''
def read_file(fname):
    with open(fname) as f:
        content = f.readlines()
    
    content = [x.strip().split(' ') for x in content]

    max_cols = 0
    for i, line in enumerate(content):
        max_cols = max(max_cols, len(line))

    name = fname[5:]

    columns = ['p_id', 'c_id'] + [name + str(i) for i in range(max_cols - 2)]

    df = pd.read_csv(fname, names = columns, engine = 'python', 
        delim_whitespace = True, index_col = 'c_id')

    df['sex'] = df.p_id.apply(lambda x: x[0])
    df['diagnosis'] = df.p_id.apply(lambda x: x[1:4])
    df['state'] = df.p_id.apply(lambda x: x[4:6])
    df['county'] = df.p_id.apply(lambda x: x[6:9])
    
    return df

In [None]:
def encode(df):
    df = df.replace(np.nan, -1)
    df = df.fillna(value = -1)

    for col in df.columns[1:]:
        le = LabelEncoder()
        le = le.fit(df[col].astype(str))
        
        df[col] = le.transform(df[col].astype(str))
        
        if col == 'sex':
            final_le = list(le.classes_)
            
    return df, final_le

In [None]:
def run_on_combined(combined):
    dt = DecisionTreeClassifier()
    rf = RandomForestClassifier(n_estimators = 100)
    rg = LinearRegression()
    
    X_train, X_test, y_train, y_test = train_test_split(combined.drop(['diagnosis', 'p_id'], axis = 1), 
                                                        combined.diagnosis, random_state = 42)
        
    dt.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    rg.fit(X_train, y_train)
    
    dt_pred = pd.DataFrame({'dt' : dt.predict(X_test)})
    rf_pred = pd.DataFrame({'rf' : dt.predict(X_test)})
    rg_pred = pd.DataFrame({'linreg' : dt.predict(X_test)})
    
    preds = pd.concat([dt_pred, rf_pred, rg_pred], axis = 1)
    
    y_pred = preds.apply(lambda row: row[int(random.uniform(0, 1) * 3 // 1)], axis = 1)

    return roc_auc_score(y_test, y_pred)

In [None]:
def combine(list_of_diseases, gender = None, new_features = False):
    dfs = [] 
    
    colnum = 202 if gender == 'M' else 152
    
    for fname in list_of_diseases:
        with open('data/' + fname) as f:
            content = f.readlines()
            
        content = [x.strip().split(' ')[:colnum] for x in content]
        cols = ['p_id', 'c_id'] + [fname + str(i) for i in range(colnum - 2)]
        
        df = pd.DataFrame(content, columns = cols)
        df = df.set_index('c_id')
        
        if new_features:
            # Using mean overweights other diseases but generally proportional
            df[fname + '_total'] = df.iloc[:, 1:].sum(axis = 1)
            df[fname + '_6mos'] = df.iloc[:, 1:28].sum(axis = 1)
            df[fname + '_1y'] = df.iloc[:, 1:53].sum(axis = 1)
            df[fname + '_2y'] = df.iloc[:, 1:105].sum(axis = 1)

        dfs.append(df)
        
    combined = pd.concat(dfs, sort = True)
    
    if new_features:
        filter_cols = [col for col in combined.columns if '_' not in col]
        combined['total'] = combined[filter_cols].sum(axis = 1)
    
    combined['sex'] = combined.p_id.apply(lambda x: x[0])
    combined['diagnosis'] = combined.p_id.apply(lambda x: x[1:4])
    combined['state'] = combined.p_id.apply(lambda x: x[4:6])
    combined['county'] = combined.p_id.apply(lambda x: x[6:9])

    combined, le = encode(combined)
    
    if gender:
        combined = combined[combined.sex == le.index(gender)]
        
    return combined