# CCTS 40500: ML Midterm
### Abdallah Aboelela

To do:
1. Go through first run through csv and figure out which models are best to use
2. Create master df merged on id values
3. Consider only first x weeks of a child's life
4. Add fraction of time sick in first 6 months, 2 years, overall (with different diseases?)

In [2]:
import pandas as pd 
import numpy as np
import os
import csv

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score

In [3]:
'''
Takes a disease file, and reads into a pandas dataframe.
Creates from values: gender, diagnosis, state, and county
'''
def read_file(fname):
    with open(fname) as f:
        content = f.readlines()
    
    content = [x.strip().split(' ') for x in content]

    max_cols = 0
    for i, line in enumerate(content):
        max_cols = max(max_cols, len(line))

    name = fname[5:]

    columns = ['p_id', 'c_id'] + [name + str(i) for i in range(max_cols - 2)]

    df = pd.read_csv(fname, names = columns, engine = 'python', 
        delim_whitespace = True)

    df['sex'] = df.p_id.apply(lambda x: x[0])
    df['diagnosis'] = df.p_id.apply(lambda x: x[1:4])
    df['state'] = df.p_id.apply(lambda x: x[4:6])
    df['county'] = df.p_id.apply(lambda x: x[6:9])

    for col in df.columns[2:]:
        df[col] = pd.Categorical(df[col]).codes

    return df

In [4]:
'''
Runs NB, DT, RF, SVC, and Linear Regression (all with default values) on 
each disease file separately. The result of this is displayed below in df form
'''
def classify():
    rows = [['Disease', 'NB', 'DT', 'RF', 'SVC', 'LR']]

    for fname in os.listdir('data'):
        if '.' in fname or '_' in fname or fname == 'Hepatic':
            # Hepatic causes an error because there are only 4 POS 
            # and they get sorted in weird in the train/test split
            pass

        else:
            print(fname)
            df = read_file('data/' + fname)

            X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 2:].drop('diagnosis', 
                axis = 1), df.diagnosis, random_state = 42)

            nb = GaussianNB()
            nb.fit(X_train, y_train)
            y_pred = nb.predict(X_test)
            nb_auc = roc_auc_score(y_test, y_pred)
            print('NB: ', nb_auc)

            dt = DecisionTreeClassifier()
            dt.fit(X_train, y_train)
            y_pred = dt.predict(X_test)
            dt_auc = roc_auc_score(y_test, y_pred)
            print('DTC: ', dt_auc)
            print(dt.feature_importances_)


            rf = RandomForestClassifier(n_estimators = 100)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_test)
            rf_auc = roc_auc_score(y_test, y_pred)
            print('RFC: ', rf_auc)

            svc = SVC(gamma = 'auto')
            svc.fit(X_train, y_train)
            y_pred = svc.predict(X_test)
            svc_auc = roc_auc_score(y_test, y_pred)
            print('SVC: ', svc_auc)

            reg = LinearRegression()
            reg.fit(X_train, y_train)
            y_pred = reg.predict(X_test)
            reg_auc = roc_auc_score(y_test, y_pred)
            print('LinReg: ', reg_auc)
            print()

In [5]:
'''
Result of the above function, ignoring SVC because of weird values. Using csv because it takes ages to run
'''
simple_aucs = pd.read_csv('general_auc_analysis.csv', index_col = 'Disease')
print(simple_aucs)
print()
print(simple_aucs.mean())
print()
print(simple_aucs.drop('SVC', axis = 1).mean(axis = 1))

                        NB        DT        RF  SVC    LinReg
Disease                                                      
Musculoskeletal   0.599812  0.605690  0.600000  0.5  0.602927
Positive          0.599433  0.626699  0.578947  0.5  0.555611
Development       0.569473  0.627991  0.617021  0.5  0.734572
Metabolic         0.566709  0.594859  0.555556  0.5  0.808795
Neoplastic        0.703814  0.589567  0.590909  0.5  0.717596
Digestive         0.553241  0.624882  0.588235  0.5  0.660349
Negative          0.670250  0.624723  0.616438  0.5  0.791504
Urinary           0.785251  0.494327  0.666667  0.5  0.570502
PNS               0.608439  0.728929  0.735294  0.5  0.783565
Endocrine         0.489496  0.491597  0.500000  0.5  0.428571
Immune            0.576491  0.645830  0.623457  0.5  0.708723
Reproductive      0.542819  0.619301  0.562500  0.5  0.680087
Hematologic       0.641562  0.579149  0.500000  0.5  0.719665
Cardiovascular    0.614819  0.610321  0.617647  0.5  0.695378
Infectio

In [6]:
def merge():
    merged = pd.DataFrame()

    for fname in os.listdir('data'):
        if '.' in fname or '_' in fname or fname in ['Positive', 'Negative']:
            pass

        else:
            print(fname)
            new = read_file('data/' + fname)

            merged = pd.concat([merged, new], sort = True)
    
    return merged

In [None]:
merged = merge()

Musculoskeletal
Development
Metabolic
Neoplastic
Hepatic
Digestive
Urinary
PNS
Endocrine
Immune
Reproductive
Hematologic
Cardiovascular
Infectious
Respiratory
Integumentary
Ophthalmological
Procedural
