In [1]:
# %load ~/.ipython/standard_imports.py
import os
import sys
import logging
import itertools
import functools

logging.basicConfig(level=logging.INFO)
import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.model_selection
import sklearn.preprocessing

import isajosep_util
import isajosep_util.data_frame_plotter

In [2]:
# Widen pandas' display limits so wide/long frames are shown in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000


Ultimately, our goal is to build an ML model that predicts the diagnosis (DX) from symptoms and other factors. In this project you can demonstrate some or all of the following:
- Perform exploratory analysis and visualization of data 
- **Build one or more ML models on the data (this sheet)**
- **Select the best model, and report its performance and error analysis (this sheet)**
- Perform feature selection to find most salient features
- Explore ways to deal with unbalanced data
- Try using external ML classifiers (e.g. IBM Watson)
etc, etc

In [3]:
# Directory holding the formatted dataset. Override it with the GYANT_DATA_DIR
# environment variable; the default is the original local path.
DATA_DIR = os.environ.get('GYANT_DATA_DIR', '/Users/ijoseph/Code/Data/Gyant')
# NOTE: unpickling can execute arbitrary code -- only load pickles from a trusted source.
formatted_data = pd.read_pickle(os.path.join(DATA_DIR, 'final_formatting_all_obs.pkl'))

# Encoding 

In [4]:
# Encode the diagnosis labels (DX) as integer class ids.
# `fit` returns the encoder itself, so `le_fit` and `le` refer to the same
# fitted object; both names are kept because they are defined by this cell.
le = sklearn.preprocessing.LabelEncoder()
le_fit = le.fit(formatted_data['DX'])
le_trans = le_fit.transform(formatted_data['DX'])

In [5]:
# Store the integer-encoded diagnosis labels as the DX_enc column.
formatted_data['DX_enc'] = le_trans

In [6]:
# Replace the current index with a fresh 0..n-1 index; the previous index is
# kept as ordinary column(s) because `drop` is left at its default.
# NOTE(review): the feature matrix passed to the search later only drops
# 'DX' and 'DX_enc', so the former index column is fed to the model as a
# feature -- confirm that is intended.
formatted_data = formatted_data.reset_index()

# Train/Test Split

Split with seed 42

In [7]:
# Shuffle and split the observations into train/test partitions.
# test_size=0.25 is the library default, written out for the reader.
SPLIT_SEED = 42
train, test = sklearn.model_selection.train_test_split(
    formatted_data,
    test_size=0.25,
    random_state=SPLIT_SEED,
)

In [8]:
# Show (rows, columns) for the train and test partitions.
tuple(split.shape for split in (train, test))

((15916, 2262), (5306, 2262))

# Model Tuning (via CV)

Use a form of stochastic gradient descent (SGD), which estimates the gradient from a subset of the observations at each step, trading some accuracy for a faster fit.

## SVC

In [9]:
import sklearn.linear_model

In [11]:
# Linear SVM fitted by stochastic gradient descent: hinge loss with an
# elastic-net (mixed L1/L2) penalty.
# random_state is fixed (same seed as the train/test split) so that repeated
# fits of this stochastic estimator give the same result.
svc_est = sklearn.linear_model.SGDClassifier(
    loss='hinge',
    penalty='elasticnet',
    random_state=42,
    verbose=2,  # emit training progress output
)

Search $\alpha$ (overall penalty strength) from $1 \times 10^{-8}$ to $1 \times 10^{8}$, and `l1_ratio` (relative weight of the $L_1$ vs. $L_2$ penalty) from 0.1 to 0.9.

In [12]:
# Candidate hyper-parameter values for the randomized search:
#   alpha    -- 50 log-spaced values from 1e-8 to 1e8
#   l1_ratio -- 50 evenly spaced values from 0.1 to 0.9
# (num=50 is numpy's default, written out explicitly.)
param_distributions = dict(
    alpha=np.logspace(-8, 8, num=50),
    l1_ratio=np.linspace(0.1, 0.9, num=50),
)

In [13]:
# Randomized hyper-parameter search over `param_distributions`, scored with
# 3-fold cross-validation on 3 parallel workers.
# n_iter=10 is the library default (10 candidates x 3 folds = 30 fits), written
# out explicitly. random_state pins which candidates are sampled, so the
# search can be repeated.
rscv = sklearn.model_selection.RandomizedSearchCV(
    estimator=svc_est,
    param_distributions=param_distributions,
    n_iter=10,
    n_jobs=3,
    cv=3,
    random_state=42,
    verbose=3,
)

In [None]:
# Features: every training column except the raw diagnosis and its integer
# encoding. Target: the integer-encoded diagnosis.
train_features = train.drop(columns=['DX', 'DX_enc'])
train_target = train['DX_enc']
rscv.fit(X=train_features, y=train_target)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
