## HSV2 with Flu

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [2]:
hsv_one_df = pd.read_json('hsv_two_finale.json')

In [3]:
X = hsv_one_df.drop('LBXHE2', axis=1)
y = hsv_one_df['LBXHE2']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [5]:
parametrs = {'n_estimators':[i for i in range(50,1000,50)], 
             'max_depth':[i for i in range(5,10)]}

In [6]:
clf_clf = RandomForestClassifier()

In [7]:
grid_search_cv_clf = GridSearchCV(clf_clf, parametrs, cv=5, n_jobs=-1, verbose=2)

In [8]:
grid_search_cv_clf.fit(X_train, y_train)

Fitting 5 folds for each of 95 candidates, totalling 475 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 475 out of 475 | elapsed:  3.9min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [9]:
grid_search_cv_clf.best_params_

{'max_depth': 9, 'n_estimators': 550}

In [10]:
clf = DecisionTreeClassifier(criterion='gini', max_depth=9)

In [11]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [12]:
prediction = clf.predict(X_test)
accuracy_score(y_test, prediction), recall_score(y_test, prediction), precision_score(y_test, prediction)

(0.9484902309058615, 0.8363636363636363, 0.8932038834951457)

In [15]:
export_graphviz(clf, out_file='clf_2.dot',
               feature_names=X.columns, filled=True)

In [16]:
!dot -Tpng clf_2.dot -o clf_2.png

## Decision Tree example for HSV two:

<img src='clf_2.png'>