In [7]:
import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn import tree
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

In [8]:
df = pd.read_csv('data/transformed_data.csv', index_col=0)

In [9]:
df.head()

Unnamed: 0,Disease,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,Fungal infection,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
X = df.drop(['Disease'], axis=1)
y = df['Disease']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

### Initializing a classifier

In [12]:
clf = DecisionTreeClassifier(
    random_state=42, 
    max_depth=30,
    min_samples_leaf=3)

In [13]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=30, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [14]:
accuracy_score(clf.predict(X_test), y_test)

0.8152709359605911

### Graphviz

In [15]:
import graphviz
# DOT data
dot_data = tree.export_graphviz(clf, out_file=None, 
                                feature_names=df.columns.tolist()[1:],  
                                class_names=df.Disease.unique().tolist(),
                                filled=True)

graph = graphviz.Source(dot_data, format="png") 
graph.render('decision_tree')

'decision_tree.png'

### Saving model

In [57]:
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(clf, file)

### Loading model

In [2]:
with open("model.pkl", 'rb') as file:
    pickle_model = pickle.load(file)

### Predicting

In [22]:
X_train.iloc[0]a

itching                 1
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Name: 4308, Length: 132, dtype: int64

In [30]:
a = X_train.iloc[0].to_numpy()

In [34]:
clf.predict(a.reshape(1,132))

array(['Chronic cholestasis'], dtype=object)

### Function which preparing data 

In [35]:
df_sym = pd.read_csv('data/Symptom-severity.csv')
symptoms = df_sym['Symptom'].unique().tolist()

values = df_sym.values.tolist()
s2i = dict(zip(symptoms, range(len(symptoms))))
i2s = {v: k for k,v in s2i.items()}

In [46]:
def transform(lst, s2i):
    disease = lst[0]
    res = [0]*len(s2i)
    for s in lst:
        if s == 0:
            break
        res[s2i[s.replace(" ", "")]] = 1
    return np.array(res)

In [48]:
#transform(['itching'], s2i)

### Production

In [61]:
patient_request = ""

In [62]:
embedded = transform(patient_request.split(","), s2i)

In [63]:
embedded

array([1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [64]:
clf.predict(embedded.reshape(1,132))

array(['Drug Reaction'], dtype=object)