Importing the Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

Importing the Dataset

In [3]:
dataset = pd.read_csv('Datasets/dataset_revised.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
dataset.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Disease
0,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,Fungal infection
1,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,Fungal infection
2,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,Fungal infection
3,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,,Fungal infection
4,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,Fungal infection


Remove underscore

In [5]:
for col in dataset.columns:
    dataset[col] = dataset[col].str.replace('_', ' ')
    
dataset.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Disease
0,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,Fungal infection
1,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,,Fungal infection
2,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,,Fungal infection
3,itching,skin rash,dischromic patches,,,,,,,,,,,,,,,Fungal infection
4,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,,Fungal infection


Flatten the dataset values

In [6]:
cols = dataset.columns

data = dataset[cols].values.flatten()

reshaped = pd.Series(data)
reshaped = reshaped.str.strip()
reshaped = reshaped.values.reshape(dataset.shape)

dataset = pd.DataFrame(reshaped, columns = dataset.columns)
dataset.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Disease
0,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,Fungal infection
1,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,,Fungal infection
2,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,,Fungal infection
3,itching,skin rash,dischromic patches,,,,,,,,,,,,,,,Fungal infection
4,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,,Fungal infection


Fill null values with 0

In [7]:
dataset = dataset.fillna(0)
dataset.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Disease
0,itching,skin rash,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection
1,skin rash,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection
2,itching,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection
3,itching,skin rash,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection
4,itching,skin rash,nodal skin eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection


Symptoms severity

In [23]:
df_severity = pd.read_csv('Datasets/Symptom-severity.csv')
df_severity['Symptom'] = df_severity['Symptom'].str.replace('_',' ')
df_severity.head(10)

Unnamed: 0,Symptom,weight
0,itching,1
1,skin rash,3
2,nodal skin eruptions,4
3,continuous sneezing,4
4,shivering,5
5,chills,3
6,joint pain,3
7,stomach pain,5
8,acidity,3
9,ulcers on tongue,4


Split dataset to the dependent and independent variables

In [8]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

[['itching' 'skin rash' 'nodal skin eruptions' ... 0 0 0]
 ['skin rash' 'nodal skin eruptions' 'dischromic  patches' ... 0 0 0]
 ['itching' 'nodal skin eruptions' 'dischromic  patches' ... 0 0 0]
 ...
 ['burning micturition' 'bladder discomfort' 'foul smell of urine' ... 0
  0 0]
 ['skin rash' 'joint pain' 'skin peeling' ... 0 0 0]
 ['skin rash' 'high fever' 'blister' ... 0 0 0]]
['Fungal infection' 'Fungal infection' 'Fungal infection' ...
 'Urinary tract infection' 'Psoriasis' 'Impetigo']


Changing categorical data to numerical data

In [15]:
cols = dataset.columns[:-1]
one_hot_df = pd.get_dummies(dataset[cols])
one_hot_df['Disease'] = dataset['Disease']
one_hot_df.to_csv('encoded_dataset.csv', index=False)

      Symptom_1_acidity  Symptom_1_back pain  Symptom_1_bladder discomfort  \
0                     0                    0                             0   
1                     0                    0                             0   
2                     0                    0                             0   
3                     0                    0                             0   
4                     0                    0                             0   
...                 ...                  ...                           ...   
4915                  0                    0                             0   
4916                  0                    0                             0   
4917                  0                    0                             0   
4918                  0                    0                             0   
4919                  0                    0                             0   

      Symptom_1_breathlessness  Symptom_1_burning micturition  

In [21]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print(X)

[['itching' 'skin rash' 'nodal skin eruptions' ... 0 0 0]
 ['skin rash' 'nodal skin eruptions' 'dischromic  patches' ... 0 0 0]
 ['itching' 'nodal skin eruptions' 'dischromic  patches' ... 0 0 0]
 ...
 ['burning micturition' 'bladder discomfort' 'foul smell of urine' ... 0
  0 0]
 ['skin rash' 'joint pain' 'skin peeling' ... 0 0 0]
 ['skin rash' 'high fever' 'blister' ... 0 0 0]]


Split dataset to train and test set

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

Training the Decision Tree Classification Model on the training set 

In [20]:
classifier_SVC = SVC(kernel = 'linear', random_state = 0)
classifier_SVC.fit(X_train, y_train)

ValueError: could not convert string to float: 'chills'