In [21]:
import pandas as pd
import numpy as np

download_path = '~/Downloads/OHASDataset.csv'

def load_csv(path):
    return pd.read_csv(path)

data = load_csv(download_path)


In [22]:
data.describe()

Unnamed: 0,Weight,Height,Age,BMI_Level
count,2129.0,2129.0,2129.0,2129.0
mean,263.462189,172.744481,25.689056,30.451574
std,366.432493,11.167576,6.185196,6.155878
min,42.0,140.0,14.0,15.96
25%,92.0,162.0,21.0,25.935
50%,145.0,172.0,25.0,30.21
75%,290.0,181.0,29.0,34.32
max,3363.0,213.0,57.0,53.13


In [23]:
data.dtypes

Disease         object
Disease_CUI     object
Symptoms        object
Symptom_CUI     object
Weight           int64
Height           int64
Intensity       object
Severity        object
Age              int64
Gender          object
BMI_Level      float64
Region          object
Season          object
dtype: object

In [24]:
#I don't see Weight or Height being useful because BMI is simply a combonation of those factors
#so I am going to simply remove them
data.dropna(['Height'], axis=1, inplace=True)
data.dropna(['Weight'], axis=1, inplace=True)

Unnamed: 0,Disease,Disease_CUI,Symptoms,Symptom_CUI,Height,Intensity,Severity,Age,Gender,BMI_Level,Region,Season
0,influenza,C0162565,uncoordi162tion,C0039239,180,high,medium,24,female,27.900,southwest,Summer
1,influenza,C0162565,fever,C0000737,170,low,medium,23,male,33.770,southeast,Summer
2,influenza,C0162565,pleuritic pain,C0235704,162,low,low,24,male,33.000,southeast,Summer
3,influenza,C0162565,snuffle,C0030554,162,high,medium,34,male,22.705,northwest,Summer
4,influenza,C0162565,throat sore,C0030552,185,low,high,21,male,28.880,northwest,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...
2124,migraine disorders,C0020532,dizziness,C0005959,162,high,high,20,male,38.170,northwest,Summer
2125,migraine disorders,C0020532,numbness,C0031256,173,low,low,20,female,36.860,northeast,Summer
2126,migraine disorders,C0020532,162usea,C0014591,168,high,high,24,female,32.395,southwest,Winter
2127,migraine disorders,C0020532,fever,C0025323,168,medium,medium,17,male,42.750,southeast,Winter


In [25]:
#Delete uncesessary/redundant rows
data.drop(['Disease_CUI'], axis=1, inplace=True)
data.drop(['Symptom_CUI'], axis=1, inplace=True)
data.drop(['Region'], axis=1, inplace=True)
#majority of cases both severity and intensity hold the same value, so may aswell delete one of them...
data.drop(['Severity'], axis=1, inplace=True)

In [98]:
collumns = ['Symptoms', 'Intensity', 'Age', 'Gender', 'BMI_Level']

#drop unfilled rows
data = data.dropna()

In [149]:
#Now we need to split the test and training data, and remove the labels
from sklearn.model_selection import train_test_split

tr_data, te_data = train_test_split(data, test_size=0.2, random_state=58)

trainLabels = tr_data[['Disease']].copy()
testLabels = te_data[['Disease']].copy()

train = tr_data.drop(['Disease'], axis=1)
test = te_data.drop(['Disease'], axis=1)


In [150]:
#Convert train/test data to numpy array

def to_arr(data):
    return data.to_numpy()

train = to_arr(train)
test = to_arr(test)

In [151]:
train

array([['haemoptysis', 68, 180, ..., 'male', 35.8, 'Winter'],
       ['diarrhea', 56, 162, ..., 'male', 29.07, 'Summer'],
       ['sleepy', 408, 167, ..., 'female', 20.235, 'Winter'],
       ...,
       ['tremor', 1337, 175, ..., 'male', 35.75, 'Summer'],
       ['sweating increased', 1284, 162, ..., 'male', 27.645, 'Summer'],
       ['debilitation', 86, 162, ..., 'male', 34.9, 'Summer']],
      dtype=object)

In [159]:
#Now we need to encode all text attributes to OneHot encoding
from sklearn.preprocessing import OneHotEncoder

def convert_to_onehot(x):
    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(x)
    return enc.transform(x).toarray()

#Convert labels to OneHot
train_labels = convert_to_onehot(trainLabels)
test_labels = convert_to_onehot(testLabels)


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# define example

# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
#integer_encoded = train.reshape(len(train), 1)
onehot_encoded = onehot_encoder.fit_transform(train)
train = onehot_encoded

In [169]:
print(train)

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [164]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train, train_labels)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [167]:
print(clf.predict([train[100]]))

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0.]]
