In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('dataSets/migraine_symptom_classification.csv')
df.head()

Unnamed: 0,Age,Duration,Frequency,Location,Character,Intensity,Nausea,Vomit,Phonophobia,Photophobia,...,Vertigo,Tinnitus,Hypoacusis,Diplopia,Defect,Ataxia,Conscience,Paresthesia,DPF,Type
0,30,1,5,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
1,50,3,5,1,1,3,1,1,1,1,...,1,0,0,0,0,0,0,0,0,Typical aura with migraine
2,53,2,1,1,1,2,1,1,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
3,45,3,5,1,1,3,1,0,1,1,...,1,0,0,0,0,0,0,0,0,Typical aura with migraine
4,53,1,1,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,1,Typical aura with migraine


In [3]:
# Checking for null values
df.isna().sum()

Age            0
Duration       0
Frequency      0
Location       0
Character      0
Intensity      0
Nausea         0
Vomit          0
Phonophobia    0
Photophobia    0
Visual         0
Sensory        0
Dysphasia      0
Dysarthria     0
Vertigo        0
Tinnitus       0
Hypoacusis     0
Diplopia       0
Defect         0
Ataxia         0
Conscience     0
Paresthesia    0
DPF            0
Type           0
dtype: int64

In [5]:
# Checking for duplicates
df[df.duplicated()]

Unnamed: 0,Age,Duration,Frequency,Location,Character,Intensity,Nausea,Vomit,Phonophobia,Photophobia,...,Vertigo,Tinnitus,Hypoacusis,Diplopia,Defect,Ataxia,Conscience,Paresthesia,DPF,Type
94,28,1,5,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,1,Typical aura with migraine
118,28,1,5,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,1,Typical aura with migraine
169,31,1,1,1,1,2,1,1,1,1,...,0,0,0,0,0,0,0,0,1,Typical aura with migraine
200,50,1,1,1,1,3,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
280,22,1,1,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
281,35,1,1,1,1,3,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine


In [6]:
# There is not that many so we will just drop them
df.drop_duplicates(inplace=True)

In [7]:
df[df.duplicated()]

Unnamed: 0,Age,Duration,Frequency,Location,Character,Intensity,Nausea,Vomit,Phonophobia,Photophobia,...,Vertigo,Tinnitus,Hypoacusis,Diplopia,Defect,Ataxia,Conscience,Paresthesia,DPF,Type


In [8]:
df.nunique()

Age            51
Duration        3
Frequency       8
Location        3
Character       3
Intensity       4
Nausea          2
Vomit           2
Phonophobia     2
Photophobia     2
Visual          5
Sensory         3
Dysphasia       2
Dysarthria      2
Vertigo         2
Tinnitus        2
Hypoacusis      2
Diplopia        2
Defect          2
Ataxia          1
Conscience      2
Paresthesia     2
DPF             2
Type            7
dtype: int64

In [None]:
# We will first just do without binning

categoricalCols = df.drop(columns='Age').columns
categoricalCols

Index(['Duration', 'Frequency', 'Location', 'Character', 'Intensity', 'Nausea',
       'Vomit', 'Phonophobia', 'Photophobia', 'Visual', 'Sensory', 'Dysphasia',
       'Dysarthria', 'Vertigo', 'Tinnitus', 'Hypoacusis', 'Diplopia', 'Defect',
       'Ataxia', 'Conscience', 'Paresthesia', 'DPF', 'Type'],
      dtype='object')

In [20]:
df['ageRanges'] = pd.qcut(df['Age'], q=5, duplicates='drop', labels=False)
df.drop(columns='Age', inplace=True)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinalEncoder = OrdinalEncoder()
# df[categoricalCols] = ordinalEncoder.fit_transform(df[categoricalCols])
df = ordinalEncoder.fit_transform(df)

array([[0., 4., 1., ..., 0., 5., 2.],
       [2., 4., 1., ..., 0., 5., 4.],
       [1., 0., 1., ..., 0., 5., 4.],
       ...,
       [2., 1., 1., ..., 1., 0., 2.],
       [2., 0., 1., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 3.]])

In [23]:
# Splitting

from sklearn.model_selection import train_test_split


target = 'Type'
X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100, stratify=y)

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

In [17]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier


param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 5, 10],
    'max_depth': [3,5,10, None],
}
cv = StratifiedKFold(n_splits=3)
model = GridSearchCV(DecisionTreeClassifier(class_weight='balanced', random_state=42), param_grid, cv=cv, scoring='accuracy')
model.fit(X_test, y_test)

bestModel = model.best_estimator_

In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = bestModel.predict(X_test)

print("Best Parameters:", model.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2}
Accuracy: 1.0
Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         4
         1.0       1.00      1.00      1.00         5
         2.0       1.00      1.00      1.00        12
         3.0       1.00      1.00      1.00         3
         4.0       1.00      1.00      1.00         3
         5.0       1.00      1.00      1.00        48
         6.0       1.00      1.00      1.00         4

    accuracy                           1.00        79
   macro avg       1.00      1.00      1.00        79
weighted avg       1.00      1.00      1.00        79

Confusion Matrix:
 [[ 4  0  0  0  0  0  0]
 [ 0  5  0  0  0  0  0]
 [ 0  0 12  0  0  0  0]
 [ 0  0  0  3  0  0  0]
 [ 0  0  0  0  3  0  0]
 [ 0  0  0  0  0 48  0]
 [ 0  0  0  0  0  0  4]]
