In [27]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, f1_score, cohen_kappa_score
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Location of the raw dataset, relative to the notebook's working directory.
filename = './datasets/horse-colic.csv'

# One seed for everything: it is handed to NumPy's global RNG here and reused
# as `random_state` by the splitter and the decision tree further down.
random_state = 42
np.random.seed(random_state)

# 1

In [28]:
# Column labels (1-based, see below) discarded right after loading.
to_remove = [3, 25, 26, 27, 28]

# header=None: the first line of the file is read as data, not as a header.
# Every '?' cell is turned into NaN so pandas treats it as missing.
df = pd.read_csv(filename, header=None).replace('?', np.nan)
# Label the 28 columns 1..28 instead of pandas' default 0..27.
df.columns = range(1, 29)
df = df.drop(columns=to_remove)
df.head()

Unnamed: 0,1,2,4,5,6,7,8,9,10,11,...,15,16,17,18,19,20,21,22,23,24
0,2,1,38.5,66,28,3.0,3.0,,2,5.0,...,,,3.0,5.0,45.0,8.4,,,2,2
1,1,1,39.2,88,20,,,4.0,1,3.0,...,,,4.0,2.0,50.0,85.0,2.0,2.0,3,2
2,2,1,38.3,40,24,1.0,1.0,3.0,1,3.0,...,,,1.0,1.0,33.0,6.7,,,1,2
3,1,9,39.1,164,84,4.0,1.0,6.0,2,2.0,...,2.0,5.0,3.0,,48.0,7.2,3.0,5.3,2,1
4,2,1,37.3,104,35,,,6.0,2,,...,,,,,74.0,7.4,,,2,2


In [29]:
# Report how many values are missing in each remaining column.
# isna().sum() counts the NaNs of every column in one pass; items() yields
# (column label, count) pairs in column order.
for col, n_missing in df.isna().sum().items():
    print(f'Number of NaN in column {col}: {n_missing}')

Number of NaN in column 1: 1
Number of NaN in column 2: 0
Number of NaN in column 4: 60
Number of NaN in column 5: 24
Number of NaN in column 6: 58
Number of NaN in column 7: 56
Number of NaN in column 8: 69
Number of NaN in column 9: 47
Number of NaN in column 10: 32
Number of NaN in column 11: 55
Number of NaN in column 12: 44
Number of NaN in column 13: 56
Number of NaN in column 14: 104
Number of NaN in column 15: 106
Number of NaN in column 16: 247
Number of NaN in column 17: 102
Number of NaN in column 18: 118
Number of NaN in column 19: 29
Number of NaN in column 20: 33
Number of NaN in column 21: 165
Number of NaN in column 22: 198
Number of NaN in column 23: 1
Number of NaN in column 24: 0


# 2

In [30]:
# Label of the column used as the prediction target.
target = 23
# A row whose target is missing cannot be used for training or evaluation:
# keep only the rows where the target is present.
df = df[df[target].notna()]

In [31]:
# Labels of every column except the target, i.e. the feature columns.
column_names = df.columns[df.columns != target]

In [32]:
# Fill every missing feature value with the mean of its column.
# NOTE(review): the imputer is fitted on all rows, before the train/test
# split performed later in the notebook, so held-out rows contribute to the
# imputed means. Fitting it on the training rows only would avoid that.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# fit_transform returns a bare array, so the labels have to be reattached.
# Passing index=df.index keeps the imputed frame label-aligned with `df`
# (and therefore with the target column taken from it): rows were dropped
# earlier, so a fresh 0..n-1 index would no longer match df's index and any
# label-based operation between the two would silently misalign.
df_predicting = pd.DataFrame(
    imputer.fit_transform(df.iloc[:, df.columns != target]),
    columns=column_names,
    index=df.index,
)

In [33]:
# Feature matrix (imputed predictors) and label vector (target column).
X, y = df_predicting, df[target]

# 3

In [34]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

In [35]:
# Decision Tree: pick max_depth (1..19) by 5-fold cross-validated accuracy
# on the training split.
score = 'accuracy'
param_grid_dt = {'max_depth': [depth for depth in range(1, 20)]}
model_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=random_state),
    param_grid=param_grid_dt,
    scoring=score,
    cv=5,
)
model_dt.fit(X_train, y_train)
print(f'Best parameters for Decision Tree: {model_dt.best_params_}')

Best parameters for Decision Tree: {'max_depth': 7}


In [36]:
# Predict the held-out rows; predict() on the fitted search delegates to the
# best estimator it found.
y_pred_dt = model_dt.predict(X_test)

# normalize="true" divides each row by its total: rows are the true classes,
# so the diagonal is the per-class recall.
cm_dt = confusion_matrix(y_test, y_pred_dt, normalize="true")
print(f'Confusion matrix for Decision Tree:\n{cm_dt}')

Confusion matrix for Decision Tree:
[[0.70588235 0.17647059 0.11764706]
 [0.29411765 0.47058824 0.23529412]
 [0.55555556 0.11111111 0.33333333]]


In [37]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores, so every
# class counts equally regardless of how many test rows it has.
f1_model_dt = f1_score(y_true=y_test, y_pred=y_pred_dt, average='macro')
print(f'F1 score for Decision Tree: {f1_model_dt}')

F1 score for Decision Tree: 0.5019607843137256


In [38]:
# Cohen's kappa: agreement between true and predicted labels, corrected for
# the agreement expected by chance.
cohen_kappa_model_dt = cohen_kappa_score(y1=y_test, y2=y_pred_dt)
print(f'Cohen Kappa score for Decision Tree: {cohen_kappa_model_dt}')

Cohen Kappa score for Decision Tree: 0.2822966507177034


# 4

In [40]:
# KNN: pick n_neighbors (1..19) by 5-fold cross-validated accuracy on the
# training split.
# NOTE(review): KNN is distance-based and no feature scaling is visible in
# this notebook -- confirm whether that is intended.
score = 'accuracy'
param_grid_knn = {'n_neighbors': [k for k in range(1, 20)]}
model_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    scoring=score,
    cv=5,
)
model_knn.fit(X_train, y_train)
print(f'Best parameters for KNN: {model_knn.best_params_}')

Best parameters for KNN: {'n_neighbors': 3}


In [41]:
# Predict the held-out rows with the best KNN model found by the search.
y_pred_knn = model_knn.predict(X_test)

# Row-normalized confusion matrix: rows are the true classes, so the diagonal
# is the per-class recall.
cm_knn = confusion_matrix(y_test, y_pred_knn, normalize="true")
print(f'Confusion matrix for KNN:\n{cm_knn}')

Confusion matrix for KNN:
[[0.85294118 0.14705882 0.        ]
 [0.58823529 0.35294118 0.05882353]
 [0.66666667 0.22222222 0.11111111]]


In [42]:
# Same two metrics as for the decision tree, computed on the same test rows,
# so the models can be compared directly.
f1_model_knn = f1_score(y_true=y_test, y_pred=y_pred_knn, average='macro')
cohen_kappa_model_knn = cohen_kappa_score(y1=y_test, y2=y_pred_knn)

print(f'F1 score for KNN: {f1_model_knn}')
print(f'Cohen Kappa score for KNN: {cohen_kappa_model_knn}')

F1 score for KNN: 0.43866513233601845
Cohen Kappa score for KNN: 0.21354451119606765


# 5

In [48]:
# Side-by-side summary of both models: one row per model, one column per
# test-set metric.
results = pd.DataFrame(
    [
        ('Decision Tree', f1_model_dt, cohen_kappa_model_dt),
        ('KNN', f1_model_knn, cohen_kappa_model_knn),
    ],
    columns=['Model', 'F1 score', 'Cohen Kappa score'],
)

results

Unnamed: 0,Model,F1 score,Cohen Kappa score
0,Decision Tree,0.501961,0.282297
1,KNN,0.438665,0.213545
