In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## 1. Loading the Dataset

In [3]:
# Read the thyroid dataset from a path relative to the notebook's working directory.
data_path = 'data/filtered_thyroid_data.csv'
df = pd.read_csv(data_path)

## 2. Data Preprocessing

### Handling Missing Values

In [4]:
# Impute missing values column by column:
#   * numeric columns     -> median (the mode of a continuous feature is often
#                            an arbitrary value, so it is a poor fill)
#   * every other column  -> most frequent value (mode)
# Columns without missing values are left untouched, and a column that is
# entirely missing is skipped because it has no usable median/mode.
# NOTE(review): this runs before the train/test split, so the fill statistics
# are computed on the full dataset.
for column in df.columns:
    series = df[column]
    if not series.isnull().any():
        continue

    if pd.api.types.is_numeric_dtype(series):
        fill_value = series.median()
    else:
        modes = series.mode()  # empty when every entry is missing
        fill_value = modes.iloc[0] if not modes.empty else None

    if pd.isna(fill_value):
        continue
    df[column] = series.fillna(fill_value)

### Encoding the Target Variable ['Recurred']

In [5]:
# Encode the target as integers. LabelEncoder numbers the classes in sorted
# order, so with 'No'/'Yes' labels (assumed — confirm against the CSV) the
# mapping is 'No' -> 0 (Not Recurred) and 'Yes' -> 1 (Recurred).
le_target = LabelEncoder().fit(df['Recurred'])
df['Recurred'] = le_target.transform(df['Recurred'])

In [6]:
# Target vector (y) is the 'Recurred' column; the feature matrix (X) is every other column.
y = df['Recurred']
X = df.drop(columns='Recurred')

### Encoding Categorical Features (One-Hot Encoding)


In [7]:
# One-hot encode the categorical features into binary indicator columns.
# The selection covers object, string and category dtypes — the same set
# pd.get_dummies encodes by default — so text columns stored with a
# non-object dtype are not silently left unencoded.
# drop_first=True drops one level per feature to avoid a redundant column.
categorical_cols = X.select_dtypes(include=['object', 'string', 'category']).columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [8]:
# Hold out 20% of the rows for testing (80% train); random_state pins the shuffle
# so the split is reproducible.
# NOTE(review): the split is not stratified; the recorded test set has 58 vs 19
# samples per class — consider whether stratify=y is wanted.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Feature Scaling

In [9]:
# Standardise the features: KNN is distance-based, so unscaled features would
# not contribute equally. The scaler is fitted on the training set only and the
# same fitted transform is then applied to both sets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 3. Model Implementation


### 3.1 Decision Tree Classifier


In [10]:
# Train a decision tree on the scaled training data (seeded for reproducibility),
# then predict labels for the held-out test set.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_dt = dt_classifier.predict(X_test_scaled)

### 3.2 K-Nearest Neighbors (KNN) Classifier


In [11]:
# Train KNN with k=5 (a common starting point) on the scaled training data,
# then predict labels for the held-out test set.
# NOTE(review): the recorded run of this cell shows a joblib/loky stack trace
# ([WinError 2] while counting physical cores). It appears to be a non-fatal
# environment warning, since the evaluation cell still reports KNN results — confirm.
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred_knn = knn_classifier.predict(X_test_scaled)

[WinError 2] The system cannot find the file specified
  File "c:\Users\Administrator\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 199, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\Administrator\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Administrator\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Administrator\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


## 4. Evaluation and Comparison

In [12]:
# Score both models on the held-out test set up front; dt_acc and knn_acc are
# reused by the comparison cell that follows.
dt_acc = accuracy_score(y_test, y_pred_dt)
knn_acc = accuracy_score(y_test, y_pred_knn)

print("\n--- Model Evaluation Results ---")

# Decision Tree: accuracy line, report heading, then the per-class report.
print(
    f"\nDecision Tree Accuracy: {dt_acc:.4f}",
    "Decision Tree Classification Report:",
    classification_report(y_test, y_pred_dt),
    sep="\n",
)

# KNN: same three-part layout.
print(
    f"\nKNN Accuracy: {knn_acc:.4f}",
    "KNN Classification Report:",
    classification_report(y_test, y_pred_knn),
    sep="\n",
)


--- Model Evaluation Results ---

Decision Tree Accuracy: 0.9740
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        58
           1       0.95      0.95      0.95        19

    accuracy                           0.97        77
   macro avg       0.97      0.97      0.97        77
weighted avg       0.97      0.97      0.97        77


KNN Accuracy: 0.9481
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        58
           1       1.00      0.79      0.88        19

    accuracy                           0.95        77
   macro avg       0.97      0.89      0.92        77
weighted avg       0.95      0.95      0.95        77



In [13]:
# Comparison summary: report which model had the higher test-set accuracy
# (or that they tied). Only accuracy is compared here.
print("\n--- Comparison ---")
verdict = (
    "The Decision Tree model performed better on this test set."
    if dt_acc > knn_acc
    else "The KNN model performed better on this test set."
    if knn_acc > dt_acc
    else "Both models performed equally well."
)
print(verdict)


--- Comparison ---
The Decision Tree model performed better on this test set.
