WITH HYPERPARAMETER TUNING

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load Thyroid_Diff.csv, then split it into the target column ("Recurred")
# and the predictor columns (everything else).
dataset = pd.read_csv("Thyroid_Diff.csv")
y = dataset["Recurred"]
X = dataset.drop(columns="Recurred")

In [5]:
from sklearn.preprocessing import LabelEncoder

# Columns whose string categories are replaced by integer codes.
label_encode_cols = ['Risk', 'T', 'N', 'M', 'Stage', 'Response']

# One encoder object is re-fitted for each column in turn, so after this cell
# `le` only remembers the classes of the last column in the list.
# NOTE(review): LabelEncoder numbers classes in sorted label order, which may
# not match the intended ordering of these categories - confirm.
le = LabelEncoder()
X = X.assign(**{name: le.fit_transform(X[name]) for name in label_encode_cols})

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal columns and pass every other column through
# unchanged, then rebuild a labelled DataFrame (X_df) from the result.
assert isinstance(X, pd.DataFrame)
onehot_encode_cols = ['Gender','Smoking','Hx Smoking','Hx Radiothreapy',
                      'Thyroid Function','Physical Examination','Adenopathy',
                      'Pathology','Focality']
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='if_binary'), onehot_encode_cols)],
    remainder='passthrough',
    # Force a dense ndarray. With the default threshold the transformer may
    # return a SciPy sparse matrix (data dependent), which pd.DataFrame below
    # cannot wrap together with `columns=`.
    sparse_threshold=0,
)
X_transformed = ct.fit_transform(X)

# Output layout is: encoder columns first, then the passthrough columns in
# their original order - build the matching list of names.
encoded_cols = ct.named_transformers_['encoder'].get_feature_names_out(onehot_encode_cols)
remaining_cols = [col for col in X.columns if col not in onehot_encode_cols]
all_cols = list(encoded_cols) + remaining_cols

# Fail loudly if the names and the matrix ever get out of step.
assert X_transformed.shape[1] == len(all_cols), "column names do not match transformed width"

# Convert to DataFrame with proper column names, keeping the original row index.
X_df = pd.DataFrame(X_transformed, columns=all_cols, index=X.index)

In [7]:
# Turn the target labels into integer class codes with a throwaway encoder,
# then print the full encoded vector.
y_encoded = LabelEncoder().fit(y).transform(y)
print(y_encoded)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions in the two
# parts are left to chance - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_encoded,
    test_size=0.2,
    random_state=0,
)

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Baseline (untuned) decision tree: entropy splits, fixed seed.
classifier = DecisionTreeClassifier(
    random_state=0,
    criterion='entropy',
)
# Fit on the training split; as the cell's last expression this also
# displays the fitted estimator.
classifier.fit(X_train, y_train)

In [10]:
# Predict the held-out rows with the baseline tree and print a two-column
# array: predicted label on the left, true label on the right.
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]]


In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Evaluate the baseline tree's test-set predictions.
# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# The accuracy used to be computed mid-cell and thrown away (only a cell's
# last expression is displayed), so print it explicitly.
print('Accuracy is: ', accuracy_score(y_test, y_pred))
# Per-class precision / recall / F1 plus the averaged rows.
print(classification_report(y_test, y_pred))

[[48  3]
 [ 5 21]]
              precision    recall  f1-score   support

           0       0.91      0.94      0.92        51
           1       0.88      0.81      0.84        26

    accuracy                           0.90        77
   macro avg       0.89      0.87      0.88        77
weighted avg       0.90      0.90      0.90        77



In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Exhaustive search over decision-tree hyperparameters, scored by 5-fold
# cross-validated accuracy on the training split only.
param_grid = {
    # 'log_loss' is intentionally not listed: scikit-learn documents it as the
    # same Shannon-information-gain criterion as 'entropy', so searching both
    # only repeated identical fits (one third of the whole grid).
    'criterion': ['gini', 'entropy'],                   # Splitting criteria
    'max_depth': [None, 5, 10, 20, 30],                 # Depth of the tree
    'min_samples_split': [2, 5, 10],                    # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2, 4],                      # Minimum samples at a leaf node
    'max_features': [None, 'sqrt', 'log2'],             # Number of features to consider when looking for the best split
    'splitter': ['best', 'random']                      # Strategy used to choose the split at each node
}
# n_jobs=-1 runs the fits on all available cores.
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best params and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Evaluate on test set
best_model = grid_search.best_estimator_
print(best_model)
# From here on `y_pred` holds the tuned model's predictions, replacing the
# baseline predictions stored under the same name earlier in the notebook.
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

Best Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'splitter': 'best'}
Best Cross-Validation Accuracy: 0.9771549444738235
DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=10, random_state=0)
              precision    recall  f1-score   support

           0       0.91      0.96      0.93        51
           1       0.91      0.81      0.86        26

    accuracy                           0.91        77
   macro avg       0.91      0.88      0.90        77
weighted avg       0.91      0.91      0.91        77



In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Summary scores for the current `y_pred` against the held-out labels.
# Accuracy takes no averaging option; the other three are support-weighted
# averages over the classes.
print('Accuracy is: ', accuracy_score(y_test, y_pred))
for label, metric in (
    ('Precision is: ', precision_score),
    ('Recall is: ', recall_score),
    ('F1-score is: ', f1_score),
):
    print(label, metric(y_test, y_pred, average='weighted'))

Accuracy is:  0.9090909090909091
Precision is:  0.909310496267018
Recall is:  0.9090909090909091
F1-score is:  0.9076066790352505
