In [None]:
!pip install scikit-learn==1.0.2
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


In [None]:
# Load the dataset
# "?" is the file's missing-value marker, so it is parsed as NaN at read time.
# Columns whose remaining values are numeric then get a numeric dtype straight
# away instead of being read as strings.
df = pd.read_csv('Medical diagnosis using AI/Datasets/hypothyroid.csv', na_values='?')

# Data cleaning and preprocessing
# Target encoding: "P" -> 0, "N" -> 1 (any other label would become NaN).
df["binaryClass"] = df["binaryClass"].map({"P": 0, "N": 1})
# Frame-wide exact-value replacement: t/f flags and F/M sex codes become 0/1.
df = df.replace({"t": 1, "f": 0, "F": 1, "M": 0})
df = df.drop(["TBG", "referral source"], axis=1)

# Split features and target
# NOTE: `df` and `x` intentionally still contain NaNs; imputation happens after
# the train/test split so that test rows never influence the fill values.
x = df.drop('binaryClass', axis=1)
y = df['binaryClass']

# Split into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Drop the features that are not used for modelling (hand-picked list).
# NOTE(review): this was previously labelled "constant features", but the list
# also contains numeric measurements such as FTI and T4U - confirm the rationale.
drop_cols = ['FTI', 'FTI measured', 'T4U measured', 'TT4 measured','query on thyroxine','on antithyroid medication','sick', 'pregnant','thyroid surgery','I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary','psych' , 'TSH measured', 'T4U', 'TBG measured']
x_train = x_train.drop(drop_cols, axis=1)
x_test = x_test.drop(drop_cols, axis=1)

# Impute missing values
# Column means are learned from the training split only and then reused for the
# test split. Only retained columns are imputed (FTI and T4U are dropped above).
impute_cols = ['TSH', 'T3', 'TT4', 'sex', 'age']
imputer = SimpleImputer(strategy='mean')
x_train[impute_cols] = imputer.fit_transform(x_train[impute_cols])
x_test[impute_cols] = imputer.transform(x_test[impute_cols])


In [None]:
# Feature Scaling
# The scaler is fitted on the training split only and reused for the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Random Forest with hyperparameter tuning
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy')
grid_rf.fit(x_train_scaled, y_train)
best_rf = grid_rf.best_estimator_

# XGBoost with hyperparameter tuning
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}
grid_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring='accuracy')
grid_xgb.fit(x_train_scaled, y_train)
best_xgb = grid_xgb.best_estimator_

# Evaluate Random Forest
train_pred_rf = best_rf.predict(x_train_scaled)
test_pred_rf = best_rf.predict(x_test_scaled)
train_acc_rf = accuracy_score(y_train, train_pred_rf)
test_acc_rf = accuracy_score(y_test, test_pred_rf)
print('Random Forest - CV Accuracy:', grid_rf.best_score_)
print('Random Forest - Training Accuracy:', train_acc_rf)
print('Random Forest - Test Accuracy:', test_acc_rf)

# Evaluate XGBoost
train_pred_xgb = best_xgb.predict(x_train_scaled)
test_pred_xgb = best_xgb.predict(x_test_scaled)
train_acc_xgb = accuracy_score(y_train, train_pred_xgb)
test_acc_xgb = accuracy_score(y_test, test_pred_xgb)
print('XGBoost - CV Accuracy:', grid_xgb.best_score_)
print('XGBoost - Training Accuracy:', train_acc_xgb)
print('XGBoost - Test Accuracy:', test_acc_xgb)

# Select best model
# The winner is chosen on mean cross-validated accuracy, which is computed from
# the training data only. Choosing on test accuracy would let the test split
# take part in model selection and make the reported test score optimistic.
if grid_rf.best_score_ >= grid_xgb.best_score_:
    best_model = best_rf
    print('Selected Random Forest as best model')
else:
    best_model = best_xgb
    print('Selected XGBoost as best model')

# Save best model
# Context managers make sure the file handles are flushed and closed.
filename = 'Thyroid_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

# The model was fitted on standardized features, so the fitted scaler is saved
# alongside it; inputs must go through this scaler before calling predict().
scaler_filename = 'Thyroid_scaler.sav'
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
