#Imporing libraries

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pandas as pd
import numpy as np
import pickle

# Loading the data

In [2]:
df = pd.read_csv("thyroid_sample.csv")

In [20]:
df.columns

Index(['age', 'sex', 'TSH', 'T3', 'TT4', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'tumor',
       'psych', 'Class'],
      dtype='object')

In [3]:
X = df.drop('Class', axis=1)
y = df['Class']

#Creating pipeline

In [4]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

In [5]:
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=0)),
    ('scaler', MinMaxScaler())
])

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# Model Training

In [7]:
xgb_model = XGBClassifier(learning_rate=0.3, max_depth=6, n_estimators=257)

In [8]:
knn_model = KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance')

In [9]:
rf_model = RandomForestClassifier(max_depth=14, max_features='sqrt', n_estimators=174)

In [10]:
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('knn', knn_model),
        ('rf', rf_model)
    ],
    voting='hard'
)

In [11]:
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', voting_clf)
])

# Model Deployment

In [12]:
full_pipeline.fit(X, y)

In [13]:
with open('model.pkl', 'wb') as f:
    pickle.dump(full_pipeline, f)

In [14]:
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
y_pred = loaded_model.predict(X_test)

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

Accuracy: 0.9943385544442348
Classification Report:
                         precision    recall  f1-score   support

compensated hypothyroid       0.98      1.00      0.99      1044
           hyperthyroid       1.00      1.00      1.00      1066
               negative       1.00      0.97      0.99      1086
    primary hypothyroid       1.00      1.00      1.00      1038
  secondary hypothyroid       1.00      1.00      1.00      1065

               accuracy                           0.99      5299
              macro avg       0.99      0.99      0.99      5299
           weighted avg       0.99      0.99      0.99      5299

[[1043    0    1    0    0]
 [   0 1066    0    0    0]
 [  21    5 1058    2    0]
 [   0    0    0 1038    0]
 [   0    0    1    0 1064]]


In [19]:
new_data = pd.DataFrame({
    'age': [25],
    'sex': [1],
    'TSH': [7.64],
    'T3': [8.3],
    'TT4': [6.7],
    'on_thyroxine': [np.nan],
    'query_on_thyroxine': [np.nan],
    'on_antithyroid_medication': [np.nan],
    'sick': [np.nan],
    'pregnant': [np.nan],
    'thyroid_surgery': [np.nan],
    'I131_treatment': [np.nan],
    'query_hypothyroid': [np.nan],
    'query_hyperthyroid': [np.nan],
    'tumor': [np.nan],
    'psych': [np.nan]
}, index=[0])

with open('model.pkl', 'rb') as f:
    loaded_pipeline = pickle.load(f)

predictions = loaded_pipeline.predict(new_data)
print("Predictions:", predictions)

Predictions: ['primary hypothyroid']
