In [122]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [123]:
#Reading the file
df = pd.read_csv('thyroid.txt', delimiter='\t')

In [124]:
df.head(10)

Unnamed: 0,age,sex,on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,Thyroid
0,41,F,f,f,f,f,f,f,f,f,f,f,f,1.3,2.5,125,1.14,109,P
1,23,F,f,f,f,f,f,f,f,f,f,f,f,4.1,2,102,?,?,P
2,46,M,f,f,f,f,f,f,f,f,f,f,f,0.98,?,109,0.91,120,P
3,70,F,t,f,f,f,f,f,f,f,f,f,f,0.16,1.9,175,?,?,P
4,70,F,f,f,f,f,f,f,f,f,f,f,f,0.72,1.2,61,0.87,70,P
5,18,F,t,f,f,f,f,f,f,f,f,f,f,0.03,?,183,1.3,141,P
6,59,F,f,f,f,f,f,f,f,f,f,f,f,?,?,72,0.92,78,P
7,80,F,f,f,f,f,f,f,f,f,f,f,f,2.2,0.6,80,0.7,115,P
8,66,F,f,f,f,f,f,f,f,f,t,f,f,0.6,2.2,123,0.93,132,P
9,68,M,f,f,f,t,f,f,f,f,f,f,f,2.4,1.6,83,0.89,93,P


In [125]:

# Dataset description
'''
The dataset contains information related to thyroid patients. Here are the details of the columns:
- age: Age of the patient (numeric)
- sex: Gender of the patient (categorical: 'F' for female, 'M' for male)
- on thyroxine: Whether the patient is on thyroxine medication (categorical: 'f' for false, 't' for true)
- on antithyroid medication: Whether the patient is on antithyroid medication (categorical: 'f' for false, 't' for true)
- sick: Whether the patient is sick (categorical: 'f' for false, 't' for true)
- pregnant: Whether the patient is pregnant (categorical: 'f' for false, 't' for true)
- thyroid surgery: Whether the patient has undergone thyroid surgery (categorical: 'f' for false, 't' for true)
- I131 treatment: Whether the patient has received I131 treatment (categorical: 'f' for false, 't' for true)
- lithium: Whether the patient is taking lithium medication (categorical: 'f' for false, 't' for true)
- goitre: Whether the patient has goitre (categorical: 'f' for false, 't' for true)
- tumor: Whether the patient has a tumor (categorical: 'f' for false, 't' for true)
- hypopituitary: Whether the patient has hypopituitary disorder (categorical: 'f' for false, 't' for true)
- psych: Whether the patient has a psychiatric disorder (categorical: 'f' for false, 't' for true)
- TSH: Thyroid-stimulating hormone (numeric)
- T3: Triiodothyronine hormone (numeric)
- TT4: Total thyroxine hormone (numeric)
- T4U: Thyroxine utilization rate (numeric)
- FTI: Free thyroxine index (numeric)
- Thyroid: Target variable indicating whether the patient has thyroid disorder (categorical: 'N' for negative, 'P' for positive)
'''

"\nThe dataset contains information related to thyroid patients. Here are the details of the columns:\n- age: Age of the patient (numeric)\n- sex: Gender of the patient (categorical: 'F' for female, 'M' for male)\n- on thyroxine: Whether the patient is on thyroxine medication (categorical: 'f' for false, 't' for true)\n- on antithyroid medication: Whether the patient is on antithyroid medication (categorical: 'f' for false, 't' for true)\n- sick: Whether the patient is sick (categorical: 'f' for false, 't' for true)\n- pregnant: Whether the patient is pregnant (categorical: 'f' for false, 't' for true)\n- thyroid surgery: Whether the patient has undergone thyroid surgery (categorical: 'f' for false, 't' for true)\n- I131 treatment: Whether the patient has received I131 treatment (categorical: 'f' for false, 't' for true)\n- lithium: Whether the patient is taking lithium medication (categorical: 'f' for false, 't' for true)\n- goitre: Whether the patient has goitre (categorical: 'f' for

In [126]:
df.shape

(3772, 19)

In [127]:
df.dtypes

age                          object
sex                          object
on thyroxine                 object
on antithyroid medication    object
sick                         object
pregnant                     object
thyroid surgery              object
I131 treatment               object
lithium                      object
goitre                       object
tumor                        object
hypopituitary                object
psych                        object
TSH                          object
T3                           object
TT4                          object
T4U                          object
FTI                          object
Thyroid                      object
dtype: object

In [128]:
# Data preprocessing
'''
1. Replaced '?' with NaN for missing values
2. Dropped rows with missing values
3. Converted specific columns to float data type
'''

"\n1. Replaced '?' with NaN for missing values\n2. Dropped rows with missing values\n3. Converted specific columns to float data type\n"

In [129]:
df.replace('?', pd.NA, inplace=True)
df.dropna(inplace=True)

In [130]:
df[['TSH', 'T3', 'TT4', 'T4U','age', 'FTI']] = df[['TSH', 'T3', 'TT4', 'T4U','age', 'FTI']].astype(float)

In [131]:
# Data exploration
'''
- Displayed the summary statistics of the dataset using df.describe()
- Checked for missing values in the dataset using df.isnull().sum()
- Checked the data types of the columns using df.dtypes
'''

'\n- Displayed the summary statistics of the dataset using df.describe()\n- Checked for missing values in the dataset using df.isnull().sum()\n- Checked the data types of the columns using df.dtypes\n'

In [132]:
df.describe()

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI
count,2643.0,2643.0,2643.0,2643.0,2643.0,2643.0
mean,53.081725,5.035978,2.00115,107.858683,0.99565,109.435906
std,20.367966,23.974851,0.823814,35.460437,0.196445,32.472156
min,1.0,0.005,0.05,2.0,0.25,2.0
25%,37.0,0.5,1.5,88.0,0.87,93.0
50%,55.0,1.3,2.0,103.0,0.98,107.0
75%,69.0,2.6,2.3,124.0,1.09,124.0
max,455.0,530.0,10.6,430.0,2.12,395.0


In [133]:
df.isnull().sum()

age                          0
sex                          0
on thyroxine                 0
on antithyroid medication    0
sick                         0
pregnant                     0
thyroid surgery              0
I131 treatment               0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
TSH                          0
T3                           0
TT4                          0
T4U                          0
FTI                          0
Thyroid                      0
dtype: int64

In [134]:
df.dtypes

age                          float64
sex                           object
on thyroxine                  object
on antithyroid medication     object
sick                          object
pregnant                      object
thyroid surgery               object
I131 treatment                object
lithium                       object
goitre                        object
tumor                         object
hypopituitary                 object
psych                         object
TSH                          float64
T3                           float64
TT4                          float64
T4U                          float64
FTI                          float64
Thyroid                       object
dtype: object

In [135]:
#Separating Dependent and Independent features

In [136]:
X = df.drop('Thyroid', axis=1)
y = df['Thyroid']

In [137]:
# Model selection and performance evaluation
'''
1. Splitted the dataset into training and testing sets using train_test_split
2. Performed data preprocessing using ColumnTransformer for feature scaling and one-hot encoding
3. Fit the logistic regression model on the training data
4. Predicted the target variable for the testing data
5. Evaluated the model's accuracy using accuracy_score
6. Evaluated the model's precision using precision_score
7. Evaluated the model's recall using recall_score
'''

"\n1. Splitted the dataset into training and testing sets using train_test_split\n2. Performed data preprocessing using ColumnTransformer for feature scaling and one-hot encoding\n3. Fit the logistic regression model on the training data\n4. Predicted the target variable for the testing data\n5. Evaluated the model's accuracy using accuracy_score\n6. Evaluated the model's precision using precision_score\n7. Evaluated the model's recall using recall_score\n"

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [139]:
X_train

Unnamed: 0,age,sex,on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI
2290,47.0,F,f,f,f,f,f,f,f,f,f,f,f,0.790,1.2,117.0,0.85,138.0
3576,73.0,F,f,f,f,f,f,f,f,f,f,f,f,0.025,2.3,122.0,0.86,142.0
816,24.0,M,f,f,f,f,f,f,f,f,f,f,f,2.400,2.8,122.0,1.13,109.0
1371,56.0,F,f,f,t,f,f,f,f,f,t,f,f,0.200,2.5,138.0,1.15,119.0
3445,52.0,M,f,f,f,f,t,t,f,f,f,f,f,0.040,1.9,86.0,0.77,112.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2348,70.0,F,f,f,f,f,f,f,f,f,f,f,f,0.005,1.7,71.0,1.08,66.0
1552,36.0,M,f,f,f,f,f,f,f,f,f,f,f,1.700,2.6,129.0,1.18,109.0
1597,61.0,F,f,f,f,f,f,f,f,f,f,f,f,0.040,0.8,74.0,0.75,99.0
1846,33.0,M,f,f,f,f,f,f,f,f,f,f,t,1.000,2.2,84.0,0.84,100.0


In [140]:
numeric_features = X.select_dtypes(include=['float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns


In [141]:
categorical_features

Index(['sex', 'on thyroxine', 'on antithyroid medication', 'sick', 'pregnant',
       'thyroid surgery', 'I131 treatment', 'lithium', 'goitre', 'tumor',
       'hypopituitary', 'psych'],
      dtype='object')

In [142]:
#These below lines selects the column names of the numeric features and categorical features in the dataset X.
#Select_dtypes method is used to filter columns based on their data type.
numeric_features = X.select_dtypes(include=['float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns


In [143]:
#A StandardScaler object is instantiated. This scaler is responsible for standardizing the numeric features by subtracting the mean and scaling to unit variance.
numeric_transformer = StandardScaler()

In [144]:
#An OneHotEncoder object is created, which is used to perform one-hot encoding on the categorical features.
#This encoding converts categorical variables into a binary vector representation, where each category becomes a binary feature column.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [145]:
#The ColumnTransformer is initialized, which allows for applying different transformations to different subsets of the dataset.
#It takes a list of transformers, where each transformer is defined by a name, a transformation object, and the corresponding features it should be applied to.
#In this case, the numeric features are transformed using numeric_transformer and labeled as 'num', while the categorical features are transformed using categorical_transformer and labeled as 'cat'.

In [146]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train_processed = preprocessor.fit_transform(X_train)

categories = [preprocessor.named_transformers_['cat'].categories_[i]
              for i in range(len(categorical_features))]

X_test_processed = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', OneHotEncoder(categories=categories, handle_unknown='ignore'), categorical_features)
    ]).fit_transform(X_test)

In [147]:
'''
I chose logistic regression for this model based on below factors:
1.The target variable in the dataset is binary, indicating whether a patient has a thyroid disorder or not.
2.Logistic regression is best choice for binary classification problems as it models the probability of the positive class (having a thyroid disorder) given the input features.
3.Logistic regression is having a lower risk of overfitting compared to other models when the dataset has a limited number of observations or a small number of informative features.
4.This makes it suitable for datasets with moderate-sized samples.
5.Logistic regression is computationally efficient and performs well even with a large number of features.
6.It can handle both numeric and categorical features, making it suitable for datasets with mixed data types like the thyroid dataset.
'''

'\nI chose logistic regression for this model based on below factors:\n1.The target variable in the dataset is binary, indicating whether a patient has a thyroid disorder or not. \n2.Logistic regression is best choice for binary classification problems as it models the probability of the positive class (having a thyroid disorder) given the input features.\n3.Logistic regression is having a lower risk of overfitting compared to other models when the dataset has a limited number of observations or a small number of informative features. \n4.This makes it suitable for datasets with moderate-sized samples.\n5.Logistic regression is computationally efficient and performs well even with a large number of features. \n6.It can handle both numeric and categorical features, making it suitable for datasets with mixed data types like the thyroid dataset.\n'

In [148]:
model = LogisticRegression()
model.fit(X_train_processed, y_train)


y_pred = model.predict(X_test_processed)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9395085066162571


In [149]:
#In this dataset the classes are imbalanced, meaning that one class (e.g., presence of disease) is much less frequent than the other class (e.g., absence of disease).
#In this type of cases, accuracy can be misleading as it can be high even if the model simply predicts the majority class most of the time.

In [150]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [151]:
#Recall is important in disease prediction as it focuses on the correct identification of positive cases, especially when dealing with imbalanced class distribution

In [152]:
#Recall = True Positive/(True Positive + False Negative)

In [153]:
precision = precision_score(y_test, y_pred, pos_label='P')
recall = recall_score(y_test, y_pred, pos_label='P')

print("Precision:", precision)
print("Recall:", recall)

Precision: 0.9404761904761905
Recall: 0.9957983193277311


The recall value of 0.99 indicates that the model is able to correctly identify approximately 99.5% of the actual positive cases (individuals with a thyroid disorder). The high recall score suggests that the model is effective at capturing the majority of the positive cases and minimizing false negatives. This means that the model has a low chance of missing individuals who truly have a thyroid disorder.

In [155]:
import pickle
with open('geekschallenge_Bharath_Nalla.pkl', 'wb') as f:
    pickle.dump(model, f)