## Import data

In [17]:
import pandas as pd

continuous = "continuous"
# read data 
binary_values = ["f","t"]
col_info={"age": continuous,
          "sex":["M", "F"],
          "on thyroxine":binary_values,
          "query on thyroxine":binary_values,
          "on antithyroid medication":binary_values,
          "sick":binary_values,
          "pregnant":binary_values,
          "thyroid surgery":binary_values,
          "I131 treatment":binary_values,
          "query hypothyroid":binary_values,
          "query hyperthyroid":binary_values,
          "lithium":binary_values,
          "goitre":binary_values,
          "tumor":binary_values,
          "hypopituitary":binary_values,
          "psych":binary_values,
          "TSH measured":binary_values,
          "TSH":continuous,
          "T3 measured":binary_values,
          "T3":continuous,
          "TT4 measured":binary_values,
          "TT4":continuous,
          "T4U measured":binary_values,
          "T4U":continuous,
          "FTI measured":binary_values,
          "FTI":continuous,
          "TBG measured":binary_values,
          "TBG":continuous,
          "referral source": ["WEST", "STMW", "SVHC", "SVI", "SVHD", "other"],
          "record identification":continuous
          } 

data_url = "./thyroid+disease/thyroid0387.data"
# data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/thyroid0387.data"
df = pd.read_csv(data_url, names=col_info.keys(), header=None, index_col=False)
df.shape[0]


9172

# Data cleaning 

In [18]:
# check for NA values in data 

df.replace("?", pd.NA, inplace=True)

incompelete_columns = []

# check which columns have a large amount of missing data
print(f"toatl number of rows : {df.shape[0]} ")
print("=====================")
for name in col_info.keys():
    na_count = df[name].isna().sum()
    if na_count != 0:
        incompelete_columns.append(name)
        print(f"{name} column has {na_count} NA values")
   

print("=====================")

# to see if there are missing values for tests that where conducted 
# Get columns related to measurements
measurement_related_cols = [key for key in col_info.keys() if "measured" in key]

for measurement_col in measurement_related_cols:    
    # Select rows where measurement was conducted
    rows_with_measurement = df[df[measurement_col] == "t"]
    rows_without_measurement = df[df[measurement_col] == "f"]
    
    measurement_name = measurement_col.split()[0]
    # Count the number of valid measurements
    valid_measurement_count = rows_with_measurement[measurement_name].notna().sum()
    # Count the number of invalid measurements
    invalid_measurement_count = rows_with_measurement[measurement_name].isna().sum()
    # Count the number of measurements taken without being requested
    measurement_without_request_count = rows_without_measurement[measurement_name].notna().sum()
    
    # Print the results
    print(f"{measurement_name} :\n"
          f"Rows with measurement = {rows_with_measurement.shape[0]}\n"
          f"Valid measurement count = {valid_measurement_count}\n"
          f"Invalid measurement count = {invalid_measurement_count}\n"
          f"Measurement taken without request count = {measurement_without_request_count}\n"
          "=======================\n"
         )


toatl number of rows : 9172 
sex column has 307 NA values
TSH column has 842 NA values
T3 column has 2604 NA values
TT4 column has 442 NA values
T4U column has 809 NA values
FTI column has 802 NA values
TBG column has 8823 NA values
TSH :
Rows with measurement = 8330
Valid measurement count = 8330
Invalid measurement count = 0
Measurement taken without request count = 0

T3 :
Rows with measurement = 6568
Valid measurement count = 6568
Invalid measurement count = 0
Measurement taken without request count = 0

TT4 :
Rows with measurement = 8730
Valid measurement count = 8730
Invalid measurement count = 0
Measurement taken without request count = 0

T4U :
Rows with measurement = 8363
Valid measurement count = 8363
Invalid measurement count = 0
Measurement taken without request count = 0

FTI :
Rows with measurement = 8370
Valid measurement count = 8370
Invalid measurement count = 0
Measurement taken without request count = 0

TBG :
Rows with measurement = 349
Valid measurement count = 349

as we can see above we have found some columns with missing data and need to see what we do about it 
we also found that no missing data exists for mesurments that where taken for </br>

TSH  
T3  
TT4  
T4U  
FTI  
TBG

there for all missing values for those columns are for tests that were not conducted and so 
should remain NA TODO reward the entire thing later 

we can also see that everywhere we have a value for a column named messurment it was preformed there for we can drop the messurment columns as they are suporflous data 
additionaly this saves us the trouble of checking if the model that we train on the data 
we should note that some of the missing data is for test that might not have been taken and so there shoulddnt be any values 


In [19]:
# droping messured columns as described earlyier 
df.drop(columns=measurement_related_cols, inplace=True)
# remove from col_info for later preprocessing 
for col in measurement_related_cols:
    col_info.pop(col ,None)

column_names = df.columns.tolist()

# Print the column names
print(column_names)

['age', 'sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG', 'referral source', 'record identification']


In [20]:
# fill in missing sex col values 
import numpy as np
sex_counts = df["sex"].value_counts()
print(f"sex_counts before : {sex_counts}")

# if sex is NA and pregenent is True then we set sex to female
df.loc[(df["sex"].isna()) & (df["pregnant"] == "t"), "sex"] = "F"
# Count the occurrences of each sex
sex_counts = df["sex"].value_counts()
print(f"sex_counts after pregnenet : {sex_counts}")
# fill in missing sex col values with the same distribution of the sex that appear in the df 
# Calculate the proportion of each sex
total_count = sex_counts.sum()
male_proportion = sex_counts.get('M', 0) / total_count
female_proportion = sex_counts.get('F', 0) / total_count

# Fill missing values randomly based on the distribution
missing_indices = df[df['sex'].isna()].index
missing_count = len(missing_indices)
fill_values = np.random.choice(['M', 'F'], size=missing_count,
                               p=[male_proportion, female_proportion])
df.loc[missing_indices, 'sex'] = fill_values
sex_counts = df['sex'].value_counts()
print(f"sex_counts after : {sex_counts}")

sex_counts before : sex
F    6073
M    2792
Name: count, dtype: int64
sex_counts after pregnenet : sex
F    6077
M    2792
Name: count, dtype: int64
sex_counts after : sex
F    6278
M    2894
Name: count, dtype: int64


In [21]:
# Extract the diagnosis from the last column and put it in a new column 
diagnosis_pattern = r"(.*)(?=\[\d*\])" 
df["diagnosis"] = df["record identification"].str.extract(diagnosis_pattern)

# extract the number from the 'record_identification' column
# df["record identification"] = df["record identification"].str.extract(r"\[(\d+)\]")
# Print column names
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,psych,TSH,T3,TT4,T4U,FTI,TBG,referral source,record identification,diagnosis
0,29,F,f,f,f,f,f,f,f,t,...,f,0.3,,,,,,other,-[840801013],-
1,29,F,f,f,f,f,f,f,f,f,...,f,1.6,1.9,128.0,,,,other,-[840801014],-
2,41,F,f,f,f,f,f,f,f,f,...,f,,,,,,11.0,other,-[840801042],-
3,36,F,f,f,f,f,f,f,f,f,...,f,,,,,,26.0,other,-[840803046],-
4,32,F,f,f,f,f,f,f,f,f,...,f,,,,,,36.0,other,S[840803047],S


In [22]:
# check for duplicate rows
duplicate_rows = df.duplicated()
duplicated_df = df[duplicate_rows]
num_duplicates = duplicated_df.shape[0]
print(f"complete dulpicate rows : {num_duplicates}" )

# check for duplicate record identification
duplicate_values = df["record identification"].duplicated()
duplicated_rows = df[duplicate_values]
num_duplicates = duplicated_rows.shape[0]
print(f"rows with duplicated record identification {num_duplicates}")


complete dulpicate rows : 0
rows with duplicated record identification 0


as we can see from the code above there are no duplicate rows or record identification
TODO check if record identification means patiant number 

In [23]:
# drop "record identification" and "referral source" columns as they do not contain relvent data  
df.drop(columns=["record identification", "referral source"], inplace=True)
col_info.pop("record identification")
col_info.pop("referral source")


['WEST', 'STMW', 'SVHC', 'SVI', 'SVHD', 'other']

In [24]:
# check all categorical data collumns have knwon categorical data 
# pattern checks integers or floats 
num_pattern = r"\d+(?:\.\d+)?"

for column, value_options in col_info.items():
    # Only iterate over each categorical column
    if value_options == continuous:
        
       # Check if each value in the column matches the integer or float pattern
        non_numeric_values = df[column][(~df[column].isna()) & (~df[column].astype(str).str.match(num_pattern))]
        # Convert non-numeric values to a list
        non_numeric_values_list = non_numeric_values.tolist()
        # Print the non-numeric values
        if len(non_numeric_values_list) != 0:
            print(f"Non-numeric values in column '{column}':")
            print(non_numeric_values_list)
    
    else:

        unique_values = df[column].unique()
        # Check if all unique values are within known categories
        unknown_values = [value for value in unique_values 
                          if (not pd.isna(value) ) and (value not in value_options)]
        if len(unknown_values) != 0:    
            print(f"The following unknown categories are found in {column} column:", unknown_values)
    
print("done")

done


as we can see from running the code above all values in the data base are known values 
 there are no surprising values in any column 
TODO word this better 


In [25]:

for col , values in col_info.items():
    if values == continuous:    
        print(f"{col} info : max {df[col].dropna().max()} | min {df[col].dropna().min()}")



age info : max 65526 | min 1
TSH info : max 99 | min 0.005
T3 info : max 9.5 | min 0.05
TT4 info : max 99 | min 10
T4U info : max 2.33 | min 0.17
FTI info : max 99 | min 1.4
TBG info : max 96 | min 0.1


a  max value 65526 for age is probubly an error this column should be inevstigated for further data problems 
 
 
 TSH, T3, TT4, T4U, FTI  :  range of values seem plausible as they fall within the typical reference range.

TBG (Thyroxine-Binding Globulin): The TBG values range from 0.1 to 96. While the maximum value seems high, it's not necessarily impossible. However, extremely high values should be reviewed for accuracy or potential outliers in the future.
 

In [26]:
# inspecting age column for further data errors as explained above 


ages_gt_100 = df[df['age'] > 130]
ages_gt_100.shape[0]


4

we assume that any age value larger then 130 is an error
we can see that there is only a small number of rows with an age older then 130 
we assume that removeing such a small number of rows from the dataset will no impact the prediction results greatly 
and so we will drop those rows 

In [27]:
# we assume that ages greater then 130 are probubly errors in the data and will be removed 
df = df.drop(df[df["age"] > 130].index)
# chceck to see max age again after changes 
df["age"].max()


97

In [28]:
# replace different diagnosis with a mapping of 3 bins 

normal = "normal"
hyperfunction = "hyperfunction"
subnormal = "subnormal"
diagnosis_mapping = {"A":hyperfunction,
            "B":hyperfunction,
            "C":hyperfunction,
            "D":hyperfunction,
            "O":hyperfunction,
            "P":hyperfunction,
            "Q":hyperfunction,
            "T":hyperfunction,
            "E":subnormal,
            "F":subnormal,
            "G":subnormal,
            "H":subnormal,
            "L":subnormal,
            "M":subnormal,
            "N":subnormal,
            "K":normal,
            "I":normal,
            "J":normal,
            "S":normal,
            "-":normal,
            normal : normal,
            hyperfunction : hyperfunction, 
            subnormal : subnormal
          }

def clearify_diagnosis(diagnosis):
    """
    this function removes conflicting diagnosis and diagnosis which are not conclusive 
    """
    if diagnosis in diagnosis_mapping:
        return diagnosis_mapping.get(diagnosis)
    
    curr_mapping = None
    
    for letter in diagnosis:
        if letter == "|":
            continue
        
        if letter not in diagnosis_mapping :
            return pd.NA  # Drop the row if any letter is not in the mapping

        curr_letter_mapping = diagnosis_mapping.get(letter) 
        if curr_mapping == None:
            curr_mapping = curr_letter_mapping
        elif curr_letter_mapping != curr_mapping:
            return pd.NA # Drop row if conflicting diagnosis
        
    # if all possible diagnosis have the same mapping then return the mapping  
    return curr_mapping

df["diagnosis"] = df["diagnosis"].apply(clearify_diagnosis)
df.dropna(subset=["diagnosis"], inplace=True)

df["diagnosis"].unique()

array(['normal', 'subnormal', 'hyperfunction'], dtype=object)

In [29]:
# noww that we have finnished dealing with the data will encode all the categorical data to integeres 
# and the pd.NA to -999 
encoding_map = {
    "F":0,
    "M":1,
    "f":0,
    "t":1,
    pd.NA:-999,
    normal : 0,
    hyperfunction : 1, 
    subnormal : 2
}

def encode(value):
    return encoding_map.get(value,value)

df = df.applymap(encode)
df["diagnosis"].unique()

array([0, 2, 1])

In [30]:
df.to_csv("pre_proc.csv", index=False)

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [34]:


X = df.drop("diagnosis",axis=1)
Y = df["diagnosis"]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
log_regression = LogisticRegression(max_iter=10000, random_state=42)
log_regression.fit(X_train, y_train)
log_regression_y_pred = log_regression.predict(X_test)


In [35]:
accuracy = accuracy_score(y_test, log_regression_y_pred)
print ("Logistic Regression ")
print("Accuracy:", accuracy)

print("Classification Report:")
print(classification_report(y_test, log_regression_y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, log_regression_y_pred))

Logistic Regression 
Accuracy: 0.8866213151927438
Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1539
           1       0.71      0.23      0.35        43
           2       0.95      0.10      0.19       182

    accuracy                           0.89      1764
   macro avg       0.85      0.44      0.49      1764
weighted avg       0.89      0.89      0.85      1764

Confusion Matrix:
[[1535    3    1]
 [  33   10    0]
 [ 162    1   19]]


In [36]:
print(X_test.shape[0])

1764


In [37]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)
random_forest_y_pred = random_forest.predict(X_test)

In [38]:
accuracy = accuracy_score(y_test, random_forest_y_pred)
print ("Random Forest ")
print("Accuracy:", accuracy)

print("Classification Report:")
print(classification_report(y_test, random_forest_y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, random_forest_y_pred))

Random Forest 
Accuracy: 0.9863945578231292
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1539
           1       0.89      0.74      0.81        43
           2       0.95      0.98      0.96       182

    accuracy                           0.99      1764
   macro avg       0.94      0.91      0.92      1764
weighted avg       0.99      0.99      0.99      1764

Confusion Matrix:
[[1529    3    7]
 [   8   32    3]
 [   2    1  179]]


In [39]:
# hyperfunction = [ A, B, C, D, O, P, Q, T ]
# subnormal = [ E, F, G, H, L, M, N ]
# normal = [ K, I, J, S, - ]

SyntaxError: invalid syntax (2344834862.py, line 3)