# import libraries

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report,confusion_matrix

# load datasets

In [5]:
df = pd.read_excel('Combined_Thyroid.xlsx')


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4201 entries, 0 to 4200
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4201 non-null   int64  
 1   Sex                      4201 non-null   int64  
 2   On_thyroxine             4201 non-null   int64  
 3   Query_on_thyroxine       4201 non-null   int64  
 4   On_antithyroid_medicine  4201 non-null   int64  
 5   Sick                     4201 non-null   int64  
 6   Pregnant                 4201 non-null   int64  
 7   Thyroid_surgery          4201 non-null   int64  
 8   I131_treatment           4201 non-null   int64  
 9   Query_hypothyroid        4201 non-null   int64  
 10  Query_hyperthyroid       4201 non-null   int64  
 11  Lithium                  4201 non-null   int64  
 12  Goitre                   4201 non-null   int64  
 13  Tumor                    4201 non-null   int64  
 14  Hypopituitary           

In [7]:
df.head()

Unnamed: 0,Age,Sex,On_thyroxine,Query_on_thyroxine,On_antithyroid_medicine,Sick,Pregnant,Thyroid_surgery,I131_treatment,Query_hypothyroid,...,Goitre,Tumor,Hypopituitary,Pysch,TSH,T3,TT4,T4U,FTI,Class
0,5,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0.061,0.0096,0.013,0.116,0.011,1
1,78,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0.025,0.009,0.05,0.084,0.06,1
2,73,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.047,0.011,0.052,0.09,0.058,1
3,43,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.07,0.005,0.0029,0.104,0.0028,1
4,42,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.031,0.008,0.013,0.119,0.011,1


In [8]:
df.isnull().sum()

Age                        0
Sex                        0
On_thyroxine               0
Query_on_thyroxine         0
On_antithyroid_medicine    0
Sick                       0
Pregnant                   0
Thyroid_surgery            0
I131_treatment             0
Query_hypothyroid          0
Query_hyperthyroid         0
Lithium                    0
Goitre                     0
Tumor                      0
Hypopituitary              0
Pysch                      0
TSH                        0
T3                         0
TT4                        0
T4U                        0
FTI                        0
Class                      0
dtype: int64

In [9]:
df.columns.tolist()

['Age',
 'Sex',
 'On_thyroxine',
 'Query_on_thyroxine',
 'On_antithyroid_medicine',
 'Sick',
 'Pregnant',
 'Thyroid_surgery',
 'I131_treatment',
 'Query_hypothyroid',
 'Query_hyperthyroid',
 'Lithium',
 'Goitre',
 'Tumor',
 'Hypopituitary',
 'Pysch',
 'TSH',
 'T3',
 'TT4',
 'T4U',
 'FTI',
 'Class']

In [10]:
X1 = df[['Age',
 'Sex',
 'On_thyroxine',
 'Query_on_thyroxine',
 'On_antithyroid_medicine',
 'Sick',
 'Pregnant',
 'Thyroid_surgery',
 'I131_treatment',
 'Query_hypothyroid',
 'Query_hyperthyroid',
 'Lithium',
 'Goitre',
 'Tumor',
 'Hypopituitary',
 'Pysch',
 'TSH',
 'T3',
 'TT4',
 'T4U',
 'FTI']].values
Y1 = df['Class'].values

# Pass 1

In [11]:
X = np.array(X1)  # input features
y = np.array(Y1)  # output labels

# Define the number of partitions
n_splits = 5

# Shuffle the data randomly
np.random.seed(42)
indices = np.random.permutation(len(X))
X = X[indices]
y = y[indices]

# Create the KFold object to generate the partitions
kf = KFold(n_splits=n_splits)

# Train and test the random forest model on each partition
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Partition {i+1}:")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the random forest model on the training set
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Test the random forest model on the test set
    y_pred = model.predict(X_test)
    predicted_integers = [int(round(value)) for value in y_pred]

    print(classification_report(y_test,predicted_integers))

    
    print('-'*25)


Partition 1:


              precision    recall  f1-score   support

           0       0.95      1.00      0.98        59
           1       1.00      1.00      1.00       684
           2       0.94      1.00      0.97        16
           3       1.00      0.99      0.99        82

    accuracy                           1.00       841
   macro avg       0.97      1.00      0.98       841
weighted avg       1.00      1.00      1.00       841

-------------------------
Partition 2:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97        57
           1       1.00      1.00      1.00       705
           2       0.64      1.00      0.78         7
           3       1.00      0.94      0.97        71

    accuracy                           0.99       840
   macro avg       0.90      0.98      0.93       840
weighted avg       0.99      0.99      0.99       840

-------------------------
Partition 3:
              precision    recall  f1-score   suppor

# Pass 2

In [12]:
# Load your dataset here
X = np.array(X1)  # input features
y = np.array(Y1)  # output labels

# Define the number of partitions
n_splits = 5

# Shuffle the data randomly
np.random.seed(42)
indices = np.random.permutation(len(X))
# print(indices)
X = X[indices]
y = y[indices]

# Create the KFold object to generate the partitions
kf = KFold(n_splits=n_splits)

# Train and test the model on different partitions
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Partition {i+1}:")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the machine learning model on the training set
    model = RandomForestRegressor()
    model.fit(X_train, y_train)

    # Test the machine learning model on the remaining partitions
    for j, (train_index2, test_index2) in enumerate(kf.split(X)):
        if j != i:
            X_train2, X_test2 = X[train_index2], X[test_index2]
            y_train2, y_test2 = y[train_index2], y[test_index2]
            
            y_pred = model.predict(X_test2)
            predicted_integers = [int(round(value)) for value in y_pred]
            print(classification_report(y_test2,predicted_integers))      
            print("-"*50)
    print('-'*55)

Partition 1:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        57
           1       1.00      1.00      1.00       705
           2       1.00      1.00      1.00         7
           3       1.00      1.00      1.00        71

    accuracy                           1.00       840
   macro avg       1.00      1.00      1.00       840
weighted avg       1.00      1.00      1.00       840

--------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        54
           1       1.00      1.00      1.00       722
           2       1.00      1.00      1.00         7
           3       1.00      1.00      1.00        57

    accuracy                           1.00       840
   macro avg       1.00      1.00      1.00       840
weighted avg       1.00      1.00      1.00       840

--------------------------------------------------
              p

# Pass 1 with confusion matrix

In [13]:
dff = pd.DataFrame(columns=['Partition','Precision','recall','F-score','FPR','Accuracy','Sensitivity','Specificity']) # creating empty data frame with some column

In [14]:

# Load your dataset here
X = np.array(X1)  # input features
y = np.array(Y1)  # output labels

# Define the number of partitions
n_splits = 5

# Shuffle the data randomly
np.random.seed(42)
indices = np.random.permutation(len(X))
X = X[indices]
y = y[indices]

# Create the KFold object to generate the partitions
kf = KFold(n_splits=n_splits)

# Train and test the random forest model on each partition
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Partition {i+1}:")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the random forest model on the training set
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Test the random forest model on the test set
    y_pred = model.predict(X_test)
    predicted_integers = [int(round(value)) for value in y_pred]

    a = confusion_matrix(y_test,predicted_integers)

    precision = a[0][0]/(a[0][0]+a[0][1])
    recall = a[0][0]/(a[0][0]+a[1][0])
    F_score = (2*(precision*recall))/(precision+recall)
    Fal_ps_r = a[0][1]/(a[0][1]+a[1][1])
    Accuracy = (a[0][0]+a[1][1])/(a[0][0]+a[0][1]+a[1][0]+a[1][1])
    Sensitivity = a[0][0]/(a[0][0]+a[1][0])
    Specificity = a[1][1]/(a[1][1]+a[0][1])

    row = [i+1,precision,recall,F_score,Fal_ps_r,Accuracy,Sensitivity,Specificity]
    print(f"precision: {precision}")
    print(f"recall: {recall}")
    print(f"F-score: {F_score}")
    print(f"FPR:{Fal_ps_r} ")
    print(f"Accuracy: {Accuracy}")
    print(f"Sensitivity: {Sensitivity}")
    print(f"Specificity: {Specificity}")

    dff.loc[len(dff)] = row 

    
    print('-'*25)


Partition 1:
precision: 1.0
recall: 0.9516129032258065
F-score: 0.9752066115702479
FPR:0.0 
Accuracy: 0.9959623149394348
Sensitivity: 0.9516129032258065
Specificity: 1.0
-------------------------
Partition 2:
precision: 0.9824561403508771
recall: 0.9655172413793104
F-score: 0.9739130434782608
FPR:0.0014204545454545455 
Accuracy: 0.9960629921259843
Sensitivity: 0.9655172413793104
Specificity: 0.9985795454545454
-------------------------
Partition 3:
precision: 1.0
recall: 0.9818181818181818
F-score: 0.9908256880733944
FPR:0.0 
Accuracy: 0.9987113402061856
Sensitivity: 0.9818181818181818
Specificity: 1.0
-------------------------
Partition 4:
precision: 0.9692307692307692
recall: 1.0
F-score: 0.9843749999999999
FPR:0.0029282576866764276 
Accuracy: 0.9973190348525469
Sensitivity: 1.0
Specificity: 0.9970717423133236
-------------------------
Partition 5:
precision: 0.9821428571428571
recall: 0.9821428571428571
F-score: 0.9821428571428571
FPR:0.0013966480446927375 
Accuracy: 0.9974093264248

# Pass 2 with confusion matrix

In [15]:
df3 = pd.DataFrame(columns=['Partition','Sub partition','Precision','recall','F-score','FPR','Accuracy','Sensitivity','Specificity']) # creating an empty Dataframe with some columns


In [16]:

# Load your dataset here
X = np.array(X1)  # input features
y = np.array(Y1)  # output labels

# Define the number of partitions
n_splits = 5

# Shuffle the data randomly
np.random.seed(42)
indices = np.random.permutation(len(X))
# print(indices)
X = X[indices]
y = y[indices]

# Create the KFold object to generate the partitions
kf = KFold(n_splits=n_splits)

# Train and test the model on different partitions
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Partition {i+1}:")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the machine learning model on the training set
    model = RandomForestRegressor(n_estimators=200, random_state=20)
    model.fit(X_train, y_train)

    # Test the machine learning model on the remaining partitions
    for j, (train_index2, test_index2) in enumerate(kf.split(X)):
        if j != i:
            X_train2, X_test2 = X[train_index2], X[test_index2]
            y_train2, y_test2 = y[train_index2], y[test_index2]
            
            y_pred = model.predict(X_test2)
            predicted_integers = [int(round(value)) for value in y_pred]
            a = confusion_matrix(y_test2,predicted_integers)

            
            precision = a[0][0]/(a[0][0]+a[0][1])
            recall = a[0][0]/(a[0][0]+a[1][0])
            F_score = (2*(precision*recall))/(precision+recall)
            Fal_ps_r = a[0][1]/(a[0][1]+a[1][1])
            Accuracy = (a[0][0]+a[1][1])/(a[0][0]+a[0][1]+a[1][0]+a[1][1])
            Sensitivity = a[0][0]/(a[0][0]+a[1][0])
            Specificity = a[1][1]/(a[1][1]+a[0][1]) 

            print(f"precision: {precision}")
            print(f"recall: {recall}")
            print(f"F-score: {F_score}")
            print(f"FPR:{Fal_ps_r} ")
            print(f"Accuracy: {Accuracy}")
            print(f"Sensitivity: {Sensitivity}")
            print(f"Specificity: {Specificity}") 
            row = [i+1,j+1,precision,recall,F_score,Fal_ps_r,Accuracy,Sensitivity,Specificity]
            df3.loc[len(df3)] = row # inserting evaluation data in a dataframe
            print("-"*50)
    print('-'*55)

Partition 1:
precision: 1.0
recall: 1.0
F-score: 1.0
FPR:0.0 
Accuracy: 1.0
Sensitivity: 1.0
Specificity: 1.0
--------------------------------------------------
precision: 1.0
recall: 1.0
F-score: 1.0
FPR:0.0 
Accuracy: 1.0
Sensitivity: 1.0
Specificity: 1.0
--------------------------------------------------
precision: 1.0
recall: 1.0
F-score: 1.0
FPR:0.0 
Accuracy: 1.0
Sensitivity: 1.0
Specificity: 1.0
--------------------------------------------------
precision: 1.0
recall: 1.0
F-score: 1.0
FPR:0.0 
Accuracy: 1.0
Sensitivity: 1.0
Specificity: 1.0
--------------------------------------------------
-------------------------------------------------------
Partition 2:
precision: 1.0
recall: 1.0
F-score: 1.0
FPR:0.0 
Accuracy: 1.0
Sensitivity: 1.0
Specificity: 1.0
--------------------------------------------------
precision: 1.0
recall: 1.0
F-score: 1.0
FPR:0.0 
Accuracy: 1.0
Sensitivity: 1.0
Specificity: 1.0
--------------------------------------------------
precision: 1.0
recall: 1.0
F-s

In [17]:
dff

Unnamed: 0,Partition,Precision,recall,F-score,FPR,Accuracy,Sensitivity,Specificity
0,1.0,1.0,0.951613,0.975207,0.0,0.995962,0.951613,1.0
1,2.0,0.982456,0.965517,0.973913,0.00142,0.996063,0.965517,0.99858
2,3.0,1.0,0.981818,0.990826,0.0,0.998711,0.981818,1.0
3,4.0,0.969231,1.0,0.984375,0.002928,0.997319,1.0,0.997072
4,5.0,0.982143,0.982143,0.982143,0.001397,0.997409,0.982143,0.998603


In [18]:
df3

Unnamed: 0,Partition,Sub partition,Precision,recall,F-score,FPR,Accuracy,Sensitivity,Specificity
0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
1,1.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
2,1.0,4.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
3,1.0,5.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
4,2.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
5,2.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
6,2.0,4.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
7,2.0,5.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
8,3.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
9,3.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0


# training whole combined dataset

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train,X_test,Y_train,Y_test = train_test_split(X1,Y1,test_size=0.2,random_state=0)

In [21]:
rfc = RandomForestRegressor(n_estimators=200)

In [22]:
rfc.fit(X_train,Y_train)

In [24]:
pred = rfc.predict(X_test)
predicted_integers = [int(round(value)) for value in pred]

In [25]:
print(classification_report(Y_test,predicted_integers))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95        64
           1       1.00      1.00      1.00       701
           2       1.00      1.00      1.00        12
           3       1.00      1.00      1.00        64

    accuracy                           0.99       841
   macro avg       0.99      0.99      0.99       841
weighted avg       0.99      0.99      0.99       841

