In [40]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import RandomizedSearchCV
import pickle

In [41]:
df=pd.read_csv('filtered_thyroid_data.csv')
df.head()

Unnamed: 0,Age,Gender,Hx Radiothreapy,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              383 non-null    int64 
 1   Gender           383 non-null    object
 2   Hx Radiothreapy  383 non-null    object
 3   Adenopathy       383 non-null    object
 4   Pathology        383 non-null    object
 5   Focality         383 non-null    object
 6   Risk             383 non-null    object
 7   T                383 non-null    object
 8   N                383 non-null    object
 9   M                383 non-null    object
 10  Stage            383 non-null    object
 11  Response         383 non-null    object
 12  Recurred         383 non-null    object
dtypes: int64(1), object(12)
memory usage: 39.0+ KB


In [43]:
df.tail()

Unnamed: 0,Age,Gender,Hx Radiothreapy,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
378,72,M,Yes,Right,Papillary,Uni-Focal,High,T4b,N1b,M1,IVB,Biochemical Incomplete,Yes
379,81,M,Yes,Extensive,Papillary,Multi-Focal,High,T4b,N1b,M1,IVB,Structural Incomplete,Yes
380,72,M,No,Bilateral,Papillary,Multi-Focal,High,T4b,N1b,M1,IVB,Structural Incomplete,Yes
381,61,M,Yes,Extensive,Hurthel cell,Multi-Focal,High,T4b,N1b,M0,IVA,Structural Incomplete,Yes
382,67,M,No,Bilateral,Papillary,Multi-Focal,High,T4b,N1b,M0,IVA,Structural Incomplete,Yes


In [44]:
scaler = StandardScaler()
df['Age'] = scaler.fit_transform(df[['Age']])

In [45]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['Gender'] = encoder.fit_transform(df['Gender'])

In [46]:
df['Hx Radiothreapy'] = df['Hx Radiothreapy'].map({'No': 0, 'Yes': 1})

In [47]:
df['Focality'] = df['Focality'].map({'Uni-Focal': 0, 'Multi-Focal': 1})

In [48]:
df['T'] = df['T'].map({'T1a': 1, 'T1b': 2, 'T2': 3})
df['N'] = df['N'].map({'N0': 0, 'N1': 1})
df['M'] = df['M'].map({'M0': 0, 'M1': 1})

In [49]:
df['Stage'] = df['Stage'].map({'I': 1, 'II': 2, 'III': 3, 'IV': 4})

In [50]:
df['Response'] = df['Response'].map({'Indeterminate': 0, 'Excellent': 1})

In [51]:
df['Risk'] = df['Risk'].map({'Low': 0, 'Medium': 1, 'High': 2})

In [52]:
df['Recurred'] = df['Recurred'].map({'No': 0, 'Yes': 1})

In [53]:
df.isnull().sum()

Age                  0
Gender               0
Hx Radiothreapy      0
Adenopathy           0
Pathology            0
Focality             0
Risk               102
T                  140
N                  115
M                    0
Stage               14
Response           114
Recurred             0
dtype: int64

In [54]:
df=df.drop(columns=['Adenopathy','Pathology'],errors='ignore')

In [55]:
df = df.dropna()

In [56]:
df.isnull().sum()

Age                0
Gender             0
Hx Radiothreapy    0
Focality           0
Risk               0
T                  0
N                  0
M                  0
Stage              0
Response           0
Recurred           0
dtype: int64

In [57]:
df.tail()

Unnamed: 0,Age,Gender,Hx Radiothreapy,Focality,Risk,T,N,M,Stage,Response,Recurred
196,-1.248243,0,0,0,0.0,3.0,0.0,0,1.0,1.0,0
197,-0.652797,0,0,0,0.0,3.0,0.0,0,1.0,1.0,0
198,1.795148,0,0,0,0.0,3.0,0.0,0,1.0,1.0,0
199,1.06738,1,0,0,0.0,3.0,0.0,0,1.0,1.0,0
222,-0.388154,0,0,0,0.0,3.0,0.0,0,1.0,0.0,0


In [58]:
X=df.drop('Response',axis=1) ## Independent Feature
X

Unnamed: 0,Age,Gender,Hx Radiothreapy,Focality,Risk,T,N,M,Stage,Recurred
0,-0.917439,0,0,0,0.0,1.0,0.0,0,1.0,0
1,-0.454315,0,0,0,0.0,1.0,0.0,0,1.0,0
2,-0.718957,0,0,0,0.0,1.0,0.0,0,1.0,0
3,1.398184,0,0,0,0.0,1.0,0.0,0,1.0,0
4,1.398184,0,0,1,0.0,1.0,0.0,0,1.0,0
...,...,...,...,...,...,...,...,...,...,...
196,-1.248243,0,0,0,0.0,3.0,0.0,0,1.0,0
197,-0.652797,0,0,0,0.0,3.0,0.0,0,1.0,0
198,1.795148,0,0,0,0.0,3.0,0.0,0,1.0,0
199,1.067380,1,0,0,0.0,3.0,0.0,0,1.0,0


In [59]:
y=df['Response'] ## Dependent Features
y.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: Response, dtype: float64

In [60]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [61]:
X_train

Unnamed: 0,Age,Gender,Hx Radiothreapy,Focality,Risk,T,N,M,Stage,Recurred
135,-1.380564,0,0,0,0.0,3.0,0.0,0,1.0,0
148,-0.520475,0,0,0,0.0,3.0,0.0,0,1.0,0
164,-0.785118,0,0,1,0.0,3.0,0.0,0,1.0,0
31,-0.718957,0,0,0,0.0,1.0,0.0,0,1.0,0
12,0.538095,0,0,0,0.0,1.0,0.0,0,1.0,0
...,...,...,...,...,...,...,...,...,...,...
116,-0.520475,0,0,0,0.0,3.0,0.0,0,1.0,0
14,2.324433,0,0,0,0.0,1.0,0.0,0,1.0,0
101,0.074971,0,0,0,0.0,3.0,0.0,0,1.0,0
194,-0.983600,0,0,0,0.0,3.0,0.0,0,1.0,0


In [62]:
logistic=LogisticRegression()
logistic.fit(X_train,y_train)

In [63]:
y_pred=logistic.predict(X_test)
y_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1.])

In [79]:
score=accuracy_score(y_pred,y_test)
score

0.9473684210526315

In [80]:
cm=confusion_matrix(y_pred,y_test)
cm

array([[ 0,  0],
       [ 2, 36]])

In [81]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.95      0.97        38

    accuracy                           0.95        38
   macro avg       0.50      0.47      0.49        38
weighted avg       1.00      0.95      0.97        38



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [82]:
model=LogisticRegression()
penalty=['l1','l2','elasticnet']
c_values=[0.1,1,10,100]
solver=['newton-cg','liblinear','sag','saga']
params=dict(penalty=penalty,C=c_values,solver=solver)

In [85]:
print(model.fit(X_train,y_train))

LogisticRegression()


In [86]:
print(model.predict(X_test))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [68]:
randomcv=RandomizedSearchCV(estimator=model,param_distributions=params,cv=10,scoring='accuracy')

In [69]:
randomcv.fit(X_train,y_train)

30 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\furkh\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\furkh\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\furkh\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(se

In [70]:
randomcv.best_params_

{'solver': 'liblinear', 'penalty': 'l2', 'C': 0.1}

In [71]:
randomcv.best_score_

np.float64(0.8790476190476191)

In [72]:
grid=GridSearchCV(estimator=model,param_grid=params,cv=5,scoring='accuracy')

In [73]:
grid.fit(X_train,y_train)

120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\furkh\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\furkh\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\furkh\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(s

In [74]:
grid.best_score_

np.float64(0.8786206896551725)

In [75]:
grid.best_params_

{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

In [78]:
y_pred = logistic.predict(X_test)
y_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1.])

In [37]:
pickle.dump(logistic,open('logistic.pkl','wb'))