In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import RocCurveDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

In [3]:
df = pd.read_csv('/content/cleaned_dataset.csv')

In [4]:
df.head(10)

Unnamed: 0,Patient_ID,Month_of_Birth,Age,Sex,Occupation,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months,Mortality_Status
0,12,12,68.0,1,36,0,0,0,1,3,1,4.0,1,1,24.0,1,60,0
1,13,12,50.0,1,23,1,1,2,0,2,1,35.0,1,1,14.0,5,62,0
2,14,11,58.0,1,10,2,2,4,0,2,1,63.0,1,1,14.0,7,75,0
3,15,3,58.0,1,37,0,0,0,1,3,1,18.0,1,1,2.0,1,84,0
4,16,1,47.0,1,25,1,0,1,1,3,1,41.0,1,1,3.0,1,50,0
5,17,2,51.0,1,32,0,0,0,0,2,1,20.0,1,1,18.0,2,89,0
6,18,5,51.0,1,15,0,0,0,3,1,1,8.0,1,1,11.0,1,54,0
7,19,4,40.0,1,31,1,0,1,0,2,1,30.0,1,1,9.0,1,14,1
8,20,6,40.0,1,8,3,2,4,1,3,1,103.0,1,1,20.0,18,70,0
9,21,4,69.0,1,5,3,2,4,3,1,0,32.0,1,1,21.0,12,92,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Patient_ID              4024 non-null   int64  
 1   Month_of_Birth          4024 non-null   int64  
 2   Age                     4024 non-null   float64
 3   Sex                     4024 non-null   int64  
 4   Occupation              4024 non-null   int64  
 5   T_Stage                 4024 non-null   int64  
 6   N_Stage                 4024 non-null   int64  
 7   6th_Stage               4024 non-null   int64  
 8   Differentiated          4024 non-null   int64  
 9   Grade                   4024 non-null   int64  
 10  A_Stage                 4024 non-null   int64  
 11  Tumor_Size              4024 non-null   float64
 12  Estrogen_Status         4024 non-null   int64  
 13  Progesterone_Status     4024 non-null   int64  
 14  Regional_Node_Examined  4024 non-null   

In [6]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

First step towards building the Single Linear Regression (SLR) model is to instantiate it.

In [7]:
feature_cols = ['Month_of_Birth', 'Age', 'Sex', 'T_Stage','N_Stage', '6th_Stage', 'Differentiated', 'Grade', 'A_Stage', 'Tumor_Size', 'Estrogen_Status', 'Progesterone_Status', 'Regional_Node_Examined', 'Regional_Node_Positive', 'Survival_Months']
X = df[feature_cols] # Features
y = df['Mortality_Status']

KeyError: "['Regional_Node_Positive'] not in index"

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42, stratify=y)

In [None]:
 #Logistic Regression Model
 logreg = LogisticRegression()
 logreg.fit(X_train, y_train)

In [None]:
y_pred_train=logreg.predict(X_train)
y_pred = logreg.predict(X_test)

In [None]:
print("Logistic Regression Accuracy (Train):", accuracy_score(y_train, y_pred_train))
print("Logistic Regression Accuracy (Test):", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
cm = confusion_matrix(y_test, y_pred, labels = logreg.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels = logreg.classes_)
disp.plot()

In [None]:
logreg = RocCurveDisplay.from_estimator(logreg, X_test, y_test)

**KNN Model**

In [None]:
knn = KNeighborsClassifier(n_neighbors = 5)

In [None]:
knn.fit(X_train,y_train)

In [None]:
y_pred_knn = knn.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print ("The accuracy is: ", accuracy)

In [None]:
print(classification_report(y_test, y_pred))

**Naive Bayes Model**

In [None]:
nb = GaussianNB()
nb.fit(X_train, y_train)

In [None]:
y_pred_nb = nb.predict(X_test)

In [None]:
print("Naive Bayes Accuracy (Test):", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

In [None]:
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
logreg_gscv = GridSearchCV(LogisticRegression(), param_grid, cv=5)
logreg_gscv.fit(X_train, y_train)
print("Best Parameters for Logistic Regression:", logreg_gscv.best_params_)

In [None]:
y_pred_gscv = logreg_gscv.predict(X_test)
print("Optimized Logistic Regression Accuracy (Test):", accuracy_score(y_test, y_pred_gscv))
