Fathima Afra
W2053240

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, RocCurveDisplay,precision_score,recall_score,f1_score
import matplotlib.pyplot as plt


In [2]:

# Setting display options for better output readability
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)


In [3]:

# Let's assume the cleaned data is stored in a file named 'cleaned_cancer_data.csv'
# Adjust the file path according to your setup
data = pd.read_csv('/content/classification_data_prepared (11).csv')

In [4]:
# Display the first 15 rows to understand the data
print("First 15 rows of the dataset:")
print(data.head(15))

First 15 rows of the dataset:
         Age   T_Stage   N_Stage  6th_Stage  Differentiated     Grade   A_Stage  Tumor_Size  Estrogen_Status  Progesterone_Status  Survival_Months  \
0   1.559995 -1.025665 -0.631802  -1.043772        0.304924  1.330750  0.151484   -1.427014         0.268081             0.457741        -0.492364   
1  -0.444285  0.281919  0.811700   0.536758       -0.679271 -0.236309  0.151484    0.321410         0.268081             0.457741        -0.405136   
2   0.446506  1.589504  2.255203   2.117288       -0.679271 -0.236309  0.151484    1.900631         0.268081             0.457741         0.161846   
3   0.446506 -1.025665 -0.631802  -1.043772        0.304924  1.330750  0.151484   -0.637403         0.268081             0.457741         0.554372   
4  -0.778332  0.281919 -0.631802  -0.253507        0.304924  1.330750  0.151484    0.659814         0.268081             0.457741        -0.928504   
5  -0.332936 -1.025665 -0.631802  -1.043772       -0.679271 -0.236309 

In [5]:
# Display information about the dataset
print("\nDataset Information:")
data.info()



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4012 entries, 0 to 4011
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    4012 non-null   float64
 1   T_Stage                4012 non-null   float64
 2   N_Stage                4012 non-null   float64
 3   6th_Stage              4012 non-null   float64
 4   Differentiated         4012 non-null   float64
 5   Grade                  4012 non-null   float64
 6   A_Stage                4012 non-null   float64
 7   Tumor_Size             4012 non-null   float64
 8   Estrogen_Status        4012 non-null   float64
 9   Progesterone_Status    4012 non-null   float64
 10  Survival_Months        4012 non-null   float64
 11  Mortality_Status       4003 non-null   float64
 12  Node_Positivity_Ratio  4012 non-null   float64
dtypes: float64(13)
memory usage: 407.6 KB


In [6]:
# Define features and target variable
# Based on your screenshot, these are the columns to use
feature_cols = ['Age', 'T_Stage', 'N_Stage', '6th_Stage', 'Differentiated', 'Grade',
                'A_Stage', 'Tumor_Size', 'Estrogen_Status', 'Progesterone_Status',
                'Node_Positivity_Ratio', 'Survival_Months']

In [7]:
X = data[feature_cols]  # Features
y = data['Mortality_Status']  # Target variable


In [8]:
print("\nFeature shape:", X.shape)
print("Target shape:", y.shape)


Feature shape: (4012, 12)
Target shape: (4012,)


In [9]:
# Check target variable distribution
print("\nTarget variable distribution:")
print(y.value_counts())
print(y.value_counts(normalize=True).round(3)*100, "% of data")
y = y.fillna(y.mode()[0])
# Split the data into training and testing sets
# Using stratified sampling to maintain the same class distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


Target variable distribution:
Mortality_Status
0.0    3397
1.0     606
Name: count, dtype: int64
Mortality_Status
0.0    84.9
1.0    15.1
Name: proportion, dtype: float64 % of data


In [10]:
print("\nTraining set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (3209, 12)
Testing set shape: (803, 12)


In [11]:
# Check if the stratification maintained the class distribution
print("\nTraining set target distribution:")
print(y_train.value_counts(normalize=True).round(3)*100, "% of data")
print("\nTesting set target distribution:")
print(y_test.value_counts(normalize=True).round(3)*100, "% of data")



Training set target distribution:
Mortality_Status
0.0    84.9
1.0    15.1
Name: proportion, dtype: float64 % of data

Testing set target distribution:
Mortality_Status
0.0    84.9
1.0    15.1
Name: proportion, dtype: float64 % of data


In [12]:

# -------------------------------------------------------------
# 1. Logistic Regression Model
# -------------------------------------------------------------
print("\n\n" + "="*50)
print("LOGISTIC REGRESSION MODEL")
print("="*50)



LOGISTIC REGRESSION MODEL


In [13]:
# Initialize and train the logistic regression model
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

In [14]:
# Predictions on training and test sets
y_pred_train_logreg = logreg.predict(X_train)
y_pred_logreg = logreg.predict(X_test)

In [15]:
# Create a comparison DataFrame for test predictions
comparison_df_logreg = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_logreg})
print("\nSample of actual vs predicted values (Logistic Regression):")
print(comparison_df_logreg.head(10))


Sample of actual vs predicted values (Logistic Regression):
      Actual  Predicted
706      0.0        0.0
1247     0.0        0.0
185      0.0        0.0
2274     0.0        0.0
229      0.0        0.0
3242     0.0        0.0
21       0.0        0.0
2525     0.0        0.0
3428     0.0        0.0
1842     1.0        0.0


In [16]:
# Calculate accuracy on training set
accuracy_train_logreg = accuracy_score(y_train, y_pred_train_logreg)
print("\nLogistic Regression training accuracy:", accuracy_train_logreg)


Logistic Regression training accuracy: 0.8965409784979744


In [17]:
# Calculate accuracy on test set
accuracy_test_logreg = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression test accuracy:", accuracy_test_logreg)


Logistic Regression test accuracy: 0.9028642590286425


In [18]:
# Confusion Matrix
print("\nConfusion Matrix (Logistic Regression):")
cm_logreg = confusion_matrix(y_test, y_pred_logreg, labels=logreg.classes_)
print(cm_logreg)


Confusion Matrix (Logistic Regression):
[[668  14]
 [ 64  57]]


In [19]:
plt.figure(figsize=(8, 6))
disp_logreg = ConfusionMatrixDisplay(cm_logreg, display_labels=logreg.classes_)
disp_logreg.plot()
plt.title('Confusion Matrix - Logistic Regression')
plt.savefig('logreg_confusion_matrix.png')
plt.close()

<Figure size 800x600 with 0 Axes>

In [20]:
# Classification Report
print("\nClassification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_logreg))


Classification Report (Logistic Regression):
              precision    recall  f1-score   support

         0.0       0.91      0.98      0.94       682
         1.0       0.80      0.47      0.59       121

    accuracy                           0.90       803
   macro avg       0.86      0.73      0.77       803
weighted avg       0.90      0.90      0.89       803



In [21]:
# ROC Curve
plt.figure(figsize=(8, 6))
roc_logreg = RocCurveDisplay.from_estimator(logreg, X_test, y_test)
plt.title('ROC Curve - Logistic Regression')
plt.savefig('logreg_roc_curve.png')
plt.close()

<Figure size 800x600 with 0 Axes>

In [22]:

# -------------------------------------------------------------
# 2. Naive Bayes Model
# -------------------------------------------------------------
print("\n\n" + "="*50)
print("NAIVE BAYES MODEL")
print("="*50)



NAIVE BAYES MODEL


In [23]:
# Initialize and train the Naive Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)

In [24]:
# Predictions on training and test sets
y_pred_train_nb = nb.predict(X_train)
y_pred_nb = nb.predict(X_test)

In [25]:
# Create a comparison DataFrame for test predictions
comparison_df_nb = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_nb})
print("\nSample of actual vs predicted values (Naive Bayes):")
print(comparison_df_nb.head(10))


Sample of actual vs predicted values (Naive Bayes):
      Actual  Predicted
706      0.0        0.0
1247     0.0        0.0
185      0.0        0.0
2274     0.0        1.0
229      0.0        0.0
3242     0.0        0.0
21       0.0        0.0
2525     0.0        0.0
3428     0.0        0.0
1842     1.0        0.0


In [26]:

# Calculate accuracy on training set
accuracy_train_nb = accuracy_score(y_train, y_pred_train_nb)
print("\nNaive Bayes training accuracy:", accuracy_train_nb)


Naive Bayes training accuracy: 0.824555936428794


In [27]:
# Calculate accuracy on test set
accuracy_test_nb = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes test accuracy:", accuracy_test_nb)

Naive Bayes test accuracy: 0.8044831880448319


In [28]:

# Confusion Matrix
print("\nConfusion Matrix (Naive Bayes):")
cm_nb = confusion_matrix(y_test, y_pred_nb, labels=nb.classes_)
print(cm_nb)


Confusion Matrix (Naive Bayes):
[[588  94]
 [ 63  58]]


In [29]:
plt.figure(figsize=(8, 6))
disp_nb = ConfusionMatrixDisplay(cm_nb, display_labels=nb.classes_)
disp_nb.plot()
plt.title('Confusion Matrix - Naive Bayes')
plt.savefig('nb_confusion_matrix.png')
plt.close()

<Figure size 800x600 with 0 Axes>

In [30]:
# Classification Report
print("\nClassification Report (Naive Bayes):")
print(classification_report(y_test, y_pred_nb))


Classification Report (Naive Bayes):
              precision    recall  f1-score   support

         0.0       0.90      0.86      0.88       682
         1.0       0.38      0.48      0.42       121

    accuracy                           0.80       803
   macro avg       0.64      0.67      0.65       803
weighted avg       0.82      0.80      0.81       803



In [31]:

# ROC Curve
plt.figure(figsize=(8, 6))
roc_nb = RocCurveDisplay.from_estimator(nb, X_test, y_test)
plt.title('ROC Curve - Naive Bayes')
plt.savefig('nb_roc_curve.png')
plt.close()

<Figure size 800x600 with 0 Axes>

In [32]:
# -------------------------------------------------------------
# 3. K-Nearest Neighbors Model
# -------------------------------------------------------------
print("\n\n" + "="*50)
print("K-NEAREST NEIGHBORS MODEL")
print("="*50)




K-NEAREST NEIGHBORS MODEL


In [33]:
# Initialize and train the KNN model with a default K value
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)


In [34]:
# Predictions on training and test sets
y_pred_train_knn = knn.predict(X_train)
y_pred_knn = knn.predict(X_test)

In [35]:
# Create a comparison DataFrame for test predictions
comparison_df_knn = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_knn})
print("\nSample of actual vs predicted values (KNN):")
print(comparison_df_knn.head(10))


Sample of actual vs predicted values (KNN):
      Actual  Predicted
706      0.0        0.0
1247     0.0        0.0
185      0.0        0.0
2274     0.0        0.0
229      0.0        0.0
3242     0.0        0.0
21       0.0        0.0
2525     0.0        0.0
3428     0.0        0.0
1842     1.0        0.0


In [36]:
# Calculate accuracy on training set
accuracy_train_knn = accuracy_score(y_train, y_pred_train_knn)
print("\nKNN training accuracy:", accuracy_train_knn)


KNN training accuracy: 0.9040199439077594


In [37]:
# Calculate accuracy on test set
accuracy_test_knn = accuracy_score(y_test, y_pred_knn)
print("KNN test accuracy:", accuracy_test_knn)


KNN test accuracy: 0.8841843088418431


In [38]:
# Confusion Matrix
print("\nConfusion Matrix (KNN):")
cm_knn = confusion_matrix(y_test, y_pred_knn, labels=knn.classes_)
print(cm_knn)


Confusion Matrix (KNN):
[[665  17]
 [ 76  45]]


In [39]:
plt.figure(figsize=(8, 6))
disp_knn = ConfusionMatrixDisplay(cm_knn, display_labels=knn.classes_)
disp_knn.plot()
plt.title('Confusion Matrix - KNN')
plt.savefig('knn_confusion_matrix.png')
plt.close()


<Figure size 800x600 with 0 Axes>

In [40]:
# Classification Report
print("\nClassification Report (KNN):")
print(classification_report(y_test, y_pred_knn))


Classification Report (KNN):
              precision    recall  f1-score   support

         0.0       0.90      0.98      0.93       682
         1.0       0.73      0.37      0.49       121

    accuracy                           0.88       803
   macro avg       0.81      0.67      0.71       803
weighted avg       0.87      0.88      0.87       803



In [41]:
# ROC Curve
plt.figure(figsize=(8, 6))
roc_knn = RocCurveDisplay.from_estimator(knn, X_test, y_test)
plt.title('ROC Curve - KNN')
plt.savefig('knn_roc_curve.png')
plt.close()


<Figure size 800x600 with 0 Axes>

# -------------------------------------------------------------
# 4. Hyperparameter Tuning
# -------------------------------------------------------------


In [56]:
# First, evaluate the baseline model with default parameters
baseline_model = LogisticRegression(random_state=42)
baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test)
baseline_metrics = {
    'accuracy': accuracy_score(y_test, y_pred_baseline),
    'precision': precision_score(y_test, y_pred_baseline, average='weighted'),
    'recall': recall_score(y_test, y_pred_baseline, average='weighted'),
    'f1': f1_score(y_test, y_pred_baseline, average='weighted')
}
print("Baseline Logistic Regression metrics:")
for metric, value in baseline_metrics.items():
    print(f"{metric}: {value:.4f}")

Baseline Logistic Regression metrics:
accuracy: 0.9029
precision: 0.8960
recall: 0.9029
f1: 0.8919


In [59]:
# Define parameter grid



# Grid 1: liblinear solver
param_grid_liblinear = {
    'solver': ['liblinear'],
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000, 2000]
}

In [60]:
# Grid 2: lbfgs and newton-cg solvers
param_grid_lbfgs_newton = {
    'solver': ['lbfgs', 'newton-cg'],
    'penalty': ['l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000, 2000]
}

In [61]:
# Grid 3: saga solver
param_grid_saga = {
    'solver': ['saga'],
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000, 2000],
    'l1_ratio': [0.2, 0.5, 0.8]
}


In [62]:
# Use multiple scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted',
    'f1': 'f1_weighted'
}

In [63]:
# Run GridSearchCV for each parameter grid separately
results = []
for param_grid, grid_name in [
    (param_grid_liblinear, "liblinear"),
    (param_grid_lbfgs_newton, "lbfgs_newton"),
    (param_grid_saga, "saga")
]:
    print(f"\nRunning GridSearchCV for {grid_name} parameter grid...")



Running GridSearchCV for liblinear parameter grid...

Running GridSearchCV for lbfgs_newton parameter grid...

Running GridSearchCV for saga parameter grid...


In [65]:
 # Initialize GridSearchCV
logreg_gscv = GridSearchCV(
        LogisticRegression(random_state=42),
        param_grid,
        cv=5,
        scoring='f1_weighted',  # Use f1 as the primary metric
        verbose=1,
        n_jobs=-1  # Use all available cores
    )


In [68]:
 # Fit GridSearchCV
logreg_gscv.fit(X_train, y_train)

    # Store results
results.append({
        'grid_name': grid_name,
        'best_params': logreg_gscv.best_params_,
        'best_score': logreg_gscv.best_score_,
        'model': logreg_gscv.best_estimator_
    })

Fitting 5 folds for each of 336 candidates, totalling 1680 fits




In [70]:
 # Print results for this grid
print(f"Best parameters for {grid_name}:", logreg_gscv.best_params_)
print(f"Best CV f1_weighted score: {logreg_gscv.best_score_:.4f}")


Best parameters for saga: {'C': 1, 'class_weight': None, 'l1_ratio': 0.2, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
Best CV f1_weighted score: 0.8806


In [71]:
# Find the overall best model
best_model_info = max(results, key=lambda x: x['best_score'])
best_model = best_model_info['model']
print("\nOverall best model:")
print(f"Grid: {best_model_info['grid_name']}")
print(f"Parameters: {best_model_info['best_params']}")
print(f"CV f1_weighted score: {best_model_info['best_score']:.4f}")


Overall best model:
Grid: saga
Parameters: {'C': 1, 'class_weight': None, 'l1_ratio': 0.2, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
CV f1_weighted score: 0.8806


In [72]:

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)

In [73]:
# Evaluation metrics
print("\nTest Set Metrics for Best Model:")
test_accuracy = accuracy_score(y_test, y_pred_best)
test_precision = precision_score(y_test, y_pred_best, average='weighted')
test_recall = recall_score(y_test, y_pred_best, average='weighted')
test_f1 = f1_score(y_test, y_pred_best, average='weighted')

print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")


Test Set Metrics for Best Model:
Accuracy: 0.9041
Precision: 0.8979
Recall: 0.9041
F1 Score: 0.8930


In [74]:
# Confusion Matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred_best)
print(cm)


Confusion Matrix:
[[669  13]
 [ 64  57]]


In [75]:
# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))


Classification Report:
              precision    recall  f1-score   support

         0.0       0.91      0.98      0.95       682
         1.0       0.81      0.47      0.60       121

    accuracy                           0.90       803
   macro avg       0.86      0.73      0.77       803
weighted avg       0.90      0.90      0.89       803



Confusion Matrix:
[[669  13]
 [ 64  57]]
Accuracy: 0.9041


In [78]:
# Compare with baseline
print("\nComparison to Baseline:")
metrics = {
    'accuracy': [baseline_metrics['accuracy'], test_accuracy],
    'precision': [baseline_metrics['precision'], test_precision],
    'recall': [baseline_metrics['recall'], test_recall],
    'f1': [baseline_metrics['f1'], test_f1]
}

for metric, values in metrics.items():
    baseline_value, tuned_value = values
    diff = tuned_value - baseline_value
    print(f"{metric}: {baseline_value:.4f} → {tuned_value:.4f} (Δ: {diff:.4f})")


Comparison to Baseline:
accuracy: 0.9029 → 0.9041 (Δ: 0.0012)
precision: 0.8960 → 0.8979 (Δ: 0.0018)
recall: 0.9029 → 0.9041 (Δ: 0.0012)
f1: 0.8919 → 0.8930 (Δ: 0.0011)


In [79]:
import joblib

# Save the Logistic Regression model
joblib.dump(logreg, 'logistic_regression_model.joblib')
print("Logistic Regression model saved as 'logistic_regression_model.joblib'")

# Save the Logistic Regression tuned model
joblib.dump(logreg_gscv.best_estimator_, 'logistic_regression_tuned_model.joblib')
print("Tuned Logistic Regression model saved as 'logistic_regression_tuned_model.joblib'")

# Save the Naive Bayes model
joblib.dump(nb, 'KNN.joblib')
print("Naive Bayes model saved as 'naive_bayes_model.joblib'")




Logistic Regression model saved as 'logistic_regression_model.joblib'
Tuned Logistic Regression model saved as 'logistic_regression_tuned_model.joblib'
Naive Bayes model saved as 'naive_bayes_model.joblib'
