Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import plot_tree
from sklearn.decomposition import PCA

from sklearn.metrics import (
    confusion_matrix, classification_report, accuracy_score,
    precision_score, recall_score, f1_score, roc_curve, auc
)

: 

Loading Dataset

In [None]:
# Read the CSV and normalise the header names: surrounding whitespace removed,
# lower-cased, inner spaces replaced by underscores.
df = pd.read_csv("LungCancer.csv").rename(
    columns=lambda name: name.strip().lower().replace(" ", "_")
)

# Encode the two categorical columns as 0/1 integers.
# NOTE(review): Series.map turns any value that is not a key of the dict into
# NaN without raising — confirm the file only contains these exact labels.
df = df.assign(
    gender=df["gender"].map({'M': 1, 'F': 0}),
    lung_cancer=df["lung_cancer"].map({'YES': 1, 'NO': 0}),
)

Exploratory Data Analysis (EDA)

In [None]:
# Two count plots side by side: gender on the left, target class on the right.
fig, (gender_ax, target_ax) = plt.subplots(1, 2, figsize=(12, 5))

sns.countplot(data=df, x="gender", ax=gender_ax)
gender_ax.set_title("Gender Distribution")

sns.countplot(data=df, x="lung_cancer", ax=target_ax)
target_ax.set_title("Lung Cancer Class Distribution")

fig.tight_layout()
plt.show()

# Same class counts as the right-hand panel, printed as numbers.
print("Class distribution:\n", df["lung_cancer"].value_counts())

Train/Test Split and Feature Correlation

In [None]:
# Separate the target column from the predictors.
y = df["lung_cancer"]
X = df.drop(columns=["lung_cancer"])

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y — if the classes are
# imbalanced, consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=40
)

print(f"Train/Test sizes: {X_train.shape}, {X_test.shape}")

# Pairwise correlations over every column of df (target included).
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


Helper Function

In [None]:
def compute_metrics(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    dict
        Maps "Accuracy", "Precision", "Recall" and "F1 Score" (in that
        order) to the value returned by the matching scikit-learn scorer,
        each called with its default arguments.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    return {label: scorer(y_test, y_pred) for label, scorer in scorers}


Initializing Dictionaries

In [None]:
# Accumulators filled in by the model cells, each keyed by model name.
ml_metrics = dict()    # metrics dict returned by compute_metrics
fpr_dict = dict()      # false-positive rates from roc_curve
tpr_dict = dict()      # true-positive rates from roc_curve
roc_auc_dict = dict()  # area under the ROC curve


Logistic Regression

In [None]:
# Fit logistic regression on the training split and predict on the held-out split.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_prob = lr.predict_proba(X_test)[:, 1]  # second predict_proba column

# Store the headline metrics and ROC data under the model's key.
lr_metrics = compute_metrics(y_test, lr_pred)
ml_metrics["Logistic Regression"] = lr_metrics

lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_prob)
fpr_dict["Logistic Regression"] = lr_fpr
tpr_dict["Logistic Regression"] = lr_tpr
roc_auc_dict["Logistic Regression"] = auc(lr_fpr, lr_tpr)

print("\n--- Logistic Regression ---")
print(classification_report(y_test, lr_pred))

ROC Curve for Logistic Regression

In [None]:
# ROC curve for logistic regression, drawn from the values stored in
# fpr_dict / tpr_dict / roc_auc_dict.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(
    fpr_dict["Logistic Regression"],
    tpr_dict["Logistic Regression"],
    color='darkorange',
    lw=2,
    label=f"ROC curve (AUC = {roc_auc_dict['Logistic Regression']:.2f})",
)
# Dashed diagonal for visual reference.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Logistic Regression")
ax.legend(loc="lower right")
ax.grid(True)
fig.tight_layout()
plt.show()


Actual vs Predicted (Logistic Regression)

In [None]:
# Overlay true and predicted labels for each test row, in test-set order.
sample_index = range(len(y_test))

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(sample_index, list(y_test), label="Actual", linestyle='--', color='blue', alpha=0.7)
ax.plot(sample_index, list(lr_pred), label="Predicted", linestyle='-', color='red', alpha=0.7)
ax.set_title("Logistic Regression - Actual vs Predicted")
ax.set_xlabel("Sample Index")
ax.set_ylabel("Class")
ax.legend()
fig.tight_layout()
plt.show()

Random Forest Classifier

In [None]:
# 100-tree random forest with a fixed seed, fitted on the training split.
rf = RandomForestClassifier(n_estimators=100, random_state=40)
rf.fit(X_train, y_train)

# Hard predictions for the metrics, second predict_proba column for the ROC curve.
rf_pred, rf_prob = rf.predict(X_test), rf.predict_proba(X_test)[:, 1]

ml_metrics["Random Forest"] = rf_metrics = compute_metrics(y_test, rf_pred)

rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_prob)
fpr_dict["Random Forest"] = rf_fpr
tpr_dict["Random Forest"] = rf_tpr
roc_auc_dict["Random Forest"] = auc(rf_fpr, rf_tpr)

print("\n--- Random Forest ---")
print(classification_report(y_test, rf_pred))

Visualizing a Single Decision Tree from the Random Forest

In [None]:
# Draw only the first of the forest's fitted trees (rf.estimators_[0]).
plt.figure(figsize=(20, 10))
plot_tree(
    rf.estimators_[0],
    filled=True,
    # Pass a plain list: scikit-learn releases with strict parameter
    # validation reject a pandas Index here with InvalidParameterError.
    feature_names=list(X.columns),
    class_names=['No', 'Yes'],
    rounded=True,
    fontsize=10,
)
plt.title("Random Forest Classifier - Decision Tree Visualization")
plt.show()

Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the training split.
nb = GaussianNB().fit(X_train, y_train)
nb_pred = nb.predict(X_test)
nb_prob = nb.predict_proba(X_test)[:, 1]  # second predict_proba column

# Store metrics and ROC data under the model's key.
nb_metrics = compute_metrics(y_test, nb_pred)
ml_metrics["Naive Bayes"] = nb_metrics

nb_fpr, nb_tpr, _ = roc_curve(y_test, nb_prob)
fpr_dict["Naive Bayes"] = nb_fpr
tpr_dict["Naive Bayes"] = nb_tpr
roc_auc_dict["Naive Bayes"] = auc(nb_fpr, nb_tpr)

print("\n--- Naive Bayes ---")
print(classification_report(y_test, nb_pred))

K-Nearest Neighbors (KNN)

In [None]:
# 3-nearest-neighbour classifier with distance-weighted votes.
# NOTE(review): the features are passed in unscaled; if the columns have very
# different ranges, the widest ones will dominate the distance — confirm
# whether scaling is wanted.
knn = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(X_train, y_train)
knn_pred, knn_prob = knn.predict(X_test), knn.predict_proba(X_test)[:, 1]

ml_metrics["KNN"] = knn_metrics = compute_metrics(y_test, knn_pred)

knn_fpr, knn_tpr, _ = roc_curve(y_test, knn_prob)
fpr_dict["KNN"] = knn_fpr
tpr_dict["KNN"] = knn_tpr
roc_auc_dict["KNN"] = auc(knn_fpr, knn_tpr)

print("\n--- KNN ---")
print(classification_report(y_test, knn_pred))

KNN Visualization using PCA

In [None]:
# Project the full feature matrix onto its first two principal components.
# NOTE(review): this cell colours points by the true labels in y and never
# calls the fitted knn model, despite the plot title.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
first_pc, second_pc = X_pca[:, 0], X_pca[:, 1]

fig, ax = plt.subplots(figsize=(8, 5))
points = ax.scatter(first_pc, second_pc, c=y, cmap='coolwarm', edgecolor='k', alpha=0.7)
ax.set_title("KNN - PCA Visualization of Classes")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
ax.grid(True)
fig.colorbar(points, ax=ax, label='Lung Cancer (0=No, 1=Yes)')
fig.tight_layout()
plt.show()

Support Vector Machine Classifier

In [None]:
# Linear-kernel SVM with probability estimates enabled for the ROC curve.
# With probability=True, scikit-learn calibrates the probabilities through an
# internal cross-validation that shuffles the data, so without a seed the
# probabilities (and the ROC/AUC built from them) change from run to run.
# random_state=40 matches the seed used for the split and the random forest.
svm = SVC(kernel='linear', C=0.5, probability=True, random_state=40)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
svm_prob = svm.predict_proba(X_test)[:, 1]  # second predict_proba column

# Store metrics and ROC data under the model's key.
svm_metrics = compute_metrics(y_test, svm_pred)
ml_metrics["SVM"] = svm_metrics
fpr_dict["SVM"], tpr_dict["SVM"], _ = roc_curve(y_test, svm_prob)
roc_auc_dict["SVM"] = auc(fpr_dict["SVM"], tpr_dict["SVM"])

print("\n--- Support Vector Machine ---")
print(classification_report(y_test, svm_pred))


Confusion Matrices 

In [None]:
# One seaborn colour map per model so the heatmaps are easy to tell apart.
color_maps = {
    'Logistic Regression': 'Blues',
    'Random Forest': 'YlOrRd',
    'Naive Bayes': 'GnBu',
    'KNN': 'PuBu',
    'SVM': 'BuGn'
}
models = {'Logistic Regression': lr, 'Random Forest': rf, 'Naive Bayes': nb, 'KNN': knn, 'SVM': svm}

# Fix the class order explicitly. Without `labels`, confusion_matrix only
# includes classes that occur in y_test or the predictions, so a class missing
# from both would shrink the matrix and no longer line up with the two tick
# labels below. 0/1 are the codes assigned to NO/YES when the data is loaded.
class_codes = [0, 1]
class_names = ['No', 'Yes']

for model_name, model in models.items():
    pred = model.predict(X_test)
    cm = confusion_matrix(y_test, pred, labels=class_codes)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap=color_maps[model_name], cbar=False,
                xticklabels=class_names, yticklabels=class_names)
    plt.title(f"Confusion Matrix - {model_name}")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()