In [10]:
# Install necessary packages
!pip install pandas joblib scikit-learn imbalanced-learn xgboost

# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve
from imblearn.over_sampling import SMOTE

# Load dataset
file_path = "/content/drive/My Drive/Colab Notebooks/Lung_Cancer_Cleaned.csv"
df = pd.read_csv(file_path)

# Encode categorical variables
label_encoder = LabelEncoder()
df["GENDER"] = label_encoder.fit_transform(df["GENDER"])
df["LUNG_CANCER"] = label_encoder.fit_transform(df["LUNG_CANCER"])

# Scale numerical features
scaler = StandardScaler()
df[["AGE"]] = scaler.fit_transform(df[["AGE"]])

# Save the scaler
scaler_path = "/content/drive/My Drive/Colab Notebooks/scaler.pkl"
joblib.dump(scaler, scaler_path)

# Define features and target
X = df.drop(columns=["LUNG_CANCER"])
y = df["LUNG_CANCER"]

# Check Original Class Distribution
print("Original Class Distribution:")
print(y.value_counts())

# Apply SMOTE to balance data
smote = SMOTE(sampling_strategy=0.6, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check Class Distribution After SMOTE
print("\nAfter SMOTE:")
print(pd.Series(y_resampled).value_counts())

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train & Compare Models
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "SVM": SVC(probability=True, random_state=42)
}

best_model = None
best_accuracy = 0
best_model_name = ""

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Evaluate model
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # Save the best model
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        best_model_name = name

# Save the best model
best_model_path = "/content/drive/My Drive/Colab Notebooks/lung_cancer_best_model.pkl"
joblib.dump(best_model, best_model_path)
print(f"\nBest Model ({best_model_name}) saved at: {best_model_path} with Accuracy: {best_accuracy:.4f}")

Original Class Distribution:
LUNG_CANCER
1    270
0     39
Name: count, dtype: int64

After SMOTE:
LUNG_CANCER
1    270
0    162
Name: count, dtype: int64

Training Logistic Regression...
Logistic Regression Accuracy: 0.9425
              precision    recall  f1-score   support

           0       0.94      0.91      0.93        35
           1       0.94      0.96      0.95        52

    accuracy                           0.94        87
   macro avg       0.94      0.94      0.94        87
weighted avg       0.94      0.94      0.94        87


Training Random Forest...
Random Forest Accuracy: 0.9425
              precision    recall  f1-score   support

           0       0.92      0.94      0.93        35
           1       0.96      0.94      0.95        52

    accuracy                           0.94        87
   macro avg       0.94      0.94      0.94        87
weighted avg       0.94      0.94      0.94        87


Training XGBoost...
XGBoost Accuracy: 0.9540
              pre

Parameters: { "use_label_encoder" } are not used.

