In [80]:
# Install necessary packages
!pip install gradio pandas joblib scikit-learn imbalanced-learn xgboost

# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# Load dataset
file_path = "/content/drive/My Drive/Colab Notebooks/Lung_Cancer_Cleaned.csv"
df = pd.read_csv(file_path)

# Encode categorical variables
label_encoder = LabelEncoder()
df["GENDER"] = label_encoder.fit_transform(df["GENDER"])  # Male = 1, Female = 0
df["LUNG_CANCER"] = label_encoder.fit_transform(df["LUNG_CANCER"])

# Scale numerical features
scaler = StandardScaler()
df[["AGE"]] = scaler.fit_transform(df[["AGE"]])

# Save the scaler for later use
scaler_path = "/content/drive/My Drive/Colab Notebooks/scaler.pkl"
joblib.dump(scaler, scaler_path)

# Define features and target
X = df.drop(columns=["LUNG_CANCER"])
y = df["LUNG_CANCER"]

# Check Original Class Distribution
print("Original Class Distribution:")
print(y.value_counts())

# Apply SMOTE to balance data
smote = SMOTE(sampling_strategy=0.6, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check Class Distribution After SMOTE
print("\nAfter SMOTE:")
print(pd.Series(y_resampled).value_counts())

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train XGBoost classifier with better hyperparameters
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    learning_rate=0.1,
    n_estimators=150,
    max_depth=5
)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
y_prob_test = model.predict_proba(X_test)[:, 1]

print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save model
model_path = "/content/drive/My Drive/Colab Notebooks/lung_cancer_best_model.pkl"
joblib.dump(model, model_path)
print("\nModel saved successfully!")

# Load the trained model and scaler
if os.path.exists(model_path):
    model = joblib.load(model_path)
    print("Model loaded successfully!")
else:
    raise FileNotFoundError("Error: Model file not found!")

if os.path.exists(scaler_path):
    scaler = joblib.load(scaler_path)
    print("Scaler loaded successfully!")
else:
    raise FileNotFoundError("Error: Scaler file not found!")

# Define CSV file path to save user inputs
csv_file_path = "/content/lung_cancer_predictions.csv"

# Find the best threshold using Precision-Recall Curve
precision, recall, thresholds = precision_recall_curve(y_test, y_prob_test)

# Select threshold where precision & recall balance
optimal_threshold = thresholds[np.argmax(precision * recall)]
print(f"\nOptimal Decision Threshold: {optimal_threshold:.2f}")

# Define CSV file path for Tableau
tableau_csv_path = "/content/drive/My Drive/Colab Notebooks/lung_cancer_predictions_tableau.csv"

def predict_lung_cancer(gender, age, smoking, yellow_fingers, anxiety, peer_pressure, chronic_disease,
                        fatigue, allergy, wheezing, alcohol_consuming, coughing, shortness_of_breath,
                        swallowing_difficulty, chest_pain):
    try:
        # Encode gender
        gender_encoded = 1 if gender == "Male" else 0

        # Scale age
        age_scaled = scaler.transform([[age]])[0][0]

        # Prepare input data
        input_data = np.array([[gender_encoded, age_scaled, smoking, yellow_fingers, anxiety, peer_pressure,
                                chronic_disease, fatigue, allergy, wheezing, alcohol_consuming,
                                coughing, shortness_of_breath, swallowing_difficulty, chest_pain]])

        # Predict probability
        probability_yes = model.predict_proba(input_data)[0][1]

        # Apply optimal threshold
        prediction = 1 if probability_yes >= optimal_threshold else 0

        # **✅ Print prediction results before saving**
        print(f"Prediction: {'YES' if prediction == 1 else 'NO'}, Confidence: {probability_yes:.2f}")

        # Prediction output
        result = f"Lung Cancer: {'YES' if prediction == 1 else 'NO'} (Confidence: {probability_yes:.2f})"

        # Save results for Tableau
        data_dict = {
            "Gender": [gender], "Age": [age], "Smoking": [smoking], "Yellow Fingers": [yellow_fingers],
            "Anxiety": [anxiety], "Peer Pressure": [peer_pressure], "Chronic Disease": [chronic_disease],
            "Fatigue": [fatigue], "Allergy": [allergy], "Wheezing": [wheezing], "Alcohol Consuming": [alcohol_consuming],
            "Coughing": [coughing], "Shortness of Breath": [shortness_of_breath],
            "Swallowing Difficulty": [swallowing_difficulty], "Chest Pain": [chest_pain],
            "Prediction": ["YES" if prediction == 1 else "NO"],
            "Confidence Score": [probability_yes]
        }

        df_tableau = pd.DataFrame(data_dict)

        # Append data to CSV (Fixing the issue by ensuring the file is properly created)
        if os.path.exists(tableau_csv_path):
            df_tableau.to_csv(tableau_csv_path, mode='a', header=False, index=False)
        else:
            df_tableau.to_csv(tableau_csv_path, mode='w', header=True, index=False)
            print("Tableau CSV created successfully!")

        return result

    except Exception as e:
        return f"Error: {str(e)}"


# Create Gradio Interface
import gradio as gr

interface = gr.Interface(
    fn=predict_lung_cancer,
    inputs=[
        gr.Radio(["Male", "Female"], label="Gender"),
        gr.Slider(10, 100, step=1, label="Age"),
        gr.Radio([0, 1], label="Smoking"),
        gr.Radio([0, 1], label="Yellow Fingers"),
        gr.Radio([0, 1], label="Anxiety"),
        gr.Radio([0, 1], label="Peer Pressure"),
        gr.Radio([0, 1], label="Chronic Disease"),
        gr.Radio([0, 1], label="Fatigue"),
        gr.Radio([0, 1], label="Allergy"),
        gr.Radio([0, 1], label="Wheezing"),
        gr.Radio([0, 1], label="Alcohol Consuming"),
        gr.Radio([0, 1], label="Coughing"),
        gr.Radio([0, 1], label="Shortness of Breath"),
        gr.Radio([0, 1], label="Swallowing Difficulty"),
        gr.Radio([0, 1], label="Chest Pain"),
    ],
    outputs="text",
    title="Lung Cancer Prediction",
    description="Enter the symptoms and risk factors to predict lung cancer."
)

# Launch the Gradio app
interface.launch(share=True)

Original Class Distribution:
LUNG_CANCER
1    270
0     39
Name: count, dtype: int64

After SMOTE:
LUNG_CANCER
1    270
0    162
Name: count, dtype: int64

Model Accuracy: 0.9540229885057471
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        35
           1       0.96      0.96      0.96        52

    accuracy                           0.95        87
   macro avg       0.95      0.95      0.95        87
weighted avg       0.95      0.95      0.95        87



Parameters: { "use_label_encoder" } are not used.




Model saved successfully!
Model loaded successfully!
Scaler loaded successfully!

Optimal Decision Threshold: 0.38
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4e2528387197e10bcd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [81]:
import os

file_path = "/content/drive/My Drive/Colab Notebooks/lung_cancer_predictions_tableau.csv"

if os.path.exists(file_path):
    print("✅ File exists:", file_path)
else:
    print("❌ File NOT found!")

✅ File exists: /content/drive/My Drive/Colab Notebooks/lung_cancer_predictions_tableau.csv
