In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import missingno as msno  
from scipy.stats import chi2_contingency, shapiro
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import joblib  
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point is suppressed, including deprecation
# notices from the libraries imported above.
warnings.filterwarnings("ignore")


In [None]:
# Read the raw dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="lung_cancer_data.csv")
df.head(n=5)

In [None]:
# Display dataset information (column dtypes and non-null counts)
df.info()

# Check for missing values
print("Missing Values in Each Column:")
print(df.isnull().sum())

# Visualize missing values
msno.matrix(df)
plt.title("Missing Values Visualization")
plt.show()

# Handle missing values using median imputation.
# numeric_only=True is required here: the frame still contains the string
# columns that are one-hot encoded in a later cell, and on pandas >= 2.0
# DataFrame.median() raises a TypeError when asked to reduce them.
# Only the numeric columns receive a fill value; missing entries in
# non-numeric columns are left as they are.
numeric_medians = df.median(numeric_only=True)
df_cleaned = df.fillna(numeric_medians)

print("Data cleaning completed successfully!")


In [None]:
# Replace each categorical column with indicator columns. drop_first=True
# omits the first level of every variable, so k levels become k-1 columns.
# Note: this rebinds df_cleaned, so the cell cannot be re-run on its own
# output (the source columns no longer exist afterwards).
categorical_cols = [
    "Gender",
    "Smoking History",
    "Ethnicity",
]
df_cleaned = pd.get_dummies(
    data=df_cleaned,
    columns=categorical_cols,
    drop_first=True,
)

print("Categorical variables encoded successfully!")


In [None]:
# Summary statistics of the cleaned frame, shown as the cell's rich output.
summary_stats = df_cleaned.describe()
summary_stats


In [None]:
# Bar chart of row counts per gender, taken from the raw (un-encoded) frame.
gender_ax = sns.countplot(data=df, x="Gender", palette="coolwarm")
gender_ax.set_title("Gender Distribution")
plt.show()

# Histogram of the Age column split into 20 bins.
age_fig, age_ax = plt.subplots()
age_ax.hist(df["Age"], bins=20, color='skyblue', edgecolor='black')
age_ax.set_title("Age Distribution")
age_ax.set_xlabel("Age")
age_ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Heatmap of pairwise feature correlations.
# Restrict the input to numeric and boolean columns first: on pandas >= 2.0
# DataFrame.corr() raises if any non-numeric column is still present, whereas
# older versions dropped such columns silently. Selecting explicitly gives the
# same result on both.
numeric_features = df_cleaned.select_dtypes(include=["number", "bool"])

plt.figure(figsize=(10,8))
sns.heatmap(numeric_features.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()


In [None]:
# Chi-Square test for categorical variables
def chi_square_test(col1, col2, data=None):
    """Run a chi-square test of independence between two columns.

    Parameters
    ----------
    col1, col2 : str
        Names of the two columns to cross-tabulate.
    data : pandas.DataFrame, optional
        Frame holding both columns. When omitted, the notebook-level
        ``df_cleaned`` is used, which is the original behaviour.

    Returns
    -------
    str
        A one-line report containing the p-value of the test.
    """
    # Fall back to the global frame only when no frame is passed explicitly,
    # so the function no longer depends on hidden notebook state.
    frame = df_cleaned if data is None else data
    contingency_table = pd.crosstab(frame[col1], frame[col2])
    # Only the p-value is reported; statistic, dof and expected counts are unused.
    _, p, _, _ = chi2_contingency(contingency_table)
    return f"Chi-Square Test between {col1} and {col2}: p-value = {p}"

# Normality Test
def normality_test(column, data=None):
    """Run the Shapiro-Wilk normality test on one column.

    Parameters
    ----------
    column : str
        Name of the column to test.
    data : pandas.DataFrame, optional
        Frame holding the column. When omitted, the notebook-level
        ``df_cleaned`` is used, which is the original behaviour.

    Returns
    -------
    str
        A one-line report containing the p-value of the test.
    """
    # Fall back to the global frame only when no frame is passed explicitly.
    frame = df_cleaned if data is None else data
    # The test statistic is not reported, only the p-value.
    _, p = shapiro(frame[column])
    return f"Shapiro-Wilk Test for {column}: p-value = {p}"

# Association between the encoded smoking indicator and the target column.
smoking_vs_cancer_report = chi_square_test("Smoking History_Yes", "Lung Cancer")
print(smoking_vs_cancer_report)

# Normality check on the Age column.
age_normality_report = normality_test("Age")
print(age_normality_report)

In [None]:
def get_patient_details(patient_id, data=None):
    """Look up the rows whose "Patient ID" equals ``patient_id``.

    Parameters
    ----------
    patient_id
        Value compared for equality against the "Patient ID" column.
    data : pandas.DataFrame, optional
        Frame to search. When omitted, the notebook-level ``df_cleaned`` is
        used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame or str
        The matching rows, or the string "Patient ID not found." when no row
        matches.
    """
    # Fall back to the global frame only when no frame is passed explicitly.
    frame = df_cleaned if data is None else data
    patient = frame[frame["Patient ID"] == patient_id]
    if patient.empty:
        return "Patient ID not found."
    return patient

# Ask for an ID on stdin, convert it to an integer and show the lookup result.
# int() raises ValueError if the typed text is not a whole number.
raw_patient_id = input("Enter Patient ID: ")
target_id = int(raw_patient_id)
lookup_result = get_patient_details(target_id)
print(lookup_result)

In [None]:
# Predictors: four clinical columns plus every column whose name starts with
# "Smoking History" (the indicator columns produced by the encoding cell).
base_features = ["Age", "Diabetes", "Kidney Disease", "Haemoglobin Level"]
smoking_mask = df_cleaned.columns.str.startswith("Smoking History")
features = base_features + df_cleaned.columns[smoking_mask].tolist()

X = df_cleaned[features]
y = df_cleaned["Lung Cancer"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features, then fit a 100-tree gradient boosting classifier.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42)),
]
pipeline = Pipeline(pipeline_steps)
pipeline.fit(X_train, y_train)

# Score the held-out rows.
y_pred = pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

print("Classification Report:")
test_report = classification_report(y_test, y_pred)
print(test_report)

print("Confusion Matrix:")
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

In [None]:
# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# X_train / X_test are the raw feature frames: the StandardScaler used earlier
# lives inside the sklearn pipeline and never modified them. Standardise here
# (fit on the training split only) and cast to float32, because
#   * unscaled inputs with very different ranges slow down / destabilise
#     gradient-based training, and
#   * a DataFrame mixing boolean indicator columns with numeric columns
#     converts to an object array that TensorFlow cannot turn into a tensor.
nn_scaler = StandardScaler()
X_train_nn = nn_scaler.fit_transform(X_train).astype("float32")
X_test_nn = nn_scaler.transform(X_test).astype("float32")
y_train_nn = np.asarray(y_train, dtype="float32")
y_test_nn = np.asarray(y_test, dtype="float32")

# Two hidden layers with dropout and a single sigmoid unit for the binary target.
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_nn.shape[1],)),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the held-out test split doubles as validation data. It is only
# used for per-epoch monitoring here (no callbacks / early stopping), but a
# separate validation split would be needed before tuning against it.
model.fit(
    X_train_nn,
    y_train_nn,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_nn, y_test_nn),
)


In [None]:
# Write the fitted sklearn pipeline to disk.
model_path = "lung_cancer_model.pkl"
joblib.dump(pipeline, model_path)
print("Model saved successfully!")

# Read it back. joblib.load unpickles the file, so only load files you trust.
loaded_model = joblib.load(model_path)

# Predict with the reloaded pipeline and confirm it scores the test split.
y_pred_loaded = loaded_model.predict(X_test)
loaded_accuracy = accuracy_score(y_test, y_pred_loaded)
print("Loaded Model Accuracy:", loaded_accuracy)


In [None]:
# Second column of predict_proba: the probability assigned to the second class.
class_probabilities = pipeline.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# ROC points and the area under the curve.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Model curve plus the diagonal reference line of a random classifier.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='blue', label=f'Gradient Boosting (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
plt.show()


In [3]:
# Export the cleaned, encoded frame without the index column.
# df_cleaned is created by the cleaning cells above; running this cell in a
# fresh kernel before them fails with a NameError.
df_cleaned.to_csv(
    path_or_buf="cleaned_lung_cancer_data.csv",
    index=False,
)
print("Cleaned data saved successfully!")


NameError: name 'df_cleaned' is not defined