In [2]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:

# Read the raw CSV and remove the 'Patient Id' and 'index' columns in one
# chained step (presumably row identifiers -- confirm against the dataset).
# NOTE(review): the path is absolute at the filesystem root, so it is likely
# specific to the environment this notebook was authored in.
df = pd.read_csv('/LungcancerDs.csv').drop(columns=['Patient Id', 'index'])

In [None]:

# Encode the target labels as ordered integer codes.
# The dtype guard keeps this cell safe to re-run: once 'Level' is numeric,
# mapping the string labels again would turn every value into NaN and the
# dropna() below would then remove every row.
level_codes = {'Low': 0, 'Medium': 1, 'High': 2}
if not pd.api.types.is_numeric_dtype(df['Level']):
    df['Level'] = df['Level'].map(level_codes)

# Coerce any remaining text columns to numbers; entries that cannot be parsed
# become NaN and are dropped together with unmapped target labels.
columns = df.columns[df.dtypes == 'object']
if len(columns) > 0:
    df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')
df = df.dropna()

# Feature matrix (every column except the target) and target vector.
X = df.drop('Level', axis=1)
y = df['Level']



In [None]:
# Count of missing entries in each column.
missing_values = df.isna().sum()
print(missing_values)

# Number of rows that are exact duplicates of an earlier row; left as the
# final expression so the notebook displays it.
n_duplicate_rows = df.duplicated().sum()
n_duplicate_rows

In [None]:
# Visualization: distribution of the first nine columns on a 3x3 grid.
# zip() stops after nine panels, so later columns are never visited.
fig, axes = plt.subplots(3, 3, figsize=(20, 25))
for ax, column in zip(axes.flat, df.columns):
    # histplot(kde=True, stat='density') is seaborn's replacement for the
    # deprecated distplot (density histogram with a KDE overlay).
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set_xlabel(column, fontsize=15)

# Hide panels that received no data when the frame has fewer than nine columns.
for ax in axes.flat[len(df.columns):]:
    ax.set_visible(False)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df.corr(), annot=True, ax=ax)
plt.show()

In [None]:
# One box plot per column, each rendered as its own figure.
for name, values in df.items():
    axis = sns.boxplot(values)
    axis.set_title(f"Box Plot for {name}")
    plt.show()

In [None]:
# Smallest value found in each column (reduction along the row axis).
overall_min = df.min(axis=0)
print(overall_min)


In [None]:
# Bar chart of the number of rows in each 'Level' class, with the count
# written at the top of every bar.
ploting = sns.countplot(x='Level', data=df)
for bar in ploting.patches:
    count = bar.get_height()
    ploting.text(bar.get_x() + bar.get_width() / 2,
                 count,
                 f'{count:.0f}',
                 ha='center', fontweight='bold')
ploting.set_title('Class Count', fontweight='bold')
# No legend is drawn: none of the bars carries a label, so plt.legend() would
# only emit a "no artists with labels" warning.
plt.show()

In [None]:
# Only encode 'Level' if it still holds string labels. Applying the
# label -> code mapping to an already-numeric column matches no key and would
# replace the entire column with NaN before plotting.
if not pd.api.types.is_numeric_dtype(df['Level']):
    df['Level'] = df['Level'].map({'Low': 0, 'Medium': 1, 'High': 2})
df.plot()

In [None]:
## plot the distribution

# Class balance after SMOTE oversampling. The resampled target is computed
# here, for display only, because no earlier cell defines y_resampled; the
# settings mirror the SMOTE step used inside the modelling pipeline.
_, y_resampled = SMOTE(k_neighbors=4, random_state=42).fit_resample(X, y)

counter = Counter(y_resampled)
print('After', counter)
plt.bar(list(counter.keys()), list(counter.values()))
# Loop names are deliberately not `x`/`y`: reusing `y` here would overwrite
# the global target vector that the modelling cell splits on.
for level, count in counter.items():
    plt.annotate(str(count), (level, count), ha='center', va='bottom')
plt.title('Class Count', fontweight='bold')
# No legend is drawn: the bars carry no labels, so plt.legend() would only warn.
plt.show()

In [None]:
# Univariate feature screening: keep the 9 columns with the highest
# f_classif (ANOVA F) score.
# The split uses the same arguments as the modelling cell, so both cells see
# identical partitions. Fitting on the training rows only keeps the held-out
# rows from influencing which features are chosen.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

selector = SelectKBest(f_classif, k=9)
selector.fit(x_train, y_train)
X_train = selector.transform(x_train)
print("Num Features before:", x_train.shape[1])
print("Num Features after:", X_train.shape[1])

# Boolean mask over X's columns -> names of the retained features.
mask = selector.get_support()
feature_names = X.columns[mask]

In [None]:

# Hold out 30% of the rows, stratified by class, for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# Model-based feature selection: keep the features whose random-forest
# importance is at least the median importance.
# SelectFromModel (prefit=False) fits its own clone of the estimator, so
# fitting fs_model beforehand would only repeat the same work; the fitted
# forest is available afterwards as selector.estimator_.
fs_model = RandomForestClassifier(n_estimators=100, random_state=42)
selector = SelectFromModel(fs_model, threshold='median')
selector.fit(x_train, y_train)
x_train_fs = selector.transform(x_train)
x_test_fs = selector.transform(x_test)
selected_features = X.columns[selector.get_support()].tolist()
print("Selected Features:", selected_features)

# Base learners for the ensemble.
rf = RandomForestClassifier(n_estimators=30, max_depth=3, random_state=42)
svm = SVC(kernel='rbf', C=0.7, gamma='scale', probability=True, random_state=42)  # probability=True is required for soft voting
lr = LogisticRegression(C=0.5, solver='liblinear', random_state=42)

# Soft voting averages predicted class probabilities; the forest counts double.
ensemble = VotingClassifier(estimators=[
    ('lr', lr),
    ('rf', rf),
    ('svm', svm)
], voting='soft', weights=[1, 2, 1])

# imblearn's pipeline applies the SMOTE step only while fitting, so the
# validation folds and the test set are scored on un-resampled rows.
pipeline = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(k_neighbors=4, random_state=42)),
    ('model', ensemble)
])

# NOTE(review): the selector above was fitted on the whole training split
# before cross-validation, so each validation fold has already influenced
# feature selection and the CV scores may be slightly optimistic. The
# hold-out test set is not affected.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']
cv_results = cross_validate(pipeline, x_train_fs, y_train, cv=cv, scoring=scoring)

print("\nCross-Validation Results (10-fold):")
for metric in scoring:
    scores = cv_results[f'test_{metric}']
    print(f"{metric}: {scores.mean():.3f} ± {scores.std():.3f}")

# Refit on the full training split and evaluate once on the held-out rows.
pipeline.fit(x_train_fs, y_train)
y_pred = pipeline.predict(x_test_fs)

print("\nClassification Report on Hold-out Test Set:")
print(classification_report(y_test, y_pred))
# The recorded cell output includes this line, but the code never printed it.
print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.3f}")


Selected Features: ['Alcohol use', 'Genetic Risk', 'Balanced Diet', 'Obesity', 'Passive Smoker', 'Chest Pain', 'Coughing of Blood', 'Fatigue', 'Shortness of Breath', 'Wheezing', 'Swallowing Difficulty', 'Snoring']

Cross-Validation Results (10-fold):
accuracy: 0.991 ± 0.009
f1_macro: 0.991 ± 0.010
precision_macro: 0.991 ± 0.009
recall_macro: 0.991 ± 0.010

Classification Report on Hold-out Test Set:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        91
           1       1.00      0.96      0.98       100
           2       1.00      1.00      1.00       109

    accuracy                           0.99       300
   macro avg       0.99      0.99      0.99       300
weighted avg       0.99      0.99      0.99       300

Test Set Accuracy: 0.987


In [None]:
import joblib

# Write the feature selector first, then the model pipeline, each to its own
# file in the current working directory.
for artifact, filename in ((selector, "lung_selector.sav"), (pipeline, "lung_model.sav")):
    joblib.dump(artifact, filename)

print(" Model and selector saved as lung_model.sav and lung_selector.sav")


 Model and selector saved as lung_model.sav and lung_selector.sav
