In [None]:
!pip install scikit-learn==1.0.2
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import pickle


In [None]:
# Load the dataset (path is relative to the notebook's working directory)
data = pd.read_csv('Medical diagnosis using AI/Datasets/survey lung cancer.csv')

# Label encoding
# Each column gets its own encoder so the fitted GENDER mapping is not
# overwritten when the target is encoded; this keeps both mappings available
# (`classes_`, `inverse_transform`) for decoding later.
# `label_encoder` still ends up fitted on LUNG_CANCER, exactly as before.
from sklearn import preprocessing
gender_encoder = preprocessing.LabelEncoder()
label_encoder = preprocessing.LabelEncoder()
data['GENDER'] = gender_encoder.fit_transform(data['GENDER'])
data['LUNG_CANCER'] = label_encoder.fit_transform(data['LUNG_CANCER'])

# Split features and target
# (`columns=` already selects the axis, so no separate `axis` argument is needed)
X = data.drop(columns='LUNG_CANCER')
Y = data['LUNG_CANCER']

# Split into train and test: 20% held out, stratified on the target so both
# splits keep the same class proportions; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)


In [None]:
# Feature Scaling
# The scaler is fitted on the training split only; the test split is transformed
# with the training statistics so no test information leaks into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest with hyperparameter tuning
# Exhaustive grid search (3 * 3 * 2 * 2 = 36 candidates), 5-fold CV, accuracy scoring.
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy')
grid_rf.fit(X_train_scaled, Y_train)
# With the default refit=True, best_estimator_ is retrained on the whole training split.
best_rf = grid_rf.best_estimator_

# XGBoost with hyperparameter tuning
# Exhaustive grid search (3 * 3 * 3 = 27 candidates), 5-fold CV, accuracy scoring.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}
grid_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring='accuracy')
grid_xgb.fit(X_train_scaled, Y_train)
best_xgb = grid_xgb.best_estimator_

# Evaluate Random Forest on both splits (a large train/test gap indicates overfitting)
train_pred_rf = best_rf.predict(X_train_scaled)
test_pred_rf = best_rf.predict(X_test_scaled)
train_acc_rf = accuracy_score(Y_train, train_pred_rf)
test_acc_rf = accuracy_score(Y_test, test_pred_rf)
print('Random Forest - Training Accuracy:', train_acc_rf)
print('Random Forest - Test Accuracy:', test_acc_rf)

# Evaluate XGBoost on both splits
train_pred_xgb = best_xgb.predict(X_train_scaled)
test_pred_xgb = best_xgb.predict(X_test_scaled)
train_acc_xgb = accuracy_score(Y_train, train_pred_xgb)
test_acc_xgb = accuracy_score(Y_test, test_pred_xgb)
print('XGBoost - Training Accuracy:', train_acc_xgb)
print('XGBoost - Test Accuracy:', test_acc_xgb)

# Select best model (ties go to Random Forest because of `>=`)
# NOTE(review): the winner is chosen by test accuracy, so the test accuracy
# printed above is an optimistic estimate for the selected model. Comparing
# grid_rf.best_score_ and grid_xgb.best_score_ would keep the test split untouched.
if test_acc_rf >= test_acc_xgb:
    best_model = best_rf
    print('Selected Random Forest as best model')
else:
    best_model = best_xgb
    print('Selected XGBoost as best model')

# Save best model
# The `with` block guarantees the file is flushed and closed even if pickling fails.
filename = 'lungs_disease_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Save the fitted scaler as well: the model was trained on standardized
# features, so any consumer of the pickled model must apply this exact scaler
# to raw inputs before calling predict().
scaler_filename = 'lungs_disease_scaler.sav'
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
