## Split the data into testing and training

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [17]:
#Load the dataset
df = pd.read_csv("/Users/bandanaadhikari/Desktop/classassignment/bandanaa/Python/COPD_prediction/DATA/engineered_COPD_data.csv")

In [18]:
df.head()

Unnamed: 0,Age,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Pollution_Risk_Score,Smoking_Status_encoded,Smoking_Pollution_interaction,Location_Biratnagar,Location_Butwal,Location_Chitwan,Location_Dharan,Location_Hetauda,Location_Kathmandu,Location_Lalitpur,Location_Nepalgunj,Location_Pokhara
0,31,1,1,1,27.56,84,0,0,0,0.5,42.0,False,False,False,False,False,False,True,False,False
1,60,1,0,0,30.3,131,1,0,0,0.0,0.0,False,False,False,False,False,False,False,False,True
2,33,0,0,1,28.45,123,1,0,0,0.5,61.5,False,False,False,False,False,False,False,False,True
3,36,1,0,0,27.49,253,0,1,1,1.0,253.0,False,False,False,False,False,True,False,False,False
4,58,0,0,0,25.49,117,1,0,0,0.0,0.0,False,False,False,False,False,False,False,False,True


In [19]:
# Define the features and target variable
X,y = df.drop(columns=["COPD_Diagnosis"]),df["COPD_Diagnosis"]

In [20]:
#Split the data into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# Model training 
We are  using following models  for the prediction if someone has a COPD or not, as being based on data we knew that it's a binary classification :
* Logistic Regression
* Decision Trees 
* Random Forest 

In [21]:
# import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle #to choose best model out of these we use pickle

In [22]:
# initialize the model
models = {
    "Logistic Regression":LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

In [23]:
# Train the models
for name, model in models.items():
    model.fit(X_train, y_train)

    #Save the models
    with open(f"../../models/{name.replace('','_')}.pkl","wb")as file:
        pickle.dump(model,file)

        print(f"{name} model trained and saved")
        print("Model training completed")

Logistic Regression model trained and saved
Model training completed
Decision Tree model trained and saved
Model training completed
Random Forest model trained and saved
Model training completed


# Evaluate the models

* Accuracy, Precision, Recall, F1 score

In [24]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} Evaluation: ")
    print(classification_report(y_test, y_pred))



Logistic Regression Evaluation: 
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       134
           1       0.95      0.94      0.95        66

    accuracy                           0.96       200
   macro avg       0.96      0.96      0.96       200
weighted avg       0.96      0.96      0.96       200


Decision Tree Evaluation: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       134
           1       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Random Forest Evaluation: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       134
           1       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00     

# Model Refinement

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
#Define the parameter grid for Random forest
param_grid = {
    'n_estimators': [50,100,200],
      'max_depth': [None,10,20,30],
      'min_samples_split':[2,5,10]
}



In [27]:
#Initialize the GridSearchCV
grid_search = GridSearchCV(estimator = RandomForestClassifier(),param_grid= param_grid,cv =5, n_jobs =-1,scoring='accuracy')

In [28]:
#Fit the gridSearch cv
grid_search.fit(X_train, y_train)

In [30]:
#Best parameters
print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}


In [31]:
#Save the best model
with open('Best_Random_Forest_Model.pk1', 'wb')as file:
    pickle.dump(best_model,file)

    print("Model refinement completed and best model saved")

Model refinement completed and best model saved
