In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Install imbalanced-learn if not already installed
# !pip install imbalanced-learn

# Load the dataset
file_path = '/content/drive/My Drive/Handwashing with soap.csv'
data = pd.read_csv(file_path)

# Preprocessing
# Drop columns with mostly missing values
data = data.drop(columns=['Low', 'High', 'Comments'])

# Fill missing values for numeric columns
data['Numeric'].fillna(data['Numeric'].mean(), inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
data['Country'] = label_encoder.fit_transform(data['Country'])
data['WHO region'] = label_encoder.fit_transform(data['WHO region'])
data['Residence Area Type'] = label_encoder.fit_transform(data['Residence Area Type'])

# Define target and features
# Creating a binary target variable based on handwashing coverage (Numeric column) with a 50% threshold
threshold = 50
data['handwashing_category'] = (data['Numeric'] > threshold).astype(int)  # 1 for above 50%, 0 for below 50%

# Selecting relevant features
X = data[['Year', 'Country', 'WHO region', 'Residence Area Type', 'Display Value']]
y = data['handwashing_category']

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Define the models and their hyperparameters for tuning
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
param_grid_lr = {'C': [0.01, 0.1, 1, 10, 100]}

# Define models with GridSearchCV
models = {
    'Logistic Regression': GridSearchCV(LogisticRegression(max_iter=500), param_grid_lr, cv=5),
    'Random Forest': GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5)
}

# Dictionary for storing results
results = {}

# K-Fold Cross Validation and Metrics Calculation
for name, model in models.items():
    # Define a pipeline with scaling and the model
    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),  # Standardize the features
        ('model', model)  # Model with hyperparameter tuning
    ])

    # Fit the pipeline
    pipeline.fit(X_resampled, y_resampled)

    # Cross-validation prediction
    y_pred = cross_val_predict(pipeline, X_resampled, y_resampled, cv=5)

    # Calculate metrics
    accuracy = accuracy_score(y_resampled, y_pred)
    precision = precision_score(y_resampled, y_pred)
    recall = recall_score(y_resampled, y_pred)
    f1 = f1_score(y_resampled, y_pred)
    roc_auc = roc_auc_score(y_resampled, y_pred)
    conf_matrix = confusion_matrix(y_resampled, y_pred)

    # Store results
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
        'Confusion Matrix': conf_matrix
    }

    # Save the best estimator if it's Random Forest
    if name == 'Random Forest':
        best_rf_model = model.best_estimator_
        scaler = StandardScaler().fit(X_resampled)
        joblib.dump(best_rf_model, '/content/drive/My Drive/best_rf_model.pkl')
        joblib.dump(scaler, '/content/drive/My Drive/scaler.pkl')
        print("Best Random Forest model and scaler saved successfully.")

# Display enhanced results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print(f"F1 Score: {metrics['F1 Score']:.4f}")
    print(f"ROC AUC: {metrics['ROC AUC']:.4f}")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])
    print("\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Numeric'].fillna(data['Numeric'].mean(), inplace=True)


Best Random Forest model and scaler saved successfully.
Model: Logistic Regression
Accuracy: 0.9889
Precision: 0.9896
Recall: 0.9882
F1 Score: 0.9889
ROC AUC: 0.9889
Confusion Matrix:
[[1427   15]
 [  17 1425]]


Model: Random Forest
Accuracy: 0.9976
Precision: 0.9986
Recall: 0.9965
F1 Score: 0.9976
ROC AUC: 0.9976
Confusion Matrix:
[[1440    2]
 [   5 1437]]


