In [7]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from google.colab import drive
import joblib

# Mount Google Drive
drive.mount('/content/drive')

# Load the Excel file
file_path = '/content/drive/My Drive/7 nov as_1/Zero_Hunger.xlsx'
df = pd.read_excel(file_path)

# Step 1: Fill missing values in numeric columns
numeric_df = df.select_dtypes(include=['number'])
df[numeric_df.columns] = numeric_df.fillna(numeric_df.mean())

# Step 2: Encode categorical variables
label_encoder = LabelEncoder()
if 'Entity' in df.columns:  # Check if 'Entity' column exists to avoid KeyError
    df['Entity'] = label_encoder.fit_transform(df['Entity'])
else:
    print("Error: 'Entity' column not found in data.")

# Convert continuous target to categorical classes for classification
# Define bins for undernourishment levels, e.g., Low, Medium, High
bins = [0, 5, 15, 100]  # Adjust these bins as necessary
labels = ['Low', 'Medium', 'High']
df['Undernourishment_Category'] = pd.cut(df['Prevalence of undernourishment (% of population)'], bins=bins, labels=labels)

# Encode the categorical target labels to numeric values for SMOTE compatibility
df['Undernourishment_Category'] = label_encoder.fit_transform(df['Undernourishment_Category'])

# Replace the target variable 'y' with the new encoded categorical variable
drop_columns = ['Entity', 'Code', 'Prevalence of undernourishment (% of population)']  # Drop original continuous target
X = df.drop(columns=drop_columns, errors='ignore')
y = df['Undernourishment_Category']

# Step 4: Apply SMOTE to handle class imbalance
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 5: Feature Scaling
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

# Save the scaler for future use
joblib.dump(scaler, '/content/drive/My Drive/7 nov as_1/scaler.pkl')

# Step 6: K-Fold Cross Validation and Hyperparameter Tuning
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define parameter grids for hyperparameter tuning
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
param_grid_lr = {'C': [0.01, 0.1, 1, 10, 100]}

# Dictionary of models with GridSearchCV for hyperparameter tuning
models = {
    'Logistic Regression': GridSearchCV(LogisticRegression(), param_grid_lr, cv=kf),
    'Random Forest': GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=kf)
}

# Step 7: Cross-validation and metrics calculation
results = {}
best_model_name = None
best_model_score = 0
best_model = None

for name, model in models.items():
    # Fit the model
    model.fit(X_resampled, y_resampled)
    best_estimator = model.best_estimator_

    # Predict using cross-validation
    y_pred = cross_val_predict(best_estimator, X_resampled, y_resampled, cv=kf)
    accuracy = accuracy_score(y_resampled, y_pred)
    precision = precision_score(y_resampled, y_pred, average='weighted')
    recall = recall_score(y_resampled, y_pred, average='weighted')
    f1 = f1_score(y_resampled, y_pred, average='weighted')
    roc_auc = roc_auc_score(pd.get_dummies(y_resampled), pd.get_dummies(y_pred), average='weighted')
    conf_matrix = confusion_matrix(y_resampled, y_pred)

    # Save metrics for each model
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
        'Confusion Matrix': conf_matrix
    }

    # Identify the best model based on ROC AUC score
    if roc_auc > best_model_score:
        best_model_score = roc_auc
        best_model_name = name
        best_model = best_estimator

# Save the best model
joblib.dump(best_model, '/content/drive/My Drive/7 nov as_1/best_model.pkl')

# Display results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print(f"F1 Score: {metrics['F1 Score']:.4f}")
    print(f"ROC AUC: {metrics['ROC AUC']:.4f}")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])
    print("\n")

# Print details of the best model
print(f"Best Model: {best_model_name}")
print(f"Best Model ROC AUC Score: {best_model_score:.4f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model: Logistic Regression
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000
Confusion Matrix:
[[1502    0    0]
 [   0 1502    0]
 [   0    0 1502]]


Model: Random Forest
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000
Confusion Matrix:
[[1502    0    0]
 [   0 1502    0]
 [   0    0 1502]]


Best Model: Logistic Regression
Best Model ROC AUC Score: 1.0000
