In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from google.colab import drive
drive.mount('/content/drive') #mount google drive

# Load the dataset
file_path = '/content/drive/My Drive/google collab/Food Waste data and research - by country.csv'
df = pd.read_csv(file_path)

# Preprocessing
# Fill missing values
numeric_df = df.select_dtypes(include=['number'])
df[numeric_df.columns] = numeric_df.fillna(numeric_df.mean())

# Encode categorical variables
label_encoder = LabelEncoder()
df['Country'] = label_encoder.fit_transform(df['Country'])

# Define target and features
threshold = 100  # Example threshold to categorize high/low food waste
df['food_waste_category'] = (df['combined figures (kg/capita/year)'] > threshold).astype(int)  # 1 for high, 0 for low
X = df.drop(['food_waste_category', 'Country', 'Confidence in estimate', 'M49 code', 'Region', 'Source'], axis=1)
y = df['food_waste_category']

# Apply SMOTE to handle class imbalance
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

# Feature Scaling
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

# K-Fold Cross Validation and Hyperparameter Tuning
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define parameter grids for hyperparameter tuning
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
param_grid_lr = {'C': [0.01, 0.1, 1, 10, 100]}

# Dictionary of models with GridSearchCV for hyperparameter tuning
models = {
    'Logistic Regression': GridSearchCV(LogisticRegression(), param_grid_lr, cv=kf),
    'Random Forest': GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=kf)
}

# Cross-validation and metrics calculation
results = {}
for name, model in models.items():
    y_pred = cross_val_predict(model, X_resampled, y_resampled, cv=kf)
    accuracy = accuracy_score(y_resampled, y_pred)
    precision = precision_score(y_resampled, y_pred)
    recall = recall_score(y_resampled, y_pred)
    f1 = f1_score(y_resampled, y_pred)
    roc_auc = roc_auc_score(y_resampled, y_pred)
    conf_matrix = confusion_matrix(y_resampled, y_pred)

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
        'Confusion Matrix': conf_matrix
    }

# Display enhanced results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print(f"F1 Score: {metrics['F1 Score']:.4f}")
    print(f"ROC AUC: {metrics['ROC AUC']:.4f}")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])
    print("\n")


Mounted at /content/drive
Model: Logistic Regression
Accuracy: 0.9924
Precision: 1.0000
Recall: 0.9848
F1 Score: 0.9924
ROC AUC: 0.9924
Confusion Matrix:
[[198   0]
 [  3 195]]


Model: Random Forest
Accuracy: 0.9949
Precision: 1.0000
Recall: 0.9899
F1 Score: 0.9949
ROC AUC: 0.9949
Confusion Matrix:
[[198   0]
 [  2 196]]


