In [4]:
# Attach Google Drive to this Colab runtime at /content/drive
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd

# Location of the Gender Inequality Index CSV on the mounted Google Drive
file_path = '/content/drive/My Drive/Assignment 1/Gender_Inequality_Index.csv'

# Read the CSV into a DataFrame; the later cells take their features and target from `data`
data = pd.read_csv(filepath_or_buffer=file_path)


In [6]:
# Import necessary libraries
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.preprocessing import LabelEncoder

In [17]:
# Preprocess the dataset
# Step 1: encode the categorical variable 'Human_development'.
# This has to happen BEFORE the feature matrix is built: the mean imputer below
# only accepts numeric data, so on a fresh kernel (Restart & Run All) the cell
# would otherwise fail on this column and only succeed on a second run.
# The encoded column is written back into `data`, as before, so any other cell
# that reads `data` keeps seeing the encoded values.
# NOTE: if this cell is re-run, the encoder is re-fitted on the already-encoded
# integers, so `label_encoder.classes_` then holds codes, not the original labels.
label_encoder = LabelEncoder()
data['Human_development'] = label_encoder.fit_transform(data['Human_development'])

# Step 2: build the feature matrix and fill missing values with the column mean.
feature_columns = ['Human_development', 'Maternal_mortality', 'Adolescent_birth_rate',
                   'Seats_parliament', 'F_secondary_educ', 'M_secondary_educ',
                   'F_Labour_force', 'M_Labour_force']
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(data[feature_columns])  # NumPy array, one column per feature

# Step 3: handle the target variable's missing values.
# A dedicated imputer is used so that `imputer` stays fitted on the features
# instead of being overwritten with the target mean.
target_imputer = SimpleImputer(strategy='mean')
y = target_imputer.fit_transform(data['GII'].values.reshape(-1, 1)).ravel()  # back to 1D

# NOTE(review): mean-imputing the target invents labels for rows with no GII, and
# fitting the imputers on the full dataset before cross-validation lets held-out
# folds influence the fill values. Consider dropping rows with a missing 'GII' and
# moving imputation into a per-fold pipeline; left unchanged here so the reported
# metrics stay comparable.

In [18]:
# 5-fold cross-validation splitter; rows are shuffled with a fixed seed so the
# same folds are produced on every run
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [19]:
# Candidate regressors, keyed by the display name used when reporting results.
# Insertion order is the order in which they are evaluated and printed.
models = dict()
models["Linear Regression"] = LinearRegression()
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)

# Filled in by the evaluation cell: display name -> averaged metrics
results = dict()


In [20]:
# Train models and evaluate with K-Fold Cross-Validation
# Both metrics are collected in a single cross_validate pass, so each model is
# fitted once per fold instead of once per fold per metric.
scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error']

for model_name, model in models.items():
    cv_scores = cross_validate(model, X, y, cv=kfold, scoring=scoring)

    # scikit-learn scorers follow a "higher is better" convention, so error
    # metrics come back negated; flip the sign to get the actual errors.
    mse_scores = -cv_scores['test_neg_mean_squared_error']
    mae_scores = -cv_scores['test_neg_mean_absolute_error']

    # Average each metric over the folds
    avg_mse = np.mean(mse_scores)
    avg_mae = np.mean(mae_scores)

    # Store results
    results[model_name] = {"MSE": avg_mse, "MAE": avg_mae}

    # Print results for each model
    print(f"{model_name} Results:")
    print(f"  Mean Squared Error (MSE): {avg_mse:.4f}")
    print(f"  Mean Absolute Error (MAE): {avg_mae:.4f}")
    print("-" * 30)


Linear Regression Results:
  Mean Squared Error (MSE): 0.0058
  Mean Absolute Error (MAE): 0.0559
------------------------------
Random Forest Results:
  Mean Squared Error (MSE): 0.0034
  Mean Absolute Error (MAE): 0.0382
------------------------------


In [21]:
# Recap: one line per model with its cross-validated average errors
print("\nModel Performance Summary:")
for model_name, metrics in results.items():
    mse = metrics['MSE']
    mae = metrics['MAE']
    print(f"{model_name}: MSE = {mse:.4f}, MAE = {mae:.4f}")


Model Performance Summary:
Linear Regression: MSE = 0.0058, MAE = 0.0559
Random Forest: MSE = 0.0034, MAE = 0.0382
