## Loading Data


In [19]:
!pip install codecarbon catboost



In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import json
import os
import logging
import sys
from codecarbon import EmissionsTracker

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
df= pd.read_csv('/content/drive/MyDrive/Scriptie uitvoeringsfase (workfolder) Uni/Processed data/pa_cg.csv')

In [23]:
df.shape

(538053, 41)

In [24]:
df.drop(columns=['Unnamed: 0','VKL_NUMMER'],axis=1,inplace=True)

In [25]:
nan_rows = df[df['WGD_CODE_1'].isna()]

# Report the number of rows with SEVERE=1 and NaN in 'WGD_CODE_1'
severe_nan_rows = nan_rows[nan_rows['SEVERE'] == 1]
deleted_count = len(severe_nan_rows)
print(f"Number of rows with SEVERE=1 and NaN in 'WGD_CODE_1': {deleted_count}")
print(f"Total rows before deletion: {df.shape[0]}")

# Delete rows with NaN values in 'WGD_CODE_1'
df = df.dropna(subset=['WGD_CODE_1'])

# Display the cleaned DataFrame
print(f"Total rows after deletion: {df.shape[0]}")

Number of rows with SEVERE=1 and NaN in 'WGD_CODE_1': 1579
Total rows before deletion: 538053
Total rows after deletion: 523186


In [26]:
df['Max Age Group'].value_counts()

Max Age Group
8.0    150329
7.0    133288
9.0     99269
6.0     80003
5.0     53107
4.0      5099
3.0      1554
2.0       485
1.0        52
Name: count, dtype: int64

In [27]:
columns_to_drop=['MNE_CODE', 'AOL_ID', 'Mode Age Group', 'Ratio_female','IND_ALC']

df.drop(columns=columns_to_drop,axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=columns_to_drop,axis=1, inplace=True)


## Models

In [28]:
# Create a custom logger
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Create handlers
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setLevel(logging.DEBUG)

# Create formatters and add it to handlers
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)

# Add handlers to the logger
if not logger.handlers:
    logger.addHandler(stream_handler)

# Test logging to ensure it's working correctly
logger.debug("This is a debug message.")
logger.info("This is an info message.")
logger.warning("This is a warning message.")
logger.error("This is an error message.")
logger.critical("This is a critical message.")
sys.stdout.flush()


DEBUG:root:This is a debug message.
INFO:root:This is an info message.
ERROR:root:This is an error message.
CRITICAL:root:This is a critical message.


In [29]:
# Define random state for reproducibility
random_state = 42

# Define column categories
nominal_columns = [ 'BEBKOM', 'WVL_ID', 'WVG_ID', 'WDK_ID', 'WSE_ID', 'WGD_CODE_1', 'sorted_hexbin_id', 'YEAR', 'TYPE', 'PVE_CODE']
ordinal_columns = ['Daylight_ID']
numerical_columns = ['MeanWindSpeed_mps', 'MaxHourlyWindSpeed_mps', 'MaxWindGust_mps', 'MeanTemperature_C', 'MinTemperature_C', 'MaxTemperature_C', 'SunshineDuration_hrs', 'PrecipitationDuration_hrs', 'MeanSLPressure_hPa', 'MaxVisibility_km', 'MeanCloudCover_oct',  'MAXSNELHD']
log_columns = ['TotalDailyPrecip_mm']
binary_columns = [ 'WEEKEND']
hour_column = ['HOUR']
day_column = ['WEEKDAY']
month_column = ['MONTH']

# Ensure all columns are categorized
all_columns = set(df.columns)
categorized_columns = set(nominal_columns + ordinal_columns + numerical_columns + log_columns + hour_column + day_column + month_column + binary_columns)
uncategorized_columns = all_columns - categorized_columns
logger.info(f"Uncategorized columns: {uncategorized_columns}")
sys.stdout.flush()


INFO:root:Uncategorized columns: {'SEVERE', 'Stratify_Key', 'Max Age Group', 'Ratio_male', 'Combined Group', 'Min Age Group'}


In [30]:
# Define cyclical encoding function
def cyclical_encode(df, max_val):
    return pd.DataFrame({
        df.columns[0] + '_sin': np.sin(2 * np.pi * df.iloc[:, 0] / max_val),
        df.columns[0] + '_cos': np.cos(2 * np.pi * df.iloc[:, 0] / max_val)
    })

hour_transformer = FunctionTransformer(cyclical_encode, kw_args={'max_val': 24}, validate=False)
weekday_transformer = FunctionTransformer(cyclical_encode, kw_args={'max_val': 7}, validate=False)
month_transformer = FunctionTransformer(cyclical_encode, kw_args={'max_val': 12}, validate=False)

# Fit encoders on the entire dataset
one_hot_encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

df_nominal = df[nominal_columns]
df_ordinal = df[ordinal_columns]

one_hot_encoder.fit(df_nominal)
ordinal_encoder.fit(df_ordinal)

In [31]:
preprocessor = ColumnTransformer(
    transformers=[
        ('nom', one_hot_encoder, nominal_columns),
        ('ord', ordinal_encoder, ordinal_columns),
        ('num', StandardScaler(), numerical_columns),
        ('log', FunctionTransformer(np.log1p), log_columns),
        ('hour_cyc', hour_transformer, hour_column),
        ('weekday_cyc', weekday_transformer, day_column),
        ('month_cyc', month_transformer, month_column),
        ('bin', 'passthrough', binary_columns)
    ],
    remainder='drop'
)

In [32]:
# Define models
models = [
    ('Logistic Regressor', LogisticRegression(random_state=random_state, max_iter=10000))
]



In [33]:

# Splitting the data into train and test sets
train_data_full, test_data = train_test_split(df, test_size=0.20, stratify=df['Stratify_Key'], random_state=random_state)

# Save the test set
output_dir = '/content/drive/MyDrive/Scriptie uitvoeringsfase (workfolder) Uni/Results Disparate Impact - Pre Accident Wave 1 - V60'
os.makedirs(output_dir, exist_ok=True)
test_data.to_csv(f'{output_dir}/test_set.csv', index=False)

# Drop the 'Stratify_key' and 'Combined Group' from the test set
test_data = test_data.drop(columns=['Stratify_Key', 'Combined Group'])

# Drop the 'Stratify_key' and 'Combined Group' from the full training set as well
full_training_set = train_data_full.drop(columns=['Stratify_Key', 'Combined Group'])

# Identifying categorical features for SMOTENC using nominal_columns
categorical_features = [full_training_set.columns.get_loc(col) for col in nominal_columns]



In [34]:
def create_separate_training_sets(df):
    # Assuming df is your full_training_set DataFrame
    full_training_set = df.copy()

    # Define bins for 'Ratio_male'
    bins = [-0.01, 0.25, 0.75, 1.01]  # These bins can be adjusted as needed
    labels = ['low', 'medium', 'high']
    full_training_set['Male Ratio Bin'] = pd.cut(full_training_set['Ratio_male'], bins=bins, labels=labels)

    # Create dictionaries to hold the separate sets for each grouping
    max_age_sets = {}
    min_age_sets = {}
    male_ratio_sets = {}

    # Group by 'Max Age Group'
    grouped_max_age = full_training_set.groupby('Max Age Group')
    for name, group in grouped_max_age:
        subset_name = f"MaxAge_{name}"
        max_age_sets[subset_name] = {
            'X': group.drop(columns=['Max Age Group', 'Min Age Group', 'Male Ratio Bin']),
            'y': group['SEVERE']
        }

    # Group by 'Min Age Group'
    grouped_min_age = full_training_set.groupby('Min Age Group')
    for name, group in grouped_min_age:
        subset_name = f"MinAge_{name}"
        min_age_sets[subset_name] = {
            'X': group.drop(columns=['Min Age Group', 'Max Age Group', 'Male Ratio Bin']),
            'y': group['SEVERE']
        }

    # Group by 'Male Ratio Bin'
    grouped_male_ratio = full_training_set.groupby('Male Ratio Bin')
    for name, group in grouped_male_ratio:
        subset_name = f"MaleRatio_{name}"
        male_ratio_sets[subset_name] = {
            'X': group.drop(columns=['Male Ratio Bin', 'Max Age Group', 'Min Age Group']),
            'y': group['SEVERE']
        }

    return max_age_sets, min_age_sets, male_ratio_sets

def get_resampled_sets(X, y):
    # Original set
    original_set = (X, y)

    # Undersampled set
    rus = RandomUnderSampler(sampling_strategy='auto', random_state=random_state)
    X_undersampled, y_undersampled = rus.fit_resample(X, y)

    return {
        'original': original_set,
        'undersampled': (X_undersampled, y_undersampled)
    }
def train_and_evaluate_model(preprocessor, model, model_name, X, y, test_data, output_dir, categorical_features, group_name, subset_name, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    all_results = []

    for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
        fold += 1
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        for resample_type, (X_resampled, y_resampled) in get_resampled_sets(X_train, y_train).items():
            tracker = EmissionsTracker(output_dir=output_dir, log_level='error')
            tracker.start()

            pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier', model)
            ])

            logger.info(f"Fitting the model using {resample_type} data for fold {fold}...")
            sys.stdout.flush()
            pipeline.fit(X_resampled, y_resampled)
            logger.info("Model fitting completed.")
            sys.stdout.flush()

            # Validation set evaluation
            y_pred_val = pipeline.predict(X_val)
            y_pred_val_proba = pipeline.predict_proba(X_val)[:, 1] if hasattr(pipeline, "predict_proba") else pipeline.decision_function(X_val)
            report_val = classification_report(y_val, y_pred_val, output_dict=True)
            auc_val = roc_auc_score(y_val, y_pred_val_proba)
            cm_val = confusion_matrix(y_val, y_pred_val)
            validation_result = {
                'model': model_name,
                'group': group_name,
                'subset': subset_name,
                'resample_type': resample_type,
                'evaluation_type': 'validation',
                'fold': fold,
                'classification_report': report_val,
                'auc': auc_val,
                'confusion_matrix': cm_val.tolist()
            }

            # Calculate feature importance for Logistic Regression
            if isinstance(model, LogisticRegression):
                feature_importance = abs(pipeline.named_steps['classifier'].coef_[0])

                # Get feature names after transformation
                one_hot_features = pipeline.named_steps['preprocessor'].named_transformers_['nom'].get_feature_names_out()
                ordinal_features = ordinal_columns
                numeric_features = numerical_columns + log_columns + hour_column + day_column + month_column + binary_columns
                feature_names = np.concatenate([one_hot_features, ordinal_features, numeric_features])

                feature_importance_dict = {name: importance for name, importance in zip(feature_names, feature_importance)}
                validation_result['feature_importance'] = feature_importance_dict

            all_results.append(validation_result)

            val_predictions = pd.DataFrame({
                'Actual': y_val,
                'Predicted': y_pred_val,
                'Predicted_Proba': y_pred_val_proba
            })
            val_predictions.to_csv(f'{output_dir}/{model_name}_predictions_{group_name}_{subset_name}_{resample_type}_validation_fold_{fold}.csv', index=False)

            # Test set evaluation
            logger.info(f"Evaluating {model_name} on the full test set using {resample_type} data for fold {fold}...")
            sys.stdout.flush()
            X_test_preprocessed = pipeline.named_steps['preprocessor'].transform(test_data.drop(columns=['SEVERE']))
            y_test = test_data['SEVERE']
            y_pred_test = pipeline.named_steps['classifier'].predict(X_test_preprocessed)
            y_pred_test_proba = pipeline.named_steps['classifier'].predict_proba(X_test_preprocessed)[:, 1] if hasattr(pipeline.named_steps['classifier'], "predict_proba") else pipeline.named_steps['classifier'].decision_function(X_test_preprocessed)
            report_test = classification_report(y_test, y_pred_test, output_dict=True)
            auc_test = roc_auc_score(y_test, y_pred_test_proba)
            cm_test = confusion_matrix(y_test, y_pred_test)
            all_results.append({
                'model': model_name,
                'group': group_name,
                'subset': subset_name,
                'resample_type': resample_type,
                'evaluation_type': 'test',
                'fold': fold,
                'classification_report': report_test,
                'auc': auc_test,
                'confusion_matrix': cm_test.tolist()
            })

            test_predictions = pd.DataFrame({
                'Actual': y_test,
                'Predicted': y_pred_test,
                'Predicted_Proba': y_pred_test_proba
            })
            test_predictions.to_csv(f'{output_dir}/{model_name}_predictions_{group_name}_{subset_name}_{resample_type}_test_fold_{fold}.csv', index=False)

            emissions = tracker.stop()
            emissions_data = {
                'emissions_kg_CO2eq': emissions,
                'energy_consumed_kWh': tracker.final_emissions_data.energy_consumed,
                'duration_seconds': tracker.final_emissions_data.duration,
                'cpu_energy_kWh': tracker.final_emissions_data.cpu_energy,
                'gpu_energy_kWh': tracker.final_emissions_data.gpu_energy,
                'ram_energy_kWh': tracker.final_emissions_data.ram_energy,
            }

            # Add emissions data to each result
            for result in all_results:
                if result['fold'] == fold and result['resample_type'] == resample_type:
                    result.update(emissions_data)

    results_file = f'{output_dir}/classification_reports.json'
    if os.path.exists(results_file):
        with open(results_file, 'r') as f:
            existing_results = json.load(f)
        all_results.extend(existing_results)

    with open(results_file, 'w') as f:
        json.dump(all_results, f, indent=4)

    results_df = pd.json_normalize(all_results)
    results_df.to_csv(f'{output_dir}/classification_reports.csv', index=False)
    logger.info(f"Results and predictions for {model_name} saved to {output_dir}")


In [35]:
# Combine all the sets into one dictionary for convenience
max_age_sets, min_age_sets, male_ratio_sets = create_separate_training_sets(full_training_set)
train_eval_sets = {**max_age_sets, **min_age_sets, **male_ratio_sets}

# Adding the full training set to the evaluation sets
train_eval_sets['full_training_set'] = {
    'X': full_training_set.drop(columns=['SEVERE', 'Max Age Group', 'Min Age Group', 'Male Ratio Bin']),
    'y': full_training_set['SEVERE']
}


In [37]:
# Combine all the sets into one dictionary for convenience
max_age_sets, min_age_sets, male_ratio_sets = create_separate_training_sets(full_training_set)
train_eval_sets = {**max_age_sets, **min_age_sets, **male_ratio_sets}

# Print the shapes of each dataset
for set_name, data in train_eval_sets.items():
    X_shape = data['X'].shape
    y_shape = data['y'].shape
    print(f"Dataset: {set_name}, X shape: {X_shape}, y shape: {y_shape}")

Dataset: MaxAge_1.0, X shape: (38, 30), y shape: (38,)
Dataset: MaxAge_2.0, X shape: (399, 30), y shape: (399,)
Dataset: MaxAge_3.0, X shape: (1235, 30), y shape: (1235,)
Dataset: MaxAge_4.0, X shape: (4093, 30), y shape: (4093,)
Dataset: MaxAge_5.0, X shape: (42595, 30), y shape: (42595,)
Dataset: MaxAge_6.0, X shape: (63950, 30), y shape: (63950,)
Dataset: MaxAge_7.0, X shape: (106441, 30), y shape: (106441,)
Dataset: MaxAge_8.0, X shape: (120336, 30), y shape: (120336,)
Dataset: MaxAge_9.0, X shape: (79461, 30), y shape: (79461,)
Dataset: MinAge_1.0, X shape: (376, 30), y shape: (376,)
Dataset: MinAge_2.0, X shape: (3539, 30), y shape: (3539,)
Dataset: MinAge_3.0, X shape: (8692, 30), y shape: (8692,)
Dataset: MinAge_4.0, X shape: (18017, 30), y shape: (18017,)
Dataset: MinAge_5.0, X shape: (117121, 30), y shape: (117121,)
Dataset: MinAge_6.0, X shape: (109980, 30), y shape: (109980,)
Dataset: MinAge_7.0, X shape: (89162, 30), y shape: (89162,)
Dataset: MinAge_8.0, X shape: (49215, 

In [None]:
break

In [18]:
# Train and evaluate each model on each training set
for model_name, model in models:
    for subset_name, subset_data in train_eval_sets.items():
        logger.info(f"Starting evaluation for model: {model_name} on {subset_name}")
        X = subset_data['X']
        y = subset_data['y']
        train_and_evaluate_model(preprocessor, model, model_name, X, y, test_data, output_dir, categorical_features, 'group', subset_name)
        logger.info(f"Finished evaluation for model: {model_name} on {subset_name}")

INFO:root:Starting evaluation for model: Logistic Regressor on MaxAge_1.0
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 169.254.169.254:80
DEBUG:urllib3.connectionpool:http://169.254.169.254:80 "GET /latest/dynamic/instance-identity/document HTTP/1.1" 400 1
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 169.254.169.254:80
DEBUG:urllib3.connectionpool:http://169.254.169.254:80 "GET /metadata/instance?api-version=2019-08-15 HTTP/1.1" 400 1
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 169.254.169.254:80
DEBUG:urllib3.connectionpool:http://169.254.169.254:80 "GET /computeMetadata/v1/instance/?recursive=true&alt=json HTTP/1.1" 200 36
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): get.geojs.io:443
DEBUG:urllib3.connectionpool:https://get.geojs.io:443 "GET /v1/ip/geo.json HTTP/1.1" 200 None
INFO:root:Fitting the model using original data for fold 1...
INFO:root:Model fitting completed.
INFO:root:Evaluating Logistic Regressor 

In [None]:
from google.colab import runtime
runtime.unassign()