In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from tensorflow import keras
from tensorflow.keras import layers
import ast  # Library for handling literal_eval
from geopy.distance import geodesic

In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

def preprocess_df(df):
    quantile_001 = df['duration'].quantile(0.01)
    quantile_099 = df['duration'].quantile(0.99)
    df_filtered = df[(df['duration'] >= quantile_001) & (df['duration'] <= quantile_099) & (df['duration'] > 1)]
    return df_filtered.dropna().sample(frac=1).reset_index(drop=True)

def load_and_preprocess(path):
    df = pd.read_csv(path)
    return preprocess_df(df)

def generate_features_labels(df, threshold=25):
    X = df.drop(['duration'], axis=1)
    y = (df['duration'] > threshold).astype(int)
    return X, y
import lightgbm as lgb
def train_model(X_train, y_train, model=lgb.LGBMClassifier(random_state=42)):#XGBClassifier(n_estimators=200,n_jobs=10)):
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')
    print(classification_report(y_test, y_pred))
# Prepare data, then train & test for COVID and Pre-COVID datasets
def prepare_train_test_evaluate(df, test_size=0.2, random_state=420):
    X, y = generate_features_labels(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Standardize features
    X_train_scaled, X_test_scaled = X_train, X_test

    # Train model
    print("Training the model...")
    clf = train_model(X_train_scaled, y_train)

    # Evaluate model
    print("Testing the model...")
    evaluate_model(clf, X_test_scaled, y_test)
    
    
# Main workflow
if __name__ == "__main__":
    print("Loading the datasets...")
    df_covid = load_and_preprocess('merged_covid.csv')
    df_pre_covid = load_and_preprocess('merged_before.csv')

    print("Preparing the COVID dataset for training...")
    X_covid, y_covid = generate_features_labels(df_covid)
    X_covid_scaled = X_covid#standardize_features(X_covid)

    print("Preparing the Pre-COVID dataset for testing...")
    X_pre_covid, y_pre_covid = generate_features_labels(df_pre_covid)
    _, X_pre_covid_scaled = X_covid, X_pre_covid  # Ensure both datasets are scaled using the COVID scaler

    print("Training the model on COVID data...")
    clf_covid = train_model(X_covid_scaled, y_covid)

    print("Testing the model on Pre-COVID data...")
    evaluate_model(clf_covid, X_pre_covid_scaled, y_pre_covid)

    print("Training the model on Pre-COVID data...")
    clf_pre_covid = train_model(X_pre_covid_scaled, y_pre_covid)

    print("Testing the model on COVID data...")
    evaluate_model(clf_pre_covid, X_covid_scaled, y_covid)
    
    print("\nCOVID dataset evaluation:")
    prepare_train_test_evaluate(df_covid)

    print("\nPre-COVID dataset evaluation:")
    prepare_train_test_evaluate(df_pre_covid)

    # Additional scenarios follow similar structure
# pre-COVID dataset for both training and testing
# COVID dataset for both training and testing

Loading the datasets...
Preparing the COVID dataset for training...
Preparing the Pre-COVID dataset for testing...
Training the model on COVID data...
Testing the model on Pre-COVID data...
Accuracy: 0.66
              precision    recall  f1-score   support

           0       0.64      0.51      0.56     22674
           1       0.67      0.78      0.72     28806

    accuracy                           0.66     51480
   macro avg       0.65      0.64      0.64     51480
weighted avg       0.65      0.66      0.65     51480

Training the model on Pre-COVID data...
Testing the model on COVID data...
Accuracy: 0.67
              precision    recall  f1-score   support

           0       0.63      0.56      0.59     12637
           1       0.69      0.75      0.72     16682

    accuracy                           0.67     29319
   macro avg       0.66      0.66      0.66     29319
weighted avg       0.67      0.67      0.67     29319


COVID dataset evaluation:
Training the model...
Te

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import lightgbm as lgb
def preprocess_df(df):
    quantile_001 = df['duration'].quantile(0.02)
    quantile_099 = df['duration'].quantile(0.98)
    df_filtered = df[(df['duration'] >= quantile_001) & (df['duration'] <= quantile_099) & (df['duration'] > 5)]
    return df_filtered.dropna().sample(frac=1).reset_index(drop=True)

def load_and_preprocess(path):
    df = pd.read_csv(path)
    return preprocess_df(df)

def generate_features_labels(df):
    X = df.drop(['duration'], axis=1)
    y = np.log1p(df['duration'])
    return X, y

def train_model(X_train, y_train, model=lgb.LGBMRegressor(n_estimators=300,random_state=42)):#XGBRegressor(n_estimators=200, n_jobs=10, objective='reg:squarederror')):
    model.fit(X_train, y_train)
    return model

from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(model, X_test, y_test):
    y_test=np.expm1(y_test)
    y_pred = np.expm1(model.predict(X_test))
    
    # Calculating Mean Squared Error (MSE) and R^2 Score
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Calculating Mean Absolute Percentage Error (MAPE)
    # Avoid division by zero by replacing 0 with a very small number (np.finfo(float).eps)
    y_test_safe = np.where(y_test == 0, np.finfo(float).eps, y_test)
    mape = np.mean(np.abs((y_test - y_pred) / y_test_safe)) * 100
    
    # Printing the evaluation metrics
    print(f'RMSE: {np.sqrt(mse):.2f}')
    print(f'R^2: {r2:.2f}')
    print(f'MAPE: {mape:.2f}%')

def prepare_train_test_evaluate(df, test_size=0.2, random_state=420):
    X, y = generate_features_labels(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("Training the model...")
    clf = train_model(X_train_scaled, y_train)

    print("Testing the model...")
    evaluate_model(clf, X_test_scaled, y_test)
    
    
# Main workflow
if __name__ == "__main__":
    print("Loading the datasets...")
    df_covid = load_and_preprocess('merged_covid.csv')
    df_pre_covid = load_and_preprocess('merged_before.csv')

    # Training on COVID data and testing on Pre-COVID data
    print("Training the model on COVID data and testing on Pre-COVID data...")
    prepare_train_test_evaluate(df_covid)
    prepare_train_test_evaluate(df_pre_covid)

    # Training on Pre-COVID data and testing on COVID data
    print("Training the model on Pre-COVID data and testing on COVID data...")
    prepare_train_test_evaluate(df_pre_covid)
    prepare_train_test_evaluate(df_covid)

    # COVID dataset for both training and testing
    print("\nCOVID dataset evaluation:")
    prepare_train_test_evaluate(df_covid)

    # Pre-COVID dataset for both training and testing
    print("\nPre-COVID dataset evaluation:")
    prepare_train_test_evaluate(df_pre_covid)

Loading the datasets...
Training the model on COVID data and testing on Pre-COVID data...
Training the model...
Testing the model...
RMSE: 31.18
R^2: 0.10
MAPE: 67.48%
Training the model...
Testing the model...
RMSE: 29.73
R^2: 0.12
MAPE: 65.55%
Training the model on Pre-COVID data and testing on COVID data...
Training the model...
Testing the model...
RMSE: 29.73
R^2: 0.12
MAPE: 65.55%
Training the model...
Testing the model...
RMSE: 31.18
R^2: 0.10
MAPE: 67.48%

COVID dataset evaluation:
Training the model...
Testing the model...
RMSE: 31.18
R^2: 0.10
MAPE: 67.48%

Pre-COVID dataset evaluation:
Training the model...
Testing the model...
RMSE: 29.73
R^2: 0.12
MAPE: 65.55%


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler


def preprocess_df(df):
    quantile_001 = df['duration'].quantile(0.02)
    quantile_099 = df['duration'].quantile(0.98)
    df_filtered = df[(df['duration'] >= quantile_001) & (df['duration'] <= quantile_099) & (df['duration'] > 3)]
    return df_filtered


THRESHOLD=30

print("Loading the datasets...")
df_covid = pd.read_csv('merged_covid.csv')
df_pre_covid = pd.read_csv('merged_before.csv')

df_covid = preprocess_df(df_covid).dropna()
df_pre_covid = preprocess_df(df_pre_covid).dropna()

# quantile_001 = df['duration'].quantile(0.02)
# quantile_099 = df['duration'].quantile(0.98)
# df = df[(df['duration'] >= quantile_001) & (df['duration'] <= quantile_099) & (df['duration']>3)]


# features = ['Main_Category', 'Day', 'Is_Major_Incident', 'Num_Vehicles_Involved', 'Month', 'Hour', 'distance_to_CBD']
print("Preparing the COVID dataset for training...")
X_covid = df_covid.drop(['duration'],axis=1)#[features]
y_covid = (df_covid['duration'] > THRESHOLD).astype(int)  # Binary classification

print("Preparing the Pre-COVID dataset for testing...")
X_pre_covid = df_pre_covid.drop(['duration'],axis=1)#[features]
y_pre_covid = (df_pre_covid['duration'] > THRESHOLD).astype(int)

print("Standardizing the features...")
scaler = StandardScaler()
X_covid_scaled = scaler.fit_transform(X_covid)
X_pre_covid_scaled = scaler.transform(X_pre_covid)  # Use the same scaler for consistency

print("Training the model on COVID data...")
clf_covid = XGBClassifier(use_label_encoder=False)#RandomForestClassifier(random_state=42)
clf_covid.fit(X_covid_scaled, y_covid)

print("Testing the model on Pre-COVID data...")
y_pred_pre_covid = clf_covid.predict(X_pre_covid_scaled)
accuracy_pre_covid = accuracy_score(y_pre_covid, y_pred_pre_covid)
print(f'Accuracy on Pre-COVID data: {accuracy_pre_covid:.2f}')
print(classification_report(y_pre_covid, y_pred_pre_covid))
# ```

### Scenario 2: Train on Pre-COVID data, Test on COVID data

# ```python
print("Training the model on Pre-COVID data...")
clf_pre_covid = RandomForestClassifier(random_state=42)
clf_pre_covid.fit(X_pre_covid_scaled, y_pre_covid)

print("Testing the model on COVID data...")
y_pred_covid = clf_pre_covid.predict(X_covid_scaled)
accuracy_covid = accuracy_score(y_covid, y_pred_covid)
print(f'Accuracy on COVID data: {accuracy_covid:.2f}')
print(classification_report(y_covid, y_pred_covid))
# ```

### Scenario 3: Evaluate a model trained on merged and cleaned COVID and Pre-COVID data

# ```python
print("Merging the datasets for a comprehensive analysis...")
df_merged = pd.concat([df_covid, df_pre_covid])

print("Cleaning the merged dataset...")
df_merged_clean = df_merged.dropna(subset=['Main_Category', 'Day', 'Num_Vehicles_Involved', 'Month', 'Hour', 'distance_to_CBD', 'duration'])

features = ['Main_Category', 'Day', 'Num_Vehicles_Involved', 'Month', 'Hour', 'distance_to_CBD']
print("Preparing the data after cleaning...")
X = df_merged_clean[features]
y = (df_merged_clean['duration'] > THRESHOLD).astype(int)

print("Standardizing the features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Splitting the data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("Training the model...")
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

print("Testing the model...")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


print("Preparing the Pre-COVID dataset for both training and testing...")
X_pre_covid_only = df_pre_covid[features]
y_pre_covid_only = (df_pre_covid['duration'] > THRESHOLD).astype(int)

print("Standardizing the Pre-COVID features for internal evaluation...")
X_pre_covid_only_scaled = scaler.fit_transform(X_pre_covid_only)

print("Splitting the Pre-COVID data into training and testing sets...")
X_pre_train, X_pre_test, y_pre_train, y_pre_test = train_test_split(X_pre_covid_only_scaled, y_pre_covid_only, test_size=0.2, random_state=42)

print("Training the model on Pre-COVID data (internal evaluation)...")
clf_pre_internal = XGBClassifier(use_label_encoder=False)#RandomForestClassifier(random_state=42)
clf_pre_internal.fit(X_pre_train, y_pre_train)

print("Testing the model on Pre-COVID data (internal evaluation)...")
y_pre_pred_internal = clf_pre_internal.predict(X_pre_test)
accuracy_pre_internal = accuracy_score(y_pre_test, y_pre_pred_internal)
print(f'Internal Accuracy on Pre-COVID data: {accuracy_pre_internal:.2f}')
print(classification_report(y_pre_test, y_pre_pred_internal))



print("Preparing the COVID dataset for both training and testing...")
X_covid_only = df_covid[features]
y_covid_only = (df_covid['duration'] > THRESHOLD).astype(int)

print("Standardizing the COVID features for internal evaluation...")
X_covid_only_scaled = scaler.fit_transform(X_covid_only)

print("Splitting the COVID data into training and testing sets...")
X_covid_train, X_covid_test, y_covid_train, y_covid_test = train_test_split(X_covid_only_scaled, y_covid_only, test_size=0.2, random_state=42)

print("Training the model on COVID data (internal evaluation)...")
clf_covid_internal =XGBClassifier(use_label_encoder=False, eval_metric='logloss')
clf_covid_internal.fit(X_covid_train, y_covid_train)

print("Testing the model on COVID data (internal evaluation)...")
y_covid_pred_internal = clf_covid_internal.predict(X_covid_test)
accuracy_covid_internal = accuracy_score(y_covid_test, y_covid_pred_internal)
print(f'Internal Accuracy on COVID data: {accuracy_covid_internal:.2f}')
print(classification_report(y_covid_test, y_covid_pred_internal))


Loading the datasets...
Preparing the COVID dataset for training...
Preparing the Pre-COVID dataset for testing...
Standardizing the features...
Training the model on COVID data...




Testing the model on Pre-COVID data...


ValueError: Found input variables with inconsistent numbers of samples: [49688, 28279]

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

THRESHOLD = 40
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler


def preprocess_df(df):
    quantile_001 = df['duration'].quantile(0.02)
    quantile_099 = df['duration'].quantile(0.98)
    df_filtered = df[(df['duration'] >= quantile_001) & (df['duration'] <= quantile_099) & (df['duration'] > 3)]
    return df_filtered


THRESHOLD=30

print("Loading the datasets...")
df_covid = pd.read_csv('merged_covid.csv')
df_pre_covid = pd.read_csv('merged_before.csv')

df_covid = preprocess_df(df_covid)
df_pre_covid = preprocess_df(df_pre_covid)


print("Loading the datasets...")
# df_covid = pd.read_csv('merged_covid.csv')
# df_pre_covid = pd.read_csv('merged_before.csv')

features = ['Main_Category', 'Day', 'Is_Major_Incident', 'Num_Vehicles_Involved', 'Month', 'Hour', 'distance_to_CBD']
print("Preparing the COVID dataset for training...")
X_covid = df_covid[features]
y_covid = (df_covid['duration'] > THRESHOLD).astype(int)  # Binary classification

print("Preparing the Pre-COVID dataset for testing...")
X_pre_covid = df_pre_covid[features]
y_pre_covid = (df_pre_covid['duration'] > THRESHOLD).astype(int)

print("Training the model on COVID data...")
clf_covid = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
clf_covid.fit(X_covid, y_covid)

print("Testing the model on Pre-COVID data...")
y_pred_pre_covid = clf_covid.predict(X_pre_covid)
accuracy_pre_covid = accuracy_score(y_pre_covid, y_pred_pre_covid)
print(f'Accuracy on Pre-COVID data: {accuracy_pre_covid:.2f}')
print(classification_report(y_pre_covid, y_pred_pre_covid))

# Scenario 2
print("Training the model on Pre-COVID data...")
clf_pre_covid = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
clf_pre_covid.fit(X_pre_covid, y_pre_covid)

print("Testing the model on COVID data...")
y_pred_covid = clf_pre_covid.predict(X_covid)
accuracy_covid = accuracy_score(y_covid, y_pred_covid)
print(f'Accuracy on COVID data: {accuracy_covid:.2f}')
print(classification_report(y_covid, y_pred_covid))

# Scenario 3
print("Merging the datasets for a comprehensive analysis...")
df_merged = pd.concat([df_covid, df_pre_covid])

print("Cleaning the merged dataset...")
df_merged_clean = df_merged.dropna(subset=features + ['duration'])

print("Preparing the data after cleaning...")
X = df_merged_clean[features]
y = (df_merged_clean['duration'] > THRESHOLD).astype(int)

print("Splitting the data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training the model...")
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

print("Testing the model...")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

### Below we perform internal evaluations for Pre-COVID and COVID datasets in isolation
## Pre-COVID Dataset Internal Evaluation
print("Splitting the Pre-COVID data into training and testing sets (internal)...")
X_pre_train, X_pre_test, y_pre_train, y_pre_test = train_test_split(X_pre_covid, y_pre_covid, test_size=0.2, random_state=42)

print("Training the model on Pre-COVID data (internal)...")
clf_pre_internal = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
clf_pre_internal.fit(X_pre_train, y_pre_train)

print("Testing the model on Pre-COVID data (internal)...")
y_pre_pred_internal = clf_pre_internal.predict(X_pre_test)
accuracy_pre_internal = accuracy_score(y_pre_test, y_pre_pred_internal)
print(f'Internal Accuracy on Pre-COVID data: {accuracy_pre_internal:.2f}')
print(classification_report(y_pre_test, y_pre_pred_internal))

## COVID Dataset Internal Evaluation
print("Splitting the COVID data into training and testing sets (internal)...")
X_covid_train, X_covid_test, y_covid_train, y_covid_test = train_test_split(X_covid, y_covid, test_size=0.2, random_state=42)

print("Training the model on COVID data (internal)...")
clf_covid_internal = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
clf_covid_internal.fit(X_covid_train, y_covid_train)

print("Testing the model on COVID data (internal)...")
y_covid_pred_internal = clf_covid_internal.predict(X_covid_test)
accuracy_covid_internal = accuracy_score(y_covid_test, y_covid_pred_internal)
print(f'Internal Accuracy on COVID data: {accuracy_covid_internal:.2f}')
print(classification_report(y_covid_test, y_covid_pred_internal))

Loading the datasets...
Loading the datasets...
Preparing the COVID dataset for training...


KeyError: "['Is_Major_Incident'] not in index"

In [25]:
# print("Preparing the Pre-COVID dataset for both training and testing...")
# X_pre_covid_only = df_pre_covid[features]
# y_pre_covid_only = (df_pre_covid['duration'] > THRESHOLD).astype(int)

# print("Standardizing the Pre-COVID features for internal evaluation...")
# X_pre_covid_only_scaled = scaler.fit_transform(X_pre_covid_only)

# print("Splitting the Pre-COVID data into training and testing sets...")
# X_pre_train, X_pre_test, y_pre_train, y_pre_test = train_test_split(X_pre_covid_only_scaled, y_pre_covid_only, test_size=0.2, random_state=42)

# print("Training the model on Pre-COVID data (internal evaluation)...")
# clf_pre_internal = RandomForestClassifier(random_state=42)
# clf_pre_internal.fit(X_pre_train, y_pre_train)

# print("Testing the model on Pre-COVID data (internal evaluation)...")
# y_pre_pred_internal = clf_pre_internal.predict(X_pre_test)
# accuracy_pre_internal = accuracy_score(y_pre_test, y_pre_pred_internal)
# print(f'Internal Accuracy on Pre-COVID data: {accuracy_pre_internal:.2f}')
# print(classification_report(y_pre_test, y_pre_pred_internal))



# print("Preparing the COVID dataset for both training and testing...")
# X_covid_only = df_covid[features]
# y_covid_only = (df_covid['duration'] > THRESHOLD).astype(int)

# print("Standardizing the COVID features for internal evaluation...")
# X_covid_only_scaled = scaler.fit_transform(X_covid_only)

# print("Splitting the COVID data into training and testing sets...")
# X_covid_train, X_covid_test, y_covid_train, y_covid_test = train_test_split(X_covid_only_scaled, y_covid_only, test_size=0.2, random_state=42)

# print("Training the model on COVID data (internal evaluation)...")
# clf_covid_internal = RandomForestClassifier(random_state=42)
# clf_covid_internal.fit(X_covid_train, y_covid_train)

# print("Testing the model on COVID data (internal evaluation)...")
# y_covid_pred_internal = clf_covid_internal.predict(X_covid_test)
# accuracy_covid_internal = accuracy_score(y_covid_test, y_covid_pred_internal)
# print(f'Internal Accuracy on COVID data: {accuracy_covid_internal:.2f}')
# print(classification_report(y_covid_test, y_covid_pred_internal))


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

THRESHOLD = 30

print("Loading the datasets...")
df_covid = pd.read_csv('merged_covid.csv')
df_pre_covid = pd.read_csv('merged_before.csv')

quantile_001 = pd.concat([df_covid, df_pre_covid])['duration'].quantile(0.01)
quantile_099 = pd.concat([df_covid, df_pre_covid])['duration'].quantile(0.99)
df_covid = df_covid[(df_covid['duration'] >= quantile_001) & (df_covid['duration'] <= quantile_099)]
df_pre_covid = df_pre_covid[(df_pre_covid['duration'] >= quantile_001) & (df_pre_covid['duration'] <= quantile_099)]




features = ['Main_Category', 'Day', 'Is_Major_Incident', 'Num_Vehicles_Involved', 'Month', 'Hour', 'distance_to_CBD']

# For regression, we directly use 'duration' without transforming it into binary categories
y_covid = df_covid['duration']
y_pre_covid = df_pre_covid['duration']

print("Standardizing the features...")
scaler = StandardScaler()
X_covid_scaled = scaler.fit_transform(df_covid[features])
X_pre_covid_scaled = scaler.transform(df_pre_covid[features])  # Use the same scaler for consistency

# Scenario 1: Train on COVID data, Test on Pre-COVID data
print("Training the model on COVID data for regression...")
regressor_covid = RandomForestRegressor(random_state=42)
regressor_covid.fit(X_covid_scaled, y_covid)

print("Testing the model on Pre-COVID data for regression...")
y_pred_pre_covid = regressor_covid.predict(X_pre_covid_scaled)
mse_pre_covid = mean_squared_error(y_pre_covid, y_pred_pre_covid)
print(f'RMSE on Pre-COVID data: {np.sqrt(mse_pre_covid):.2f}')
print(f'R2 Score on Pre-COVID data: {r2_score(y_pre_covid, y_pred_pre_covid):.2f}')

# Scenario 2: Train on Pre-COVID data, Test on COVID data
print("Training the model on Pre-COVID data for regression...")
regressor_pre_covid = RandomForestRegressor(random_state=42)
regressor_pre_covid.fit(X_pre_covid_scaled, y_pre_covid)

print("Testing the model on COVID data for regression...")
y_pred_covid = regressor_pre_covid.predict(X_covid_scaled)
mse_covid = mean_squared_error(y_covid, y_pred_covid)
print(f'RMSE on COVID data: {np.sqrt(mse_covid):.2f}')
print(f'R2 Score on COVID data: {r2_score(y_covid, y_pred_covid):.2f}')

# Scenario 3: Train and test on merged and cleaned COVID and Pre-COVID data
print("Merging and cleaning the datasets for a comprehensive analysis...")
df_merged = pd.concat([df_covid, df_pre_covid])
df_merged_clean = df_merged.dropna(subset=features + ['duration'])

X = df_merged_clean[features]
y = df_merged_clean['duration']

print("Standardizing the features for merged data...")
X_scaled = scaler.fit_transform(X)

print("Splitting the merged data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("Training the regression model on merged data...")
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

print("Testing the regression model on merged data...")
y_pred = model.predict(X_test)
mse_merged = mean_squared_error(y_test, y_pred)
print(f'RMSE on Merged data: {np.sqrt(mse_merged):.2f}')
print(f'R2 Score on Merged data: {r2_score(y_test, y_pred):.2f}')



import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Assuming you have already loaded and cleaned your datasets as per your previous steps
df_covid = pd.read_csv('merged_covid.csv')
df_pre_covid = pd.read_csv('merged_before.csv')

# Merging datasets 
df_merged = pd.concat([df_covid, df_pre_covid])

# Removing outliers based on the duration column across the whole dataset
quantile_001 = df_merged['duration'].quantile(0.01)
quantile_099 = df_merged['duration'].quantile(0.99)
df_merged = df_merged[(df_merged['duration'] >= quantile_001) & (df_merged['duration'] <= quantile_099)]

# Dropping missing values in features and target column
df_merged_clean = df_merged.dropna(subset=features + ['duration'])

# Features and target variable
features = ['Main_Category', 'Day', 'Is_Major_Incident', 'Num_Vehicles_Involved', 'Month', 'Hour', 'distance_to_CBD']
X = df_merged_clean[features]
y = df_merged_clean['duration']

# Standardizing features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the merged dataset into training & testing sets with random shuffling
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42) # 80% training, 20% testing

# Initializing and training the RandomForestRegressor
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Calculating and printing RMSE and R2 Score 
mse = mean_squared_error(y_test, y_pred)
print(f'RMSE on Merged and randomly sampled data: {np.sqrt(mse):.2f}')
print(f'R2 Score on Merged and randomly sampled data: {r2_score(y_test, y_pred):.2f}')

print("Preparing the Pre-COVID dataset for regression (internal evaluation)...")
X_pre_covid_only_scaled = scaler.fit_transform(df_pre_covid[features])
y_pre_covid_only = df_pre_covid['duration']

print("Splitting the Pre-COVID data into training and testing sets (internal evaluation)...")
X_pre_train, X_pre_test, y_pre_train, y_pre_test = train_test_split(X_pre_covid_only_scaled, y_pre_covid_only, test_size=0.2, random_state=42)

print("Training the regression model on Pre-COVID data (internal evaluation)...")
regressor_pre_internal = RandomForestRegressor(random_state=42)
regressor_pre_internal.fit(X_pre_train, y_pre_train)

print("Testing the regression model on Pre-COVID data (internal evaluation)...")
y_pre_pred_internal = regressor_pre_internal.predict(X_pre_test)
mse_pre_internal = mean_squared_error(y_pre_test, y_pre_pred_internal)
print(f'RMSE on Pre-COVID data (internal evaluation): {np.sqrt(mse_pre_internal):.2f}')
print(f'R2 Score on Pre-COVID data (internal evaluation): {r2_score(y_pre_test, y_pre_pred_internal):.2f}')


print("Preparing the COVID dataset for regression (internal evaluation)...")
X_covid_only_scaled = scaler.fit_transform(df_covid[features])
y_covid_only = df_covid['duration']

print("Splitting the COVID data into training and testing sets (internal evaluation)...")
X_covid_train, X_covid_test, y_covid_train, y_covid_test = train_test_split(X_covid_only_scaled, y_covid_only, test_size=0.2, random_state=42)

print("Training the regression model on COVID data (internal evaluation)...")
regressor_covid_internal = RandomForestRegressor(random_state=42)
regressor_covid_internal.fit(X_covid_train, y_covid_train)

print("Testing the regression model on COVID data (internal evaluation)...")
y_covid_pred_internal = regressor_covid_internal.predict(X_covid_test)
mse_covid_internal = mean_squared_error(y_covid_test, y_covid_pred_internal)
print(f'RMSE on COVID data (internal evaluation): {np.sqrt(mse_covid_internal):.2f}')
print(f'R2 Score on COVID data (internal evaluation): {r2_score(y_covid_test, y_covid_pred_internal):.2f}')


Loading the datasets...
Standardizing the features...


KeyError: "['Is_Major_Incident'] not in index"

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Load your datasets
df_covid = pd.read_csv('merged_covid.csv')
df_pre_covid = pd.read_csv('merged_before.csv')

# Define the relevant features for your models
# features = ['Main_Category', 'Day', 'Is_Major_Incident', 'Num_Vehicles_Involved', 'Month', 'Hour', 'distance_to_CBD']

# Merging and cleaning datasets for consistent preprocessing across scenarios
df_merged = pd.concat([df_covid, df_pre_covid])
quantile_001 = df_merged['duration'].quantile(0.01)
quantile_099 = df_merged['duration'].quantile(0.99)

df_merged = df_merged[(df_merged['duration'] >= quantile_001) & (df_merged['duration'] <= quantile_099)]
df_merged_clean = df_merged.dropna()#subset=features + ['duration'])

# Split merged_clean for a comprehensive dataset analysis
X = df_merged_clean.drop(['duration'],axis=1)#[features]
y = df_merged_clean['duration']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost regressor on the merged dataset
xgb_model_merged = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10, random_state=42)

xgb_model_merged.fit(X_train, y_train)

# Making predictions on the test set
y_pred = xgb_model_merged.predict(X_test)

# Calculating and displaying RMSE and R2 Score for Merged data
mse_merged = mean_squared_error(y_test, y_pred)
print(f'RMSE on Merged data: {np.sqrt(mse_merged):.2f}')
print(f'R2 Score on Merged data: {r2_score(y_test, y_pred):.2f}')

# Now, for scenario-specific models, extract and directly apply preprocessing from the merged dataset:
X_covid = df_covid.drop(['duration'],axis=1)
y_covid = df_covid['duration']
X_pre_covid = df_pre_covid.drop(['duration'],axis=1)
y_pre_covid = df_pre_covid['duration']

# Sort dataframe based on features list to avoid missing value issues if present
X_covid = X_covid.drop(['duration'],axis=1)#.dropna()
X_pre_covid = X_pre_covid.drop(['duration'],axis=1)#.loc[:, features].dropna()

# No need for scaling due to XGBoost's handling of raw features

# Scenario 1: Training on COVID, testing on Pre-COVID
xgb_model_covid = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10, random_state=42)

xgb_model_covid.fit(X_covid, y_covid)

# Prediction on Pre-COVID data
y_pred_pre_covid = xgb_model_covid.predict(X_pre_covid)
mse_pre_covid = mean_squared_error(y_pre_covid, y_pred_pre_covid)

print(f'RMSE on Pre-COVID data (trained on COVID): {np.sqrt(mse_pre_covid):.2f}')
print(f'R2 Score on Pre-COVID data (trained on COVID): {r2_score(y_pre_covid, y_pred_pre_covid):.2f}')

RMSE on Merged data: 38.25
R2 Score on Merged data: -0.05


KeyError: "['duration'] not found in axis"

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Assuming you have already loaded and cleaned your datasets as per your previous steps
df_covid = pd.read_csv('merged_covid.csv')
df_pre_covid = pd.read_csv('merged_before.csv')

# Merging datasets 
df_merged = pd.concat([df_covid, df_pre_covid])

# Removing outliers based on the duration column across the whole dataset
quantile_001 = df_merged['duration'].quantile(0.01)
quantile_099 = df_merged['duration'].quantile(0.99)
df_merged = df_merged[(df_merged['duration'] >= quantile_001) & (df_merged['duration'] <= quantile_099)]

# Dropping missing values in features and target column
df_merged_clean = df_merged.dropna()#subset=features + ['duration'])

# Features and target variable
# features = ['Main_Category', 'Day', 'Is_Major_Incident', 'Num_Vehicles_Involved', 'Month', 'Hour', 'distance_to_CBD']
X = df_merged_clean.drop(['duration'],axis=1)#[features]
y = df_merged_clean['duration']

# Standardizing features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the merged dataset into training & testing sets with random shuffling
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42) # 80% training, 20% testing

# Initializing and training the RandomForestRegressor
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Calculating and printing RMSE and R2 Score 
mse = mean_squared_error(y_test, y_pred)
print(f'RMSE on Merged and randomly sampled data: {np.sqrt(mse):.2f}')
print(f'R2 Score on Merged and randomly sampled data: {r2_score(y_test, y_pred):.2f}')

KeyboardInterrupt: 

In [23]:
print("Preparing the Pre-COVID dataset for regression (internal evaluation)...")
X_pre_covid_only_scaled = scaler.fit_transform(df_pre_covid[features])
y_pre_covid_only = df_pre_covid['duration']

print("Splitting the Pre-COVID data into training and testing sets (internal evaluation)...")
X_pre_train, X_pre_test, y_pre_train, y_pre_test = train_test_split(X_pre_covid_only_scaled, y_pre_covid_only, test_size=0.2, random_state=42)

print("Training the regression model on Pre-COVID data (internal evaluation)...")
regressor_pre_internal = RandomForestRegressor(random_state=42)
regressor_pre_internal.fit(X_pre_train, y_pre_train)

print("Testing the regression model on Pre-COVID data (internal evaluation)...")
y_pre_pred_internal = regressor_pre_internal.predict(X_pre_test)
mse_pre_internal = mean_squared_error(y_pre_test, y_pre_pred_internal)
print(f'RMSE on Pre-COVID data (internal evaluation): {np.sqrt(mse_pre_internal):.2f}')
print(f'R2 Score on Pre-COVID data (internal evaluation): {r2_score(y_pre_test, y_pre_pred_internal):.2f}')


Preparing the Pre-COVID dataset for regression (internal evaluation)...
Splitting the Pre-COVID data into training and testing sets (internal evaluation)...
Training the regression model on Pre-COVID data (internal evaluation)...
Testing the regression model on Pre-COVID data (internal evaluation)...
RMSE on Pre-COVID data (internal evaluation): 37.65
R2 Score on Pre-COVID data (internal evaluation): -0.05


In [24]:
print("Preparing the COVID dataset for regression (internal evaluation)...")
X_covid_only_scaled = scaler.fit_transform(df_covid[features])
y_covid_only = df_covid['duration']

print("Splitting the COVID data into training and testing sets (internal evaluation)...")
X_covid_train, X_covid_test, y_covid_train, y_covid_test = train_test_split(X_covid_only_scaled, y_covid_only, test_size=0.2, random_state=42)

print("Training the regression model on COVID data (internal evaluation)...")
regressor_covid_internal = RandomForestRegressor(random_state=42)
regressor_covid_internal.fit(X_covid_train, y_covid_train)

print("Testing the regression model on COVID data (internal evaluation)...")
y_covid_pred_internal = regressor_covid_internal.predict(X_covid_test)
mse_covid_internal = mean_squared_error(y_covid_test, y_covid_pred_internal)
print(f'RMSE on COVID data (internal evaluation): {np.sqrt(mse_covid_internal):.2f}')
print(f'R2 Score on COVID data (internal evaluation): {r2_score(y_covid_test, y_covid_pred_internal):.2f}')


Preparing the COVID dataset for regression (internal evaluation)...
Splitting the COVID data into training and testing sets (internal evaluation)...
Training the regression model on COVID data (internal evaluation)...
Testing the regression model on COVID data (internal evaluation)...
RMSE on COVID data (internal evaluation): 37.83
R2 Score on COVID data (internal evaluation): -0.06
