# Building model

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

### Pre-processing data

In [None]:
# Load the training set; the first CSV column becomes the row index.
df = pd.read_csv('data/training.csv', index_col=0)

# Columns handled as categorical features.
categorical_cols = [
    'site_type', 'nn_region', 'is_novo_nordisk_trial', 'is_top_20_sponsor',
    'Heart Failure', 'Cardiovascular Stability', 'High Consent Emphasis',
    'Technology-Enabled Monitoring', 'Weight Monitoring',
    'Medication and Treatment History', 'Kidney Function',
    'Cardiovascular Events', 'trial_phase_III',
]

# Every remaining column except the 'efficiency' target is numerical.
# Walk df.columns in order instead of using set arithmetic: set iteration
# order for strings varies between interpreter sessions (hash randomization),
# which would make the feature order non-reproducible from run to run.
excluded_cols = set(categorical_cols) | {'efficiency'}
numerical_cols = [col for col in df.columns if col not in excluded_cols]

In [None]:
# Numerical features: fill missing entries with a constant value.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: fill missing entries with the most frequent value,
# then one-hot encode (categories not seen during fitting are ignored).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Discard the rows whose 'efficiency' value is missing.
df = df.dropna(subset=['efficiency'])

### Training model

In [None]:
# Random forest regressor with 1000 trees and a fixed seed.
model = RandomForestRegressor(n_estimators=1000, random_state=2)

# Chain preprocessing and the regressor so they are fitted as one estimator.
reg = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Keep only rows with a known target, then separate the target from the features.
df = df.dropna(subset=['efficiency'])
y = df['efficiency']
X = df.drop(columns='efficiency')

# Hold out 20% of the rows for testing (fixed seed for the split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit the preprocessing steps and the model on the training split.
reg.fit(X_train, y_train)

# Signal that the fitted pipeline is ready to use.
print("Model training complete. You can now make predictions.")

---
### Results

In [None]:
from sklearn.metrics import mean_squared_error

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Samples whose true value is exactly zero are left out, because their
    percentage error is undefined (division by zero).

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The mean of ``|(y_true - y_pred) / y_true|`` over the samples with a
        non-zero true value, multiplied by 100. ``nan`` when no such sample
        exists (empty input or all-zero targets).
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    nonzero = y_true != 0
    if not nonzero.any():
        # np.mean over an empty selection would also give nan, but only after
        # emitting a "Mean of empty slice" RuntimeWarning; return it directly.
        return float('nan')
    return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100

# Predict on the training split first, then on the held-out test split.
y_train_pred, y_test_pred = reg.predict(X_train), reg.predict(X_test)

In [None]:
# Mean squared error on the training and test splits
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'Mean Squared Training Error: {mse_train:.3f}')
print(f'Mean Squared Testing Error: {mse_test:.3f}')

# Mean absolute percentage error on the training and test splits
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
print(f'Mean Absolute Percentage Training Error: {mape_train:.3f} %')
print(f'Mean Absolute Percentage Testing Error: {mape_test:.3f} %')

In [None]:
# Per-sample inspection of the training fit: accumulate the squared error by
# hand and print every sample whose squared error is above 10.
y_true, y_pred = y_train, y_train_pred

total = 0
for idx, pair in enumerate(zip(y_true, y_pred)):
    actual, predicted = pair
    sq_err = (actual - predicted) ** 2
    if sq_err > 10:
        print(f'{idx}: True: {actual}, Predicted: {predicted}, Squared Error: {sq_err}')
    total = total + sq_err

print(f'Total squared error: {total}, MSE: {total / len(y_true)}')