Import Necessary Libraries

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


Load the data 

In [2]:
# Location of the source CSV, relative to the notebook's working directory.
file_path = "data/TG_T_CashValues_Rel.csv"

# Frame used for feature preparation and model training.
data = pd.read_csv(file_path)

# Independent copy taken before any columns are added to `data`;
# it is scored with the trained model further down (copy() is deep by default).
new_data = data.copy()

Prepare the Data

In [3]:

# Encode the categorical 'Gender' column as integers so the regressors can use it.
# The fitted encoder is kept in `label_encoder` so the same mapping can be
# applied to other frames later.
label_encoder = LabelEncoder()
data['Gender_encoded'] = label_encoder.fit_transform(data['Gender'])

# Feature sets: 'Age' alone simulates a "single variable" regression;
# the full set feeds the multiple-regression and tree-based models.
X_single = data[['Age']]
X_multiple = data[['Gender_encoded', 'Age', 'Dur']]
y = data['ppv']

# Split ONCE and derive the single-feature frames from that split.
# Two separate train_test_split calls only stay row-aligned as long as both
# use the same random_state and input length; deriving one from the other
# makes the alignment with y_train / y_test hold by construction.
X_train_multiple, X_test_multiple, y_train, y_test = train_test_split(
    X_multiple, y, test_size=0.2, random_state=42
)
X_train_single = X_train_multiple[['Age']]
X_test_single = X_test_multiple[['Age']]



Initialize, Train, and Predict with Each Model

In [4]:

def fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place.
    X_train, X_test : feature frames for the training and test splits.
    y_train, y_test : target values for the training and test splits.

    Returns
    -------
    tuple
        ``(predictions, rmse, mae)`` where ``predictions`` is the model output
        on ``X_test``, ``rmse`` is the square root of the mean squared error
        and ``mae`` is the mean absolute error, both against ``y_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)
    return predictions, rmse, mae


# Simple Linear Regression ('Age' only)
lr_simple_model = LinearRegression()
lr_simple_predictions, lr_simple_rmse, lr_simple_mae = fit_and_score(
    lr_simple_model, X_train_single, X_test_single, y_train, y_test
)

# Multiple Linear Regression
lr_multiple_model = LinearRegression()
lr_multiple_predictions, lr_multiple_rmse, lr_multiple_mae = fit_and_score(
    lr_multiple_model, X_train_multiple, X_test_multiple, y_train, y_test
)

# Random Forest Regression (seeded so the run is repeatable)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_predictions, rf_rmse, rf_mae = fit_and_score(
    rf_model, X_train_multiple, X_test_multiple, y_train, y_test
)

# Gradient Boosting Regression (seeded so the run is repeatable)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_predictions, gb_rmse, gb_mae = fit_and_score(
    gb_model, X_train_multiple, X_test_multiple, y_train, y_test
)

Compare the Results

In [5]:

# Report test-set RMSE and MAE for each model, one line per model.
score_rows = [
    ("Simple Linear Regression RMSE:", lr_simple_rmse, lr_simple_mae),
    ("Multiple Linear Regression RMSE:", lr_multiple_rmse, lr_multiple_mae),
    ("Random Forest RMSE:", rf_rmse, rf_mae),
    ("Gradient Boosting RMSE:", gb_rmse, gb_mae),
]
for rmse_label, rmse_value, mae_value in score_rows:
    print(rmse_label, rmse_value, "MAE:", mae_value)


Simple Linear Regression RMSE: 5.73700580970332 MAE: 4.781813736228542
Multiple Linear Regression RMSE: 1.2769828718583154 MAE: 1.0668901326133526
Random Forest RMSE: 0.052948978115997254 MAE: 0.02887988113036927
Gradient Boosting RMSE: 0.12836318310884826 MAE: 0.07578968784866744


### Random Forest Regression is the chosen algorithm, since it gives the lowest RMSE (Root Mean Squared Error) and also the lowest MAE (Mean Absolute Error) on the test set

In [7]:
# Apply the encoder fitted during data preparation (transform only, no re-fit)
# so 'Gender' maps to the same integers the model was trained on.
gender_codes = label_encoder.transform(new_data['Gender'])
new_data['Gender_encoded'] = gender_codes

# Same feature columns, in the same order, as the training features.
X_new = new_data.loc[:, ['Gender_encoded', 'Age', 'Dur']]

# Score with the trained Random Forest model.
# NOTE(review): new_data is a copy of the full loaded dataset, so it contains
# rows the model was fitted on; these predictions are not an out-of-sample check.
new_predictions = rf_model.predict(X_new)

# Attach the predictions next to the original 'ppv' values for comparison.
new_data['Predicted_ppv'] = new_predictions


In [8]:
# Display the scored frame, including the Gender_encoded and Predicted_ppv columns added to it.
new_data

Unnamed: 0,Gender,Age,Dur,ppv,Gender_encoded,Predicted_ppv
0,Male,0,20,15.198437,1,15.198485
1,Male,0,21,15.741015,1,15.740537
2,Male,0,22,16.267189,1,16.267115
3,Male,0,23,16.777448,1,16.777270
4,Male,0,24,17.272269,1,17.271443
...,...,...,...,...,...,...
5265,Female,78,2,1.925421,0,1.899450
5266,Female,78,3,2.777638,0,2.764200
5267,Female,79,1,1.000000,0,1.000000
5268,Female,79,2,1.920897,0,1.895051


In [13]:
# Persist the fitted model together with the Gender encoder: an application
# that only has the model file cannot rebuild the 'Gender_encoded' feature.
model_path = 'data/random_forest_model.pkl'
encoder_path = 'data/gender_label_encoder.pkl'

# Save the model and the encoder to disk
joblib.dump(rf_model, model_path)
joblib.dump(label_encoder, encoder_path)

# Load the model from disk (e.g., in another application or system).
# NOTE: joblib files are pickles; only load files that come from a trusted source.
loaded_rf_model = joblib.load(model_path)

# Use loaded model to make predictions
predictions = loaded_rf_model.predict(X_new)

# Round-trip check: the reloaded model must reproduce the in-memory model's output.
np.testing.assert_allclose(predictions, rf_model.predict(X_new))