### Import Libraries

In [11]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras import layers

from sklearn.metrics import classification_report

## Create ML Models, Train Them, and Make Predictions

First, we prepare our data for training

In [12]:
train_data = pd.read_csv('prepared_data/train_data.csv')
test_data = pd.read_csv('prepared_data/test_data.csv')

In [13]:
target = 'charges'

y_train = train_data[target].copy()
y_test = test_data[target].copy()

# Drop 'charges' column
X_train = train_data.drop(target, axis=1)
X_test = test_data.drop(target, axis=1)

Here are the classification models we are using to make predictions:
* Linear Regression
* Random Forest Regressor
* XGBoost Regressor
* Neural Network

Now, we set them up with default parameters.

In [14]:
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
xgb = XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
nn = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])
nn.compile(optimizer='adam', loss='mae')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Then, we fit the models with the data we prepared.

In [15]:
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
nn.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - loss: 13151.5039 - val_loss: 12235.1270
Epoch 2/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 13400.0674 - val_loss: 11532.8086
Epoch 3/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 12246.3828 - val_loss: 10456.7451
Epoch 4/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 10985.8271 - val_loss: 9460.2344
Epoch 5/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 10090.8096 - val_loss: 8919.2451
Epoch 6/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 8974.9473 - val_loss: 8949.1543
Epoch 7/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 9298.2461 - val_loss: 8950.2246
Epoch 8/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 9021.3379 - val_loss: 888

<keras.src.callbacks.history.History at 0x19a62f15a10>

In [16]:
y_train_lr = lr.predict(X_train)
y_test_lr = lr.predict(X_test)

y_train_rf = rf.predict(X_train)
y_test_rf = rf.predict(X_test)

y_train_xgb = xgb.predict(X_train)
y_test_xgb = xgb.predict(X_test)

y_train_nn = nn.predict(X_train)
y_test_nn = nn.predict(X_test)

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 


In [17]:
import numpy as np
from sklearn.metrics import mean_squared_error


def evaluate_model(y_true, y_pred):
    print(f"MAE: {mean_absolute_error(y_true, y_pred):.2f}")
    print(f"MSE: {mean_squared_error(y_true, y_pred):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.2f}")
    print(f"R²: {r2_score(y_true, y_pred):.2f}")

print("Training Performance")
print("Logistic Regression")
evaluate_model(y_train, y_train_lr)
print("Random Forest")
evaluate_model(y_train, y_train_rf)
print("XGBoost")
evaluate_model(y_train, y_train_xgb)
print("Newral Network")
evaluate_model(y_train, y_train_nn)

Training Performance
Logistic Regression
MAE: 4202.64
MSE: 37237484.00
RMSE: 6102.25
R²: 0.74
Random Forest
MAE: 1027.60
MSE: 3348540.21
RMSE: 1829.90
R²: 0.98
XGBoost
MAE: 494.79
MSE: 699820.92
RMSE: 836.55
R²: 1.00
Newral Network
MAE: 7069.43
MSE: 149994723.88
RMSE: 12247.23
R²: -0.04


We can see that XGBoost is the best performance for training, followed by Random Forest, Linear Regression, and Neural Network

In [18]:
print("Test Performance")
print("Logistic Regression")
evaluate_model(y_test, y_test_lr)
print("Random Forest")
evaluate_model(y_test, y_test_rf)
print("XGBoost")
evaluate_model(y_test, y_test_xgb)
print("Newral Network")
evaluate_model(y_test, y_test_nn)

Test Performance
Logistic Regression
MAE: 4197.09
MSE: 33806854.35
RMSE: 5814.37
R²: 0.78
Random Forest
MAE: 2752.95
MSE: 22217504.56
RMSE: 4713.54
R²: 0.86
XGBoost
MAE: 2806.07
MSE: 24193508.01
RMSE: 4918.69
R²: 0.84
Newral Network
MAE: 7105.48
MSE: 151514198.52
RMSE: 12309.11
R²: 0.02
