RANDOM FOREST REGRESSION


In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install scikit-learn

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder


In [5]:
# Load the data set from a CSV file located in the working directory.
df = pd.read_csv(r"medical_insurance.csv")

# Columns that are integer-encoded before being fed to the model.
categorical_cols = ['sex', 'smoker', 'region']

# Encode every categorical column with its own LabelEncoder and keep the fitted
# encoder, keyed by column name. Without this the encoder is overwritten on each
# iteration and the integer codes can no longer be mapped back to the original
# labels (label_encoders[col].inverse_transform(...)) or applied to new data
# (label_encoders[col].transform(...)).
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# OrdinalEncoder / OneHotEncoder are the feature-side equivalents.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [6]:
# 'charges' is the regression target; every remaining column is a predictor.
y = df['charges']
X = df.drop(columns=['charges'])

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): if the data contains duplicated rows, copies of the same record
# can land in both partitions and inflate the test score; check
# df.duplicated().sum() before trusting the metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Build a 100-tree random forest with a fixed seed and train it on the training
# partition (fit() returns the estimator itself, so the call can be chained).
regressor = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

# Predictions for the held-out rows, used by the evaluation cells below.
y_pred = regressor.predict(X_test)

In [9]:
# Score the held-out predictions against the true values with both metrics.
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

# Report both scores, one per line, rounded to two decimals.
print(
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R²): {r2:.2f}",
    sep="\n",
)

Mean Squared Error (MSE): 5743265.59
R-squared (R²): 0.96


In [None]:
# Actual vs. predicted charges for the test rows, drawn with the explicit
# figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Reference diagonal (perfect prediction), spanning the range of the full target column.
diag = [y.min(), y.max()]
ax.plot(diag, diag, 'r--', lw=2)

ax.set_xlabel("Actual Charges")
ax.set_ylabel("Predicted Charges")
ax.set_title("Random Forest Regression: Actual vs. Predicted Charges")
ax.grid(True)
plt.show()