In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Location of the raw dataset; a relative path is resolved against the
# notebook's working directory.
DATA_PATH = "insurance.csv"

# Read the CSV into a DataFrame that the following cells inspect and encode.
data = pd.read_csv(DATA_PATH)


In [None]:
# Peek at the first rows of the raw (un-encoded) dataset
print(data.head())

# Count missing values per column
print(data.isnull().sum())

# One-hot encode the categorical columns into a NEW frame.
# `data` is deliberately left untouched: overwriting it would make this cell
# fail with a KeyError on a second run (the 'sex', 'smoker' and 'region'
# columns would already be gone) and would make the head() preview above
# show encoded instead of raw data.
data_encoded = pd.get_dummies(data, columns=['sex', 'smoker', 'region'])

# Features (X) are every column except the target; the target (y) is 'charges'
X = data_encoded.drop(columns='charges')
y = data_encoded['charges']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: the two splits together must account for every row
assert len(X_train) + len(X_test) == len(X)

# Cell output: total number of rows and number of training rows
len(X), len(X_train)

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


(1338, 1070)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit the linear regression baseline on the training split.
# fit() returns the fitted estimator, so construction and training chain.
lr_model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split
lr_predictions = lr_model.predict(X_test)

# Error metrics: RMSE (square root of the mean squared error) and R^2
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_rmse = np.sqrt(lr_mse)
lr_r2 = r2_score(y_test, lr_predictions)

# Report the metrics, one per line
print("Linear Regression:")
for metric_name, metric_value in (("RMSE:", lr_rmse), ("R^2 Score:", lr_r2)):
    print(metric_name, metric_value)


Linear Regression:
RMSE: 5796.2846592762735
R^2 Score: 0.7835929767120723


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split (seeded for reproducibility).
# fit() returns the fitted estimator, so construction and training chain.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out test split
rf_predictions = rf_model.predict(X_test)

# Error metrics: RMSE (square root of the mean squared error) and R^2
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, rf_predictions)

# Report the metrics, one per line
print("Random Forest:")
for metric_name, metric_value in (("RMSE:", rf_rmse), ("R^2 Score:", rf_r2)):
    print(metric_name, metric_value)


Random Forest:
RMSE: 4588.228101917115
R^2 Score: 0.864399297096109


In [None]:
from sklearn.ensemble import AdaBoostRegressor

# Fit an AdaBoost regressor on the training split (seeded for reproducibility).
# fit() returns the fitted estimator, so construction and training chain.
adaboost_model = AdaBoostRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out test split
adaboost_predictions = adaboost_model.predict(X_test)

# Error metrics: RMSE (square root of the mean squared error) and R^2
adaboost_mse = mean_squared_error(y_test, adaboost_predictions)
adaboost_rmse = np.sqrt(adaboost_mse)
adaboost_r2 = r2_score(y_test, adaboost_predictions)

# Report the metrics, one per line
print("AdaBoost:")
for metric_name, metric_value in (("RMSE:", adaboost_rmse), ("R^2 Score:", adaboost_r2)):
    print(metric_name, metric_value)


AdaBoost:
RMSE: 5267.060510339542
R^2 Score: 0.8213065823275454


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient boosting regressor on the training split (seeded for reproducibility).
# fit() returns the fitted estimator, so construction and training chain.
gradientboost_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out test split
gradientboost_predictions = gradientboost_model.predict(X_test)

# Error metrics: RMSE (square root of the mean squared error) and R^2
gradientboost_mse = mean_squared_error(y_test, gradientboost_predictions)
gradientboost_rmse = np.sqrt(gradientboost_mse)
gradientboost_r2 = r2_score(y_test, gradientboost_predictions)

# Report the metrics, one per line
print("Gradient Boosting:")
for metric_name, metric_value in (("RMSE:", gradientboost_rmse), ("R^2 Score:", gradientboost_r2)):
    print(metric_name, metric_value)


Gradient Boosting:
RMSE: 4329.210862471454
R^2 Score: 0.8792771669587912


In [None]:
from xgboost import XGBRegressor

# Fit an XGBoost regressor on the training split (seeded for reproducibility).
# fit() returns the fitted estimator, so construction and training chain.
xgb_model = XGBRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out test split
xgb_predictions = xgb_model.predict(X_test)

# Error metrics: RMSE (square root of the mean squared error) and R^2
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_predictions)

# Report the metrics, one per line
print("XGBoost:")
for metric_name, metric_value in (("RMSE:", xgb_rmse), ("R^2 Score:", xgb_r2)):
    print(metric_name, metric_value)


XGBoost:
RMSE: 4738.609614038749
R^2 Score: 0.8553648660539922
