In [46]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder

In [47]:
# Load the insurance dataset from the Kaggle input directory.
csv_path = '/kaggle/input/insurance/insurance.csv'
df = pd.read_csv(csv_path)

# Preview the first rows (default of 5) as the cell's rich output.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [49]:
# Encoding
# Integer-encode the categorical columns in place. A separate fitted encoder
# is kept per column in `encoders`, so each column's category <-> code mapping
# stays available (e.g. encoders['smoker'].classes_ or .inverse_transform).
# Reusing one encoder instance for every column would retain only the mapping
# of the last column that was fitted.
# NOTE(review): scikit-learn documents LabelEncoder as intended for targets;
# the codes impose an arbitrary order on nominal columns such as 'region'.
features = ['sex', 'smoker', 'region']
encoders = {}
for f in features:
    LE = LabelEncoder()
    df[f] = LE.fit_transform(df[f])
    encoders[f] = LE
# `LE` is left bound to the encoder of the last column ('region').

# .loc slicing is label-based and inclusive of the end label.
df.loc[:10, features]

Unnamed: 0,sex,smoker,region
0,0,1,3
1,1,0,2
2,1,0,2
3,1,0,1
4,1,0,1
5,0,0,2
6,0,0,2
7,0,0,1
8,1,0,0
9,0,0,1


In [36]:
# Feature matrix: every column except the target and 'region'.
X = df.drop(columns=['charges', 'region'])
y = df['charges']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
SS = StandardScaler()
SS.fit(X_train)
X_train = SS.transform(X_train)
X_test = SS.transform(X_test)

In [50]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the features.
quad = PolynomialFeatures(degree=2)

# NOTE(review): this rebinds X and y; unlike the earlier split cell, X here
# keeps the 'region' column. Kept as-is for compatibility with later cells.
X = df.drop("charges", axis=1)
y = df["charges"].copy()

x_quad = quad.fit_transform(X)

xtrain, xtest, ytrain, ytest = train_test_split(x_quad, y, test_size=0.2, random_state=42)

# Use a dedicated estimator for the polynomial fit. No earlier cell defines
# `lin_reg`, so referencing it here fails with NameError on a fresh kernel,
# and refitting a shared model would silently change what `lin_reg` means.
poly_reg = LinearRegression()
poly_reg.fit(xtrain, ytrain)

# R^2 on the training split (not a held-out score).
poly_reg.score(xtrain, ytrain)

0.8374016465043517

In [52]:
# Baseline: ordinary least squares fitted on the training split.
lin_reg = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split.
y_pred_lin_reg = lin_reg.predict(X_test)

# Score the held-out predictions and report them.
lin_reg_mse = mean_squared_error(y_test, y_pred_lin_reg)
lin_reg_r2 = r2_score(y_test, y_pred_lin_reg)

print("Linear Regression")
print("Mean Squared Error:", lin_reg_mse)
print("R^2 Score:", lin_reg_r2)


Linear Regression
Mean Squared Error: 33979257.050808206
R^2 Score: 0.7811302113434097


In [39]:
# R^2 of the linear model on the training split, for comparison with the
# held-out score (a regressor's .score() is the R^2 of its predictions).
r2_score(y_train, lin_reg.predict(X_train))

0.7410888590280913

In [41]:
# Random forest with 100 trees; the fixed seed makes the fit repeatable.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out split.
y_pred_rf = rf_reg.predict(X_test)

# Score the held-out predictions and report them.
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

print("Random Forest Regressor")
print("Mean Squared Error:", rf_mse)
print("R^2 Score:", rf_r2)


Random Forest Regressor
Mean Squared Error: 21971082.0887227
R^2 Score: 0.8584781860849734


In [20]:
from sklearn.pipeline import make_pipeline


def _fit_and_report(label, estimator, X_tr, y_tr, X_te, y_te):
    """Fit a StandardScaler + estimator pipeline and print held-out metrics.

    Parameters
    ----------
    label : str
        Heading printed above the metrics.
    estimator : scikit-learn regressor
        Unfitted estimator placed after the scaler in the pipeline.
    X_tr, y_tr : training features and targets used to fit the pipeline.
    X_te, y_te : held-out features and targets used for the metrics.

    Returns
    -------
    tuple
        ``(fitted_pipeline, predictions_on_X_te)``.
    """
    pipe = make_pipeline(StandardScaler(), estimator)
    pipe.fit(X_tr, y_tr)
    preds = pipe.predict(X_te)
    print(label)
    print("Mean Squared Error:", mean_squared_error(y_te, preds))
    print("R^2 Score:", r2_score(y_te, preds))
    return pipe, preds


# NOTE(review): X_train / X_test were already standardised in the split cell,
# so the scaler inside each pipeline is applied to pre-scaled data. Feed the
# unscaled split here if the pipelines are meant to own preprocessing.

# Pipeline for Linear Regression
pipe_lin_reg, y_pred_pipe_lin_reg = _fit_and_report(
    "Pipeline Linear Regression",
    LinearRegression(),
    X_train, y_train, X_test, y_test,
)

# Pipeline for Random Forest Regressor
pipe_rf, y_pred_pipe_rf = _fit_and_report(
    "Pipeline Random Forest Regressor",
    RandomForestRegressor(n_estimators=100, random_state=42),
    X_train, y_train, X_test, y_test,
)

# Pipeline for Gradient Boosting Regressor
pipe_gb, y_pred_pipe_gb = _fit_and_report(
    "Pipeline Gradient Boosting Regressor",
    GradientBoostingRegressor(random_state=42),
    X_train, y_train, X_test, y_test,
)


Pipeline Linear Regression
Mean Squared Error: 33635210.43117844
R^2 Score: 0.7833463107364536
Pipeline Random Forest Regressor
Mean Squared Error: 20898625.73754565
R^2 Score: 0.8653861739369313
Pipeline Gradient Boosting Regressor
Mean Squared Error: 18932125.40806312
R^2 Score: 0.8780529462228404
