In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

### **Pre-processing data**

In [3]:
df = pd.read_csv('temp_data.csv')

In [4]:
df = df.drop(['age', 'sex', 'region'], axis = 1)

In [5]:
df

Unnamed: 0,bmi,children,smoker,charges
0,27.900,0,yes,16884.92400
1,33.770,1,no,1725.55230
2,33.000,3,no,4449.46200
3,22.705,0,no,21984.47061
4,28.880,0,no,3866.85520
...,...,...,...,...
1333,30.970,3,no,10600.54830
1334,31.920,0,no,2205.98080
1335,36.850,0,no,1629.83350
1336,25.800,0,no,2007.94500


### **Transforming the categorical column**

In [6]:
df['smoker'] = pd.factorize(df['smoker'])[0]
df

Unnamed: 0,bmi,children,smoker,charges
0,27.900,0,0,16884.92400
1,33.770,1,1,1725.55230
2,33.000,3,1,4449.46200
3,22.705,0,1,21984.47061
4,28.880,0,1,3866.85520
...,...,...,...,...
1333,30.970,3,1,10600.54830
1334,31.920,0,1,2205.98080
1335,36.850,0,1,1629.83350
1336,25.800,0,1,2007.94500


### **Split the data into input and target data**

In [7]:
y = df.loc[:, ['charges']].values
X = df.drop(['charges'], axis = 1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [9]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)
y_predict = regressor.predict(X_test_scaled)
r2_score = regressor.score(X_test_scaled, y_test)
print(r2_score * 100, '%')

In [11]:
regressor_forest =  RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor_forest.fit(X_train, y_train)
y_predict = regressor_forest.predict(X_test)
r2_score = regressor.score(X_test_scaled, y_test)
print(r2_score * 100, '%')

  regressor_forest.fit(X_train, y_train)
