### Dataset: https://www.kaggle.com/mirichoi0218/insurance

### Import the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot theme to all matplotlib figures.
# `sns.set()` is only a legacy alias of `set_theme()` and may be removed in a
# future seaborn release, so call the preferred function directly.
sns.set_theme()

### Import Data

In [2]:
# Read the insurance CSV (path is relative to the working directory) into a DataFrame.
csv_path = 'S07_datasets_13720_18513_insurance.csv'
data = pd.read_csv(csv_path)

In [3]:
# Feature matrix: every column of `data` except the last one.
# Take an explicit copy so the encoding cells further down assign into an
# independent frame rather than into a slice of `data`; assigning columns on
# the bare slice triggers pandas' SettingWithCopyWarning.
X = data.iloc[:, :-1].copy()

In [4]:
# Target vector: the last column of the dataset, looked up by its label.
target_column = data.columns[-1]
Y = data[target_column]

In [5]:
# Preview the first five rows of the raw data.
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Label encoding

In [6]:
from sklearn import preprocessing
# One LabelEncoder instance shared by the encoding cells below. Each
# `fit_transform` call re-fits it, so after the last call it only holds the
# classes of the most recently encoded column.
le = preprocessing.LabelEncoder()

In [7]:
# Replace the text labels in `sex` with integer codes.
sex_codes = le.fit_transform(X['sex'])
X['sex'] = sex_codes

In [8]:
# Replace the text labels in `smoker` with integer codes.
smoker_codes = le.fit_transform(X['smoker'])
X['smoker'] = smoker_codes

In [9]:
# Inspect the frame after label encoding: `sex` and `smoker` now hold integer
# codes, while `region` is still text.
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,southwest
1,18,1,33.770,1,0,southeast
2,28,1,33.000,3,0,southeast
3,33,1,22.705,0,0,northwest
4,32,1,28.880,0,0,northwest
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,northwest
1334,18,0,31.920,0,0,northeast
1335,18,0,36.850,0,0,southeast
1336,21,0,25.800,0,0,southwest


## One hot encoding

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [11]:
# One-hot encode the categorical `region` column and pass every other column
# through unchanged. The encoded columns come first in the output, followed by
# the passthrough columns in their original order.
# The column is selected by name rather than by position so the intent is
# explicit, and so fitting on anything other than the labelled DataFrame fails
# loudly instead of silently encoding whatever sits at a fixed index.
columnTransformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['region'])],
    remainder='passthrough',
)

In [12]:
# Fit the transformer and replace X with the encoded feature matrix.
# NOTE(review): this rebinds X from a DataFrame to the transformer's array
# output, so the cell is not idempotent — re-running it operates on the
# already-encoded matrix. Restart and run from the top instead.
X = columnTransformer.fit_transform(X)

In [13]:
# Show the encoded feature matrix (long arrays are printed with the middle
# rows and columns elided).
print(X)

[[ 0.    0.    0.   ... 27.9   0.    1.  ]
 [ 0.    0.    1.   ... 33.77  1.    0.  ]
 [ 0.    0.    1.   ... 33.    3.    0.  ]
 ...
 [ 0.    0.    1.   ... 36.85  0.    0.  ]
 [ 0.    0.    0.   ... 25.8   0.    0.  ]
 [ 0.    1.    0.   ... 29.07  0.    1.  ]]


### Train test split

In [14]:
import numpy as np
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=1,
)

### Building model

In [27]:
from sklearn.ensemble import RandomForestRegressor

In [28]:
# Random forest regressor with 50 trees; the fixed seed makes training repeatable.
forest_params = {'n_estimators': 50, 'random_state': 10}
model = RandomForestRegressor(**forest_params)

In [29]:
# Train the forest on the training split (the fitted estimator is echoed as the cell output).
model.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=50, random_state=10)

### Making the prediction

In [30]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X=X_test)

In [31]:
#print(y_pred)

### Comparing the results

In [32]:
# Start from an empty frame; the actual and predicted columns are added in the next cells.
comparision = pd.DataFrame(data=None)

In [33]:
# Add the true test targets as the `Actual` column (the frame takes over y_test's index).
comparision = comparision.assign(Actual=y_test)

In [34]:
# Add the model's predictions alongside the actual values, row for row.
comparision = comparision.assign(predicted=y_pred)

In [35]:
# Display actual vs. predicted charges side by side for the test rows.
comparision

Unnamed: 0,Actual,predicted
559,1646.42970,2055.804508
1087,11353.22760,12331.167629
1020,8798.59300,9067.230709
460,10381.47870,10963.738111
802,2103.08000,2214.481913
...,...,...
682,40103.89000,41386.162312
629,42983.45850,47177.972445
893,44202.65360,44519.567016
807,2136.88225,2068.610071


### Evaluation

In [36]:
from sklearn.metrics import r2_score

In [37]:
# Coefficient of determination (R^2) of the predictions on the held-out test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.8537063882750936