# Regression with XGBoost

## Part 1 - Data Preprocessing

### Importing the dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw insurance data. The feature matrix and target vector are
# extracted further below, once the categorical columns have been encoded;
# slicing X and y here would capture the raw string columns and would be
# overwritten by the later extraction anyway.
dataset = pd.read_csv('insurance.csv')

### Checking missing data

In [None]:
# Print each column's dtype and non-null count to check for missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# Preview the first rows of the raw data.
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Handling categorical variables

#### Sex column

In [None]:
# Encode sex as 1 for 'male' and 0 for any other value.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would compare the already-encoded integers against 'male' and silently
# zero out the whole column.
if not pd.api.types.is_numeric_dtype(dataset['sex']):
    dataset['sex'] = (dataset['sex'] == 'male').astype('int64')

#### Smoker column

In [None]:
# Encode smoker as 1 for 'yes' and 0 for any other value.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would compare the already-encoded integers against 'yes' and silently
# zero out the whole column.
if not pd.api.types.is_numeric_dtype(dataset['smoker']):
    dataset['smoker'] = (dataset['smoker'] == 'yes').astype('int64')

#### Region column

In [None]:
# One-hot encode region as integer indicator columns, dropping the first
# level, and place the indicators ahead of the existing columns.
region_dummies = pd.get_dummies(
    dataset['region'],
    drop_first=True,
    dtype=int,
)
dataset = pd.concat(objs=[region_dummies, dataset], axis='columns')

In [None]:
# Confirm the region indicator columns were prepended to the frame.
dataset.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,region,charges
0,0,0,1,19,0,27.9,0,1,southwest,16884.924
1,0,1,0,18,1,33.77,1,0,southeast,1725.5523
2,0,1,0,28,1,33.0,3,0,southeast,4449.462
3,1,0,0,33,1,22.705,0,0,northwest,21984.47061
4,1,0,0,32,1,28.88,0,0,northwest,3866.8552


In [None]:
# The indicator columns now carry the region information, so remove the
# original string column. Reassigning the result (rather than mutating with
# inplace=True) keeps the data lineage explicit.
dataset = dataset.drop(columns='region')

In [None]:
# Confirm the original 'region' column is gone and only encoded columns remain.
dataset.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,charges
0,0,0,1,19,0,27.9,0,1,16884.924
1,0,1,0,18,1,33.77,1,0,1725.5523
2,0,1,0,28,1,33.0,3,0,4449.462
3,1,0,0,33,1,22.705,0,0,21984.47061
4,1,0,0,32,1,28.88,0,0,3866.8552


### Creating the Training Set and Test Set

#### Getting the Inputs and Output

In [None]:
# Inputs: every column except the last one, as a NumPy array.
X = dataset.iloc[:, :-1].to_numpy()
# Output: the last column ('charges'), as a NumPy array.
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Display the feature matrix as a sanity check.
X

array([[ 0.  ,  0.  ,  1.  , ..., 27.9 ,  0.  ,  1.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.77,  1.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.  ,  3.  ,  0.  ],
       ...,
       [ 0.  ,  1.  ,  0.  , ..., 36.85,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  1.  , ..., 25.8 ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 29.07,  0.  ,  1.  ]])

#### Splitting data into Training Set and Test Set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Part 2 - Building and Training the Model

### Building the model

In [None]:
import xgboost as xgb
# Create the XGBoost regressor; no arguments are passed, so the library's
# default hyperparameters are used.
regressor = xgb.XGBRegressor()

### Training the model

In [None]:
# Fit the regressor on the training split only.
regressor.fit(X_train, y_train)

### Inference

In [None]:
# Predict the target for the held-out test inputs.
predictions = regressor.predict(X_test)

In [None]:
# Display the predicted values for the test set.
predictions

array([10083.838  ,  7742.762  , 46423.664  , 11340.579  ,  9828.638  ,
        7766.7695 ,   837.2472 ,  9468.67   ,  7593.59   ,  6376.6772 ,
        6389.1016 , 20784.61   ,  8196.428  ,  9244.154  , 28719.697  ,
       16030.136  , 12336.959  ,  8097.138  ,  6219.824  , 33011.016  ,
       25374.     , 13021.097  , 11138.516  , 29063.44   ,  5241.3086 ,
        5197.8125 ,  6877.024  ,  8496.295  ,  4144.6924 , 10015.438  ,
        8257.027  , 48526.316  , 19514.002  , 12765.732  , 18038.166  ,
        3515.298  ,  8126.107  , 35665.79   , 39093.48   ,  1024.8882 ,
        8515.383  ,  4260.4644 , 18617.865  , 49345.523  , 35555.492  ,
        4517.984  , 16016.47   ,  6886.886  ,  5700.9927 , 11950.2    ,
        3097.7454 , 11891.595  , 30634.62   , 48160.06   , 11803.823  ,
       11499.489  , 10306.721  , 10285.909  , 11667.745  , 17062.516  ,
        1433.8784 , 43994.438  , 17618.398  , 13699.92   , 17367.059  ,
        8376.721  , 37391.082  , 40199.066  ,  6924.9556 , 14381