<a href="https://colab.research.google.com/github/BirukZenebe1/Regression-models-xgBoost-lightGBM-catBoost-/blob/main/xgboost_regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

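# XGBoost Regressor

Trains an XGBoost regressor to predict `charges` from the `insurance.csv` dataset and evaluates it with R², adjusted R², and 10-fold cross-validation.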

## Part 1 - Data Preprocessing

### Importing the dataset

In [None]:
import pandas as pd
# Load the insurance records from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer='insurance.csv')

In [None]:
# Preview the first five rows of the raw data.
dataset.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Checking missing data

In [None]:
# Print each column's dtype and non-null count to check for missing values.
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Handling categorical variables

Sex column

In [None]:
# Encode sex as a binary indicator: female -> 0, male -> 1.
# An explicit lookup with pass-through keeps this cell idempotent: on a re-run
# the column already holds 0/1 and is left unchanged, whereas "anything that is
# not 'female' becomes 1" would silently collapse the whole column to 1s.
# Unexpected labels are also passed through untouched, so they surface later
# instead of being quietly encoded as 1.
sex_codes = {'female': 0, 'male': 1}
dataset['sex'] = dataset['sex'].map(lambda value: sex_codes.get(value, value))
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,yes,southwest,16884.92400
1,18,1,33.770,1,no,southeast,1725.55230
2,28,1,33.000,3,no,southeast,4449.46200
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,no,northwest,10600.54830
1334,18,0,31.920,0,no,northeast,2205.98080
1335,18,0,36.850,0,no,southeast,1629.83350
1336,21,0,25.800,0,no,southwest,2007.94500


Smoker column


In [None]:
# Encode smoker as a binary indicator: no -> 0, yes -> 1.
# An explicit lookup with pass-through keeps this cell idempotent: on a re-run
# the column already holds 0/1 and is left unchanged, whereas "anything that is
# not 'no' becomes 1" would silently collapse the whole column to 1s.
# Unexpected labels are also passed through untouched, so they surface later
# instead of being quietly encoded as 1.
smoker_codes = {'no': 0, 'yes': 1}
dataset['smoker'] = dataset['smoker'].map(lambda value: smoker_codes.get(value, value))
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


Region column

In [None]:
# List the distinct region labels before encoding them.
pd.unique(dataset['region'])

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [None]:
# One-hot encode the region column as integer 0/1 indicators.
# drop_first=True omits the first category, which then acts as the baseline
# (a row with all indicators equal to 0).
region_dummies = pd.get_dummies(
    data=dataset['region'],
    drop_first=True,
    dtype=int,
)
region_dummies


Unnamed: 0,northwest,southeast,southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
1333,1,0,0
1334,0,0,0
1335,0,1,0
1336,0,0,1


In [None]:
# Append the region indicator columns to the feature table.
# Indicator columns left over from a previous run of this cell are removed
# first, so re-running does not produce duplicate column names.
dataset = pd.concat(
    [dataset.drop(columns=region_dummies.columns, errors='ignore'), region_dummies],
    axis=1,
)
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northwest,southeast,southwest
0,19,0,27.900,0,1,southwest,16884.92400,0,0,1
1,18,1,33.770,1,0,southeast,1725.55230,0,1,0
2,28,1,33.000,3,0,southeast,4449.46200,0,1,0
3,33,1,22.705,0,0,northwest,21984.47061,1,0,0
4,32,1,28.880,0,0,northwest,3866.85520,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,northwest,10600.54830,1,0,0
1334,18,0,31.920,0,0,northeast,2205.98080,0,0,0
1335,18,0,36.850,0,0,southeast,1629.83350,0,1,0
1336,21,0,25.800,0,0,southwest,2007.94500,0,0,1


In [None]:
# Remove the raw text region column now that it has been one-hot encoded.
# errors='ignore' makes a re-run of this cell a no-op instead of a KeyError.
dataset = dataset.drop(columns=['region'], errors='ignore')

In [None]:
# Preview the first five rows of the fully encoded table.
dataset.head(n=5)


Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,0,27.9,0,1,16884.924,0,0,1
1,18,1,33.77,1,0,1725.5523,0,1,0
2,28,1,33.0,3,0,4449.462,0,1,0
3,33,1,22.705,0,0,21984.47061,1,0,0
4,32,1,28.88,0,0,3866.8552,1,0,0


### Creating the Training Set and the Test Set

Getting the inputs and output

In [None]:
# Target: the charges column. Features: every remaining column.
y = dataset['charges']
X = dataset.drop(columns=['charges'])

In [None]:
# Display the target Series (charges) as a quick sanity check.
y

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


Getting the Training Set and the Test Set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Part 2 - Building and training the model

### Building the model

In [None]:
from xgboost import XGBRegressor
# Gradient-boosted regressor: 150 boosting rounds of depth-2 trees with a
# learning rate of 0.1.
model = XGBRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=2,
)

### Training the model

In [None]:
# Fit the regressor on the training split.
model.fit(X=X_train, y=y_train)

### Inference

In [None]:
# Display the held-out target values.
y_test

Unnamed: 0,charges
578,9724.53000
610,8547.69130
569,45702.02235
1034,12950.07120
198,9644.25250
...,...
1084,15019.76005
726,6664.68595
1132,20709.02034
725,40932.42950


In [None]:
# Predict charges for the held-out test rows.
y_pred = model.predict(X=X_test)
y_pred

array([12197.166 ,  9962.2295, 46630.34  , 14509.742 , 12136.312 ,
        4054.3943,  2492.829 , 13114.509 ,  8589.051 ,  7178.061 ,
        6481.2065, 11770.787 ,  9161.926 ,  5561.6597, 19983.664 ,
       11565.331 , 13820.864 ,  5827.3755,  7852.7314, 35526.055 ,
       25208.94  , 14246.345 , 12529.169 , 25137.383 ,  2980.6719,
        7236.3413,  3450.8743,  8140.1006,  4772.8345, 11260.549 ,
        8229.179 , 47979.195 , 14645.729 , 12094.871 , 16588.902 ,
        5052.079 , 12405.351 , 37973.37  , 38941.797 ,  2568.9922,
        4309.054 ,  4376.4336, 20536.    , 46474.285 , 36976.957 ,
        5778.588 , 11565.331 ,  7249.85  ,  5276.7515, 12830.169 ,
        5252.1196,  4947.262 , 25792.24  , 45012.996 , 11492.92  ,
        5958.2803,  4519.926 , 10366.663 , 10079.341 , 15345.363 ,
        2279.7495, 46162.62  , 16523.041 , 11243.455 , 13836.873 ,
        9813.6   , 34951.53  , 39625.016 ,  4455.095 , 10205.354 ,
       14559.453 , 12407.2705, 18800.668 , 14937.014 , 13965.8

## Part 3 - Evaluating the model

### R-Squared

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination on the held-out test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.9024549670958453

### Adjusted R-Squared

In [None]:
# n = number of test observations, k = number of features.
n, k = X_test.shape
# Adjusted R² penalises R² for the number of features used.
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)
adj_r2

0.8994419931065277

### k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the estimator on the full dataset, scored by R².
k = 10
r2s = cross_val_score(model, X, y, scoring='r2', cv=k)

# Report the mean and spread of the per-fold scores.
for label, value in (("Average r2:", r2s.mean()), ("Standard deviation r2:", r2s.std())):
    print(label, value.round(3))

Average r2: 0.86
Standard deviation r2: 0.044
