# XGBoost Regressor

## Part 1 - Data Preprocessing

### Importing the dataset

In [48]:
import pandas as pd
# Load the insurance records from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer='insurance.csv')

In [None]:
dataset.head()

In [None]:
# The dataset contains the following columns:
# - age: Age of the primary beneficiary
# - sex: Gender of the beneficiary (text in the raw data; encoded below as 0 for female, 1 for male)
# - bmi: Body mass index, an indicator of body fat based on height and weight
# - children: Number of children covered by health insurance
# - smoker: Smoking status of the beneficiary (text in the raw data; encoded below as 0 for non-smoker, 1 for smoker)
# - region: The beneficiary's residential area in the US, one of four regions (southwest, southeast, northwest, northeast)
# - charges: The medical costs billed to health insurance

# Our goal is to predict 'charges' (the target variable, i.e. the medical costs billed to health insurance) from the other columns in the dataset.


### Checking missing data

In [None]:
dataset.info()

### Handling categorical variables

Sex column

In [None]:
dataset['sex'].unique()

In [52]:
# Binary-encode the column: 'female' becomes 0, every other value becomes 1.
dataset['sex'] = (dataset['sex'] != 'female').astype('int64')

In [None]:
dataset.head()

Smoker column

In [None]:
dataset['smoker'].unique()

In [55]:
# Binary-encode the column: 'no' becomes 0, every other value becomes 1.
dataset['smoker'] = (dataset['smoker'] != 'no').astype('int64')

In [None]:
dataset.head()

Region column

In [None]:
dataset['region'].unique()

In [58]:
# One-hot encode the region; drop_first removes one indicator column so the
# remaining ones are not perfectly collinear.
region_dummies = pd.get_dummies(data=dataset['region'], drop_first=True)

In [None]:
region_dummies

In [60]:
# Put the indicator columns first so the existing columns keep their relative
# order (the last column stays last).
dataset = pd.concat(objs=[region_dummies, dataset], axis='columns')

In [None]:
dataset.head()

In [62]:
# The original text column is no longer needed once its indicators are in place.
dataset = dataset.drop(columns=['region'])

In [None]:
dataset.head()

### Creating the Training Set and the Test Set

Getting the inputs and output

In [64]:
# Inputs: every column except the last one.
X = dataset[dataset.columns[:-1]]

In [65]:
# Output: the last column of the frame.
y = dataset[dataset.columns[-1]]

In [None]:
X

In [None]:
y

Getting the Training Set and the Test Set

In [68]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Part 2 - Building and training the model

### Building the model

In [69]:
# If XGBoost is not installed yet, install it first (e.g. run `%pip install xgboost` in a cell).

In [70]:
import xgboost

# Hyperparameters passed to XGBRegressor:
# - n_estimators: number of boosted trees. More trees can improve the fit but
#   also raise the risk of overfitting.
# - learning_rate (eta): step size applied at each boosting iteration. Smaller
#   values are more robust to overfitting but need more boosting rounds.
# - max_depth: maximum depth of each tree. Larger values make the model more
#   complex and more likely to overfit.
model = xgboost.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=2,
)

### Training the model

In [None]:
# Fit the regressor on the training split.
model.fit(X=X_train, y=y_train)

### Inference

In [72]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X=X_test)

In [None]:
y_pred

In [None]:
y_test

## Part 3 - Evaluating the model

### R-Squared

In [75]:
from sklearn.metrics import r2_score
# Coefficient of determination of the test-set predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
r2

### Adjusted R-Squared

In [77]:
# Adjusted R-squared penalises R-squared for the number of predictors:
#   1 - (1 - R^2) * (n - 1) / (n - k - 1)
n, k = X_test.shape  # n = number of test rows, k = number of feature columns
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

In [None]:
adj_r2

### k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
# Score the model with 10-fold cross-validation over the full data set; each
# entry of r2s is the R-squared obtained on one held-out fold.
r2s = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=10)
print(f"Average R-Squared: {r2s.mean()}")
# Use std() here so the printed value matches its label (spread across folds).
print(f"Standard Deviation: {r2s.std()}")

## Feature Importance

In [None]:
# Bar chart of the fitted model's feature importances, using the 'gain' measure.
xgboost.plot_importance(
    model,
    importance_type='gain',
    xlabel='Importance score',
    ylabel='Features',
    title='Feature importance',
)

In [82]:
# Feature matrix without the columns being excluded from the reduced model
# (presumably the lowest-ranked ones in the importance plot - confirm).
X_reduced = X.drop(
    labels=['children', 'northwest', 'southwest', 'southeast'],
    axis='columns',
)


In [None]:
# Drop the same columns from both splits so the training and test feature sets
# stay aligned.
dropped_columns = ['children', 'northwest', 'southwest', 'southeast']
X_train_reduced = X_train.drop(columns=dropped_columns)
X_test_reduced = X_test.drop(columns=dropped_columns)
y_train_reduced = y_train  # the target is unchanged; only feature columns were removed

# NOTE: this refits the existing `model` object in place, so from here on
# `model` expects only the reduced feature set.
model.fit(X_train_reduced, y_train_reduced)
y_pred_reduced = model.predict(X_test_reduced)

from sklearn.metrics import mean_squared_error, r2_score
# RMSE is the square root of the mean squared error.
print("RMSE:", mean_squared_error(y_test, y_pred_reduced) ** 0.5)
# Score the reduced-feature predictions, not the earlier full-feature y_pred.
print("R^2:", r2_score(y_test, y_pred_reduced))

In [91]:
# Adjusted R-squared for the reduced-feature predictions. This overwrites the
# adj_r2 value computed earlier for the full feature set.
n, k = X_test_reduced.shape  # n = number of test rows, k = number of remaining features
r2_reduced = r2_score(y_true=y_test, y_pred=y_pred_reduced)
adj_r2 = 1 - (1 - r2_reduced) * (n - 1) / (n - k - 1)

In [None]:
adj_r2