In [35]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/insurance/insurance.csv


In [36]:
from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [37]:
# Location of the insurance dataset inside the Kaggle input directory
INSURANCE_CSV = '/kaggle/input/insurance/insurance.csv'

# Load the raw data into a DataFrame
df = pd.read_csv(INSURANCE_CSV)

In [38]:
# Column names, dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [39]:
# Preview the first rows of the raw data
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [40]:
# (rows, columns) of the raw data, before any cleaning
df.shape

(1338, 7)

In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


### Dropping duplicate values

In [42]:
# Handle duplicate values: the boolean mask flags every row that repeats an earlier row
is_duplicate = df.duplicated()
duplicate_rows = df.loc[is_duplicate]

# Report the shape (rows, columns) of the duplicated subset
print("no of duplicated rows:", duplicate_rows.shape)

no of duplicated rows: (1, 7)


In [43]:
# Keep the first occurrence of each repeated row and discard the later copies
df = df.drop_duplicates(keep="first")

## Uniqueness

### Checking each column and counting the number of distinct values present 

In [44]:
# Count the distinct values in every column in one pass.
# dropna=False counts NaN as a value of its own, matching len(Series.unique()).
distinct_counts = df.nunique(dropna=False)
for column, num_distinct_values in distinct_counts.items():
    print(f"{column}: {num_distinct_values} distinct values")

age: 47 distinct values
sex: 2 distinct values
bmi: 548 distinct values
children: 6 distinct values
smoker: 2 distinct values
region: 4 distinct values
charges: 1337 distinct values


### Number of missing values

In [45]:
# Missing values: tally the null cells in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


# Encoding

### Label encoding is a technique used in machine learning and data preprocessing to convert categorical values into numerical values: it assigns a unique integer label to each distinct category.

In [46]:
# Apply label encoding to the categorical values.
# LabelEncoder maps each distinct value of a column to an integer in
# 0..n_classes-1, ordered by the sorted class values. A separate encoder is
# fitted per column and stored in `label_encoders`, so every mapping stays
# available afterwards (e.g. for `inverse_transform` or for encoding new rows)
# instead of being overwritten by the next `fit_transform` call.
label_encoders = {}
for column in ("sex", "smoker", "region"):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le
# After the loop, `le` is the encoder fitted on "region" (the last column).

In [47]:
# Separate the target column (y) from the predictor columns (X)
TARGET = 'charges'
y = df[TARGET]
X = df.drop(TARGET, axis=1)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Initialize the model
## Model trained using GradientBoostingRegressor

In [48]:
# Baseline regressor: only the random seed is set, all other hyperparameters are left at their defaults
baseline_kwargs = {"random_state": 42}
model = GradientBoostingRegressor(**baseline_kwargs)

# Train on the training split; fit() stays the cell's final expression so its
# return value is still what the notebook displays
model.fit(X=X_train, y=y_train)

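### Initialize k-fold cross-validation
#### The cross-validation score estimates how well a model performs on unseen data: the training set is split into k folds, and the model is repeatedly trained on k-1 folds and scored on the remaining held-out fold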

In [49]:
# 10-fold cross-validation on the training split.
# The scorer returns negated MSE (higher is better), one value per fold.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='neg_mean_squared_error',
)

# Flip the sign back so the reported figure is a plain mean squared error
cv_mse = -scores.mean()
print(f"Cross-validated MSE: {cv_mse}")

Cross-validated MSE: 22539179.595214732


### Defining the hyperparameter grid

In [52]:
# Hyperparameter search space: 5 x 6 x 3 = 90 candidate combinations
param_grid = dict(
    n_estimators=list(range(100, 501, 100)),
    learning_rate=[0.01, 0.02, 0.025, 0.05, 0.1, 1],
    max_depth=[3, 4, 5],
)

## Using GridSearchCV 
### GridSearchCV is a powerful tool for hyperparameter tuning in machine learning and can be used to find the best set of hyperparameters for a given model and dataset

In [53]:
# Exhaustive search over param_grid, scoring every candidate with 10-fold
# cross-validated negative MSE; n_jobs=-1 runs the fits on all available cores
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Fitting 10 folds for each of 90 candidates, totalling 900 fits
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 400}


In [54]:
# Update the model with the best parameters.
# `best_estimator_` is the estimator GridSearchCV refit with the winning
# parameter combination on the data passed to `grid_search.fit` (refit is left
# at its default here). This rebinds `model`, replacing the default-parameter
# regressor fitted earlier.
model = grid_search.best_estimator_

In [55]:
# Make predictions on the held-out test split, which was not used for
# fitting, cross-validation or the grid search
y_pred = model.predict(X_test)

### Calculate the MAE, MSE, RMSE, and R2

In [56]:
# Test-set error metrics for the tuned model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # root of the MSE, back in the units of the target
r2 = r2_score(y_test, y_pred)

# Print each metric on its own line, labelled
for label, value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse), ("R2 Score", r2)):
    print(f"{label}:", value)

MAE: 2511.815199736201
MSE: 18284244.516552158
RMSE: 4276.00801175023
R2 Score: 0.9004971911432395
