<a href="https://www.kaggle.com/code/danuherath/insurance-premium-predict-regression?scriptVersionId=215597206" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<h1 align="center"> Regression with an Insurance Dataset (Regression)</h1>

<img
    src="https://www.kaggle.com/competitions/84896/images/header"
    alt=""
    width="300"
    height="200"
    style="display: block; margin: 0 auto; border-radius:15px"
/>

---

## Problem Definition

- Domain

    * Insurance | Finance

<br>

- Dataset
    * [Regression with an Insurance Dataset](https://www.kaggle.com/competitions/playground-series-s4e12/data) dataset from Kaggle, which contains 19 features describing an individual's demographics, professional and educational background, and insurance-payment-related information. The train dataset contains 1,200,000 samples, and each instance represents one person.

<br>

- Objective
    * The goal of this project is to predict insurance premiums based on various factors.

<br>

- Algorithms
    * The following regression algorithms are used to train models on the train dataset. The models are evaluated using the [Root Mean Squared Logarithmic Error (RMSLE)](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.root_mean_squared_log_error.html) metric.

    1. [XGBoost (Extreme Gradient Boosting) Regressor](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor)
    2. [LightGBMRegressor](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html)
    3. [CatBoostRegressor](https://catboost.ai/docs/en/concepts/python-reference_catboostregressor)

<br>


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
%config InlineBackend.figure_format="svg"

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import optuna

RSEED = 42


In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e12/train.csv",
        "/kaggle/input/playground-series-s4e12/test.csv",
    )
)


In [None]:
# Preview the first rows of the training data.
train_data.head()


In [None]:
# Preview the first rows of the test data.
test_data.head()


In [None]:
# List the column labels of the training data.
train_data.columns


In [None]:
# Remove the "id" column in place from both the train and the test frame.
for frame in (train_data, test_data):
    frame.drop(columns=["id"], inplace=True)


---

# Step 1: Exploratory Data Analysis (EDA)

---

In [None]:
# Report the size of the training frame; one column is subtracted from the
# feature count (presumably the target column, which is still present here).
print(f"Train Data\nInstance Count: {train_data.shape[0]} \nFeature Count: {train_data.shape[1]-1}")

# Show the first rows as a horizontally scrollable table with a minimum cell width.
train_preview = train_data.head().style
train_preview = train_preview.set_table_attributes(
    'style="overflow-x: auto; display: inline-block;"'
)
train_preview.set_properties(**{'min-width': '50px'})


In [None]:
# Report the size of the test frame (every column is counted as a feature).
print(f"Test Data\nInstance Count: {test_data.shape[0]} \nFeature Count: {test_data.shape[1]}")

# Show the first rows as a horizontally scrollable table with a minimum cell width.
test_preview = test_data.head().style
test_preview = test_preview.set_table_attributes(
    'style="overflow-x: auto; display: inline-block;"'
)
test_preview.set_properties(**{'min-width': '50px'})


In [None]:
# Print the column dtypes and non-null counts of the training data.
train_data.info()


In [None]:
# Count the missing entries per column, largest first, and keep only the
# columns that actually have missing values.
null_counts_desc = train_data.isnull().sum().sort_values(ascending=False)
has_missing = null_counts_desc > 0
missing_values = null_counts_desc[has_missing]
print(missing_values)


In [None]:
# keep=False marks every row that belongs to a duplicated group, not only the repeats.
duplicate_mask = train_data.duplicated(keep=False)
duplicates = train_data[duplicate_mask]
print(len(duplicates))
# print(duplicates)


In [None]:
# Number of distinct values per column, highest first.
train_data.nunique().sort_values(ascending=False)


In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# numeric columns as numerical.
object_columns = train_data.select_dtypes(include=["object"]).columns
numeric_columns = train_data.select_dtypes(include=[np.number]).columns

cat_features = list(object_columns)
num_features = list(numeric_columns)

print(f"Categorical columns:\n{cat_features}")
print(f"\nNumerical columns:\n{num_features}")


In [None]:
# Name the prediction target once and take it out of the numerical feature list.
target = 'Premium Amount'
num_features.remove(target)


In [None]:
# 'Policy Start Date' has object dtype but is not handled as a category:
# it is parsed as a datetime later and left out of the model inputs.
cat_features.remove('Policy Start Date')


In [None]:
# For each categorical feature, print its missing-value count and the
# frequency of every category.
for col in cat_features:
    column_values = train_data[col]
    n_missing = column_values.isnull().sum()
    frequencies = dict(column_values.value_counts())

    print("Feature:", col)
    print("Missing Value Count:", n_missing)
    print(frequencies, end='\n\n')


In [None]:
# Summary statistics of the target column.
train_data[target].describe()


---

## 1.1 Statistical Analysis

---

In [None]:
# Summary statistics of the numerical features, rendered as a horizontally
# scrollable table with a minimum cell width.
numeric_summary = train_data[num_features].describe()
summary_table = numeric_summary.style.set_table_attributes(
    'style="overflow-x: auto; display: inline-block;"'
)
summary_table.set_properties(**{'min-width': '100px'})


In [None]:
# Flag outliers in the numerical features with the 1.5 * IQR rule.
numeric_frame = train_data[num_features]

Q1 = numeric_frame.quantile(0.25)
Q3 = numeric_frame.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# A value is an outlier when it lies outside [lower_bound, upper_bound] for its column.
below_fence = numeric_frame < lower_bound
above_fence = numeric_frame > upper_bound
outliers_iqr = below_fence | above_fence

# Per-feature totals, restricted to features with at least one outlier, largest first.
per_feature_totals = outliers_iqr.sum()
outliers_count = per_feature_totals[per_feature_totals > 0].sort_values(ascending=False)

print(f"Outliers Count: \n{outliers_count}")


---

## 1.2 Data Visualization

---

---

## 1.2.1 Univariate Analysis

---

In [None]:
# Number of categorical features; used to size the subplot grid below.
len(cat_features)


In [None]:
# Subplot grid for the categorical charts: two columns, and as many rows as
# are needed to give every categorical feature its own axis. Deriving the row
# count from the feature list keeps the plotting loops from indexing past the
# grid if the list changes.
n_cols_cat = 2
n_rows_cat = max(1, -(-len(cat_features) // n_cols_cat))  # ceiling division


In [None]:
# Draw one bar chart per categorical feature showing how often each category occurs.
fig, axes = plt.subplots(n_rows_cat, n_cols_cat, figsize=(10, 4*n_rows_cat))
axes = axes.flatten()

for i, column in enumerate(cat_features):
    ax = axes[i]
    # Index the column directly instead of first copying the whole
    # categorical sub-frame on every iteration.
    category_counts = train_data[column].value_counts()
    sns.barplot(x=category_counts.index, y=category_counts.values, ax=ax, palette="crest")
    # Rotate the existing tick labels rather than re-assigning them.
    ax.tick_params(axis="x", labelrotation=45)
    # Label each chart so it can be matched to its feature (as the pie charts do).
    ax.set_title(column)
    ax.set_ylabel("Count")

# Remove any grid cells that did not receive a chart.
for unused_ax in axes[len(cat_features):]:
    fig.delaxes(unused_ax)

plt.suptitle("Bar Charts - Insurance  Dataset", fontsize=20, y=1.0)

plt.tight_layout()
plt.savefig("Categorical Feature Analysis - Bar Charts.svg")
plt.show()


In [None]:
# Draw one pie chart per categorical feature showing each category's share of the rows.
fig, axes = plt.subplots(n_rows_cat, n_cols_cat, figsize=(10, 5*n_rows_cat))
axes = axes.flatten()

pie_colors = sns.color_palette('crest')

for idx, feature in enumerate(cat_features):
    shares = train_data[feature].value_counts()
    pie_ax = axes[idx]
    pie_ax.pie(
        shares.values,
        labels=shares.index,
        autopct='%1.1f%%',
        startangle=45,
        colors=pie_colors,
        labeldistance=1.1,
        pctdistance=0.80,
    )
    pie_ax.set_title(feature)

plt.suptitle('Pie Charts - Insurance Dataset', fontsize=20, y=1.0)

plt.tight_layout()
plt.savefig("Categorical Feature Analysis - Pie Charts.svg")
plt.show()


In [None]:
# Number of numerical features; used to size the subplot grid below.
len(num_features)


In [None]:
# Subplot grid for the numerical charts: two columns, and as many rows as are
# needed to give every numerical feature its own axis. Deriving the row count
# from the feature list keeps the plotting loops from indexing past the grid
# if the list changes.
n_cols_num = 2
n_rows_num = max(1, -(-len(num_features) // n_cols_num))  # ceiling division


In [None]:
import warnings

# Suppress FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# One histogram (with a KDE curve) per numerical feature.
fig, axes = plt.subplots(n_rows_num, n_cols_num, figsize=(10, 3*n_rows_num))
axes = axes.flatten()

for idx, feature in enumerate(num_features):
    hist_ax = axes[idx]
    sns.histplot(
        train_data[feature],
        ax=hist_ax,
        color='seagreen',
        stat='frequency',
        bins=20,
        kde=True,
    )
    hist_ax.set_ylabel('Frequency')

# Drop the grid cells after the last plotted feature.
for spare_ax in axes[idx + 1:]:
    fig.delaxes(spare_ax)

plt.suptitle('Histograms - Insurance Dataset', fontsize=20, y=1.0)

plt.tight_layout()
plt.savefig('Numerical Feature Analysis - Histograms.svg')
plt.show()


In [None]:
# One horizontal box plot per numerical feature.
fig, axes = plt.subplots(n_rows_num, n_cols_num, figsize=(10, 2*n_rows_num))
axes = axes.flatten()

for idx, feature in enumerate(num_features):
    box_ax = axes[idx]
    sns.boxplot(x=train_data[feature], ax=box_ax, color='seagreen')
    box_ax.set_xlabel(feature)
    box_ax.set_ylabel('Value')

# Drop the grid cells after the last plotted feature.
for spare_ax in axes[idx + 1:]:
    fig.delaxes(spare_ax)

plt.suptitle('Box Plots - Insurance Dataset', fontsize=20, y=1.0)

plt.tight_layout()
plt.savefig('Numerical Feature Analysis - Box Plots.svg')
plt.show()


---

## 1.2.2 Bivariate Analysis

---

In [None]:
# Pairwise correlation of the numerical features, shown as an annotated heatmap.
numeric_frame = train_data[num_features]
corr = numeric_frame.corr()
cmap = sns.light_palette("darkgreen", as_cmap=True)

plt.figure(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,
    cmap=cmap,
    linewidths=0.2,
)

plt.title('Correlation Matrix - Insurance Dataset', fontsize=20, y=1.1)

plt.tight_layout()
plt.savefig('Numerical Features Correlation Analysis - Corr Matrix.svg')
plt.show()


---

# Step 2: Data Preprocessing

---

In [None]:
# Parse the policy start date strings of the training data into datetimes
# and display the converted column.
parsed_start_dates = pd.to_datetime(train_data['Policy Start Date'])
train_data['Policy Start Date'] = parsed_start_dates
train_data['Policy Start Date']


In [None]:
# cat_features
# num_features

In [None]:
# Numerical features: fill gaps with the column median, then standardise.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent category, then
# one-hot encode (first level dropped, unknown categories ignored).
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Columns not named in either feature list are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numeric_steps, num_features),
        ('categorical', categorical_steps, cat_features),
    ],
    remainder='passthrough',
)

preprocessor


In [None]:
# Model inputs are every column except the target and the policy start date;
# the target column is the label.
excluded_columns = [target, 'Policy Start Date']
y = train_data[target]
X = train_data.drop(columns=excluded_columns)
X.shape, y.shape


In [None]:
# Preview the first rows of the model input frame.
X.head()


In [None]:
# Hold out 30% of the rows for validation; RSEED fixes the random split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RSEED,
)


---

# Step 3: Model Training & Evaluation

---

In [None]:

def root_mean_squared_log_error(y_true, y_pred):
    """Return the Root Mean Squared Logarithmic Error of two value sets.

    Both inputs are converted to NumPy float arrays first, so values are
    paired by position. Without the conversion, two pandas Series with
    different index labels would be aligned by label during the subtraction
    and produce NaN or mis-paired differences.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, in the same order and of the
            same length as ``y_true``.

    Returns:
        The square root of the mean squared difference between
        ``log1p(y_true)`` and ``log1p(y_pred)``.
    """
    true_values = np.asarray(y_true, dtype=float)
    pred_values = np.asarray(y_pred, dtype=float)

    log_true = np.log1p(true_values)
    log_pred = np.log1p(pred_values)

    squared_diff = (log_true - log_pred) ** 2

    return np.sqrt(np.mean(squared_diff))


In [None]:
# models = {
#     'regression_model': LinearRegression(),    # 1.1695
#     'XGBoost': XGBRegressor(random_state=RSEED),    # 1.1495
#     'LGBM': LGBMRegressor(random_state=RSEED),   # 1.1495
#     'CatBoost': CatBoostRegressor(random_state=RSEED)    # 1.1487
# }


In [None]:

# def train_models(X_train, X_val, y_train, y_val, models, preprocessor):
#     results = {}
#     pipelines = {}
    
#     for name, model in models.items():
#         pipeline = make_pipeline(preprocessor, model)

#         pipeline.fit(X_train, y_train)
    
#         y_pred = pipeline.predict(X_val)
        
#         score = root_mean_squared_log_error(y_val, y_pred)
        
#         print(name, score)
        
#         results[name] = score
#         pipelines[name] = pipeline
                
#     return pipelines, results


# pipelines, scores = train_models(X_train, X_val, y_train, y_val, models, preprocessor)


In [None]:

# def objective(trial, X_train, X_val, y_train, y_val, preprocessor):
#     param = {
#         'objective': 'RMSE',
#         'eval_metric': 'RMSE',
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
#         'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 1.0),
#         'min_child_samples': trial.suggest_int('min_child_samples', 1, 10),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 0.1),
#         'n_estimators': 300,
#         'random_state': 42,
#         'thread_count': -1
#     }
    
#     model = CatBoostRegressor(**param)
#     pipeline = make_pipeline(preprocessor, model)
#     pipeline.fit(X_train, y_train)
    
#     y_pred = pipeline.predict(X_val)
#     score = mean_squared_error(y_val, y_pred)
    
#     return score


# def tune_catboost(X_train, X_val, y_train, y_val, preprocessor, n_trials=30):
#     study = optuna.create_study(direction='minimize')
#     study.optimize(lambda trial: objective(trial, X_train, X_val, y_train, y_val, preprocessor), n_trials=n_trials)
    
#     print(f'Best trial: {study.best_trial.params}')

#     best_model_params = study.best_trial.params

#     return best_model_params


# best_model_params = tune_catboost(X_train, X_val, y_train, y_val, preprocessor)
# best_model_params


In [None]:
# Fixed CatBoost hyperparameters (presumably the best trial of the
# commented-out Optuna search above — confirm before re-tuning).
# NOTE(review): the model optimises RMSE on the raw target while validation
# below uses RMSLE; consider training on log1p(target) so the two agree.
best_model_params = {
    'objective': 'RMSE',
    'eval_metric': 'RMSE',
    'max_depth': 10,
    'subsample': 0.9634787862433333,
    'colsample_bylevel': 0.9158885547943915,
    'min_child_samples': 4,
    'learning_rate': 0.05713686350188332,
    'n_estimators': 300,
    # Use the notebook-wide seed so the model and the train/validation split
    # share a single source of randomness.
    'random_state': RSEED,
    'thread_count': -1
}

best_model = CatBoostRegressor(**best_model_params)

# Preprocessing and the regressor are fitted together as one pipeline.
best_catboost_model = make_pipeline(preprocessor, best_model)
best_catboost_model.fit(X_train, y_train)

# Score the held-out validation rows with RMSLE.
y_pred = best_catboost_model.predict(X_val)
score = root_mean_squared_log_error(y_val, y_pred)
score


---

# Step 4: Test Data Prediction

---

In [None]:
# Predict on exactly the columns the pipeline was fitted on, in the same order.
# test_data still contains 'Policy Start Date', which was removed from X
# before training, so it is selected out here instead of relying on the
# preprocessor to ignore the extra column.
test_features = test_data[X.columns]
test_data_predictions = best_catboost_model.predict(test_features)
test_data_predictions

In [None]:
# Fill the sample submission's target column with the test predictions
# (assigned by position) and write the result to disk.
sample_submission_path = '/kaggle/input/playground-series-s4e12/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission[target] = test_data_predictions

submission.to_csv('submission.csv', index=False)
submission.head()
