# Imports and Installs

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.metrics import root_mean_squared_log_error
from sklearn.ensemble import HistGradientBoostingRegressor



#lets pandas display more columns from the dataset 
pd.set_option("display.max_columns", 50) 

# Data Loading

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
train_data.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


# Data Preprocessing

## Data Cleaning

In [3]:
#train_data.loc[0:5,categorical_cols]

y_train = train_data.pop("Premium Amount") 
X_train = train_data
X_train = X_train.drop("id", axis = 1)
#can also use .drop to create a new table without changing the previous data

In [4]:
#Changing the "Policy Start Date" to just the year date and got rid of the Month, Day and time

X_train["Policy Start Date"] = pd.to_datetime(X_train["Policy Start Date"])
X_train["Policy Start Date"] = X_train["Policy Start Date"].dt.year
X_train["Policy Start Date"]
pd.to_numeric(X_train["Policy Start Date"])


0          2023
1          2023
2          2023
3          2024
4          2021
           ... 
1199995    2023
1199996    2022
1199997    2021
1199998    2021
1199999    2020
Name: Policy Start Date, Length: 1200000, dtype: int32

In [5]:
#shows all the columns with missing values 
cols_with_missing_vals = [ col for col in X_train if X_train[col].isna().any()]
cols_with_missing_vals


['Age',
 'Annual Income',
 'Marital Status',
 'Number of Dependents',
 'Occupation',
 'Health Score',
 'Previous Claims',
 'Vehicle Age',
 'Credit Score',
 'Insurance Duration',
 'Customer Feedback']

## Encoding

In [6]:
# Define the ordinal features with their respective orders
ordinal_features = ["Education Level", "Customer Feedback", "Exercise Frequency", "Policy Type"]
ordinal_categories = [
    ["High School", "Bachelor's", "Master's", "PhD"],  # Education Level
    ["Poor", "Average", "Good"],                      # Customer Feedback
    ["Rarely", "Monthly", "Weekly", "Daily"],          # Exercise Frequency
    ['Basic', 'Comprehensive', 'Premium']             # Policy Type
]

# Define nominal features
nominal_features = [
    "Gender", "Marital Status", "Occupation", 
    "Location", "Smoking Status", "Property Type"
]

In [7]:
#Encoded nominal features and set NaN vals to -1

#for feature in nominal_features: 
#    X_train[feature] = pd.Categorical(X_train[feature]).codes
#X_train

In [8]:
#Encoded Ordinal features and set NaN Vals to -1

def ordinal_encoder(df, ordinal_features, ordinal_categories):
    for feature in ordinal_features:
        df[feature] = pd.Categorical(df[feature], ordered=True, categories=ordinal_categories[ordinal_features.index(feature)]).codes
    return df

X_train = ordinal_encoder(X_train, ordinal_features, ordinal_categories)

#X_train["Education Level"] = pd.Categorical(X_train["Education Level"], ordered=True, categories= ordinal_categories[0]).codes

X_train

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,19.0,Female,10049.0,Married,1.0,1,Self-Employed,22.598761,Urban,2,2.0,17.0,372.0,5.0,2023,0,No,2,House
1,39.0,Female,31678.0,Divorced,3.0,2,,15.569731,Rural,1,1.0,12.0,694.0,2.0,2023,1,Yes,1,House
2,23.0,Male,25602.0,Divorced,3.0,0,Self-Employed,47.177549,Suburban,2,1.0,14.0,,3.0,2023,2,Yes,2,House
3,21.0,Male,141855.0,Married,2.0,1,,10.938144,Rural,0,1.0,0.0,367.0,1.0,2024,0,Yes,3,Apartment
4,21.0,Male,39651.0,Single,1.0,1,Self-Employed,20.376094,Rural,2,0.0,8.0,598.0,4.0,2021,0,Yes,2,House
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,36.0,Female,27316.0,Married,0.0,2,Unemployed,13.772907,Urban,2,,5.0,372.0,3.0,2023,0,No,3,Apartment
1199996,54.0,Male,35786.0,Divorced,,2,Self-Employed,11.483482,Rural,1,,10.0,597.0,4.0,2022,0,No,2,Apartment
1199997,19.0,Male,51884.0,Divorced,0.0,2,,14.724469,Suburban,0,0.0,19.0,,6.0,2021,2,No,1,Condo
1199998,55.0,Male,,Single,1.0,3,,18.547381,Suburban,2,1.0,7.0,407.0,4.0,2021,0,No,3,Apartment


# Model Handling

## Model Selection

In [9]:
categorical_features = nominal_features

In [10]:
#sample_size = 100000

#X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=sample_size)

X_train, x_val, y_train, y_val = train_test_split(X_train,y_train)

HGBR_model = HistGradientBoostingRegressor(categorical_features= categorical_features, random_state= 0)

HGBR_model.fit(X_train, y_train)

HGBR_pred = HGBR_model.predict(x_val)

## Model Evaluation

In [11]:
#validation Root Mean Squared Logarithmic Error
rmsle = root_mean_squared_log_error(y_val, HGBR_pred)
print("Root Mean Squared Logarithmic Error: ", rmsle)



Root Mean Squared Logarithmic Error:  1.132725671751376


In [12]:
#validation mean absolute error
mse = mean_absolute_error(y_val, HGBR_pred)
print(mse)


630.0456294126335


# Submission

In [13]:
#dropping test id column
test_data_id = test_data.pop("id")

In [14]:
#Changing the "Policy Start Date" of TEST DATA

test_data["Policy Start Date"] = pd.to_datetime(test_data["Policy Start Date"])
test_data["Policy Start Date"] = test_data["Policy Start Date"].dt.year
test_data["Policy Start Date"]
pd.to_numeric(test_data["Policy Start Date"])

test_data= ordinal_encoder(test_data, ordinal_features, ordinal_categories)

In [15]:
test_pred = HGBR_model.predict(test_data)

In [16]:
#Makes submission file
"""
submission = pd.DataFrame({'id': test_data_id, 'Premium Amount': test_pred})
submission"""

"\nsubmission = pd.DataFrame({'id': test_data_id, 'Premium Amount': test_pred})\nsubmission"

In [17]:
#submission.to_csv('submission.csv', index=False)