In [1]:
import pandas as pd
from scipy import stats
from scipy.stats import f_oneway
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [2]:
# Location of the cleaned dataset. The default is the original author's local
# path; set the CAR_INSURANCE_DATA_PATH environment variable to run this
# notebook from another machine or checkout without editing the cell.
DATA_PATH = os.environ.get(
    'CAR_INSURANCE_DATA_PATH',
    'C:/Users/teeyob/Car_Insurance_Risk_Premium_Optimization/data/cleaned_data.csv',
)

# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a single column.
df = pd.read_csv(DATA_PATH, low_memory=False)


In [4]:
# Column the models are trained to predict.
target = 'TotalPremium'

# Candidate predictor columns, written as contiguous groups. The group names
# are only a reading aid inferred from the column names; concatenating the
# groups in this order yields the full feature list.
_client_columns = [
    'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
    'Bank', 'AccountType', 'MaritalStatus', 'Gender',
]
_location_columns = [
    'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone',
]
_vehicle_columns = [
    'ItemType', 'mmcode', 'VehicleType', 'RegistrationYear', 'make', 'Model',
    'Cylinders', 'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
    'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser',
    'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff',
    'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet',
]
_policy_columns = [
    'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm',
    'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
    'Product', 'StatutoryClass', 'StatutoryRiskType',
]
_claims_columns = ['TotalClaims']

features = [
    *_client_columns,
    *_location_columns,
    *_vehicle_columns,
    *_policy_columns,
    *_claims_columns,
]


In [5]:

# Keep only rows where the target and every candidate feature are present.
# A single dropna over the combined subset removes the same rows as dropping
# on the target first and on the features second.
required_columns = [target, *features]
df = df.dropna(subset=required_columns)

# Design matrix and response taken from the filtered frame.
X, y = df[features], df[target]





In [6]:

# Columns listed here are handed to the scaling branch of the preprocessor
# built from these lists.
numeric_features = [
    'RegistrationYear', 'Cylinders', 'cubiccapacity', 'kilowatts',
    'NumberOfDoors', 'CustomValueEstimate', 'CapitalOutstanding',
    'NumberOfVehiclesInFleet', 'SumInsured', 'TermFrequency',
    'CalculatedPremiumPerTerm', 'ExcessSelected', 'TotalClaims',
]

# Columns listed here are handed to the one-hot encoding branch.
categorical_features = [
    'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
    'Bank', 'AccountType', 'MaritalStatus', 'Gender',
    'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone',
    'ItemType', 'mmcode', 'VehicleType', 'make', 'Model', 'bodytype',
    'VehicleIntroDate', 'AlarmImmobiliser', 'TrackingDevice',
    'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder',
    'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product',
    'StatutoryClass', 'StatutoryRiskType',
]

# Preprocessing driven by the explicit column lists: standardise the numeric
# columns and one-hot encode the categorical ones. handle_unknown='ignore'
# lets the encoder accept categories at predict time that it did not see
# during fit.
# NOTE(review): the model cells further down reassign `preprocessor` from
# dtype-based column selection before fitting, so this object does not appear
# to be fitted anywhere in the visible notebook - confirm which one is intended.
_column_steps = [
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=_column_steps)


LinearRegression

In [6]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Route columns by dtype: int64/float64 columns are standardised and object
# columns are one-hot encoded.
# NOTE(review): a column of any other dtype (e.g. bool) ends up in neither
# list, and ColumnTransformer's default remainder='drop' would discard it -
# confirm that is intended.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Preprocessing and the linear model are fitted together on the training split.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")




Mean Squared Error: 14993.029087703813
R^2 Score: 0.45654155937646723


RandomForestRegressor

In [9]:

# Route columns by dtype: int64/float64 columns are standardised and object
# columns are one-hot encoded.
# NOTE(review): a column of any other dtype (e.g. bool) ends up in neither
# list, and ColumnTransformer's default remainder='drop' would discard it -
# confirm that is intended.
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# The forest draws bootstrap samples and random feature orderings while it is
# built, so without a seed the metrics below change on every run. Seeding it
# with the same value as the train/test split makes the cell reproducible.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor(random_state=42))])

# Same 80/20 split and seed as the other model cells, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Score the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

XGBRegressor

In [8]:

# Derive the column groups inside this cell instead of relying on variables
# left behind by another cell, so the cell runs on its own given X and y.
# int64/float64 columns are standardised; object columns are one-hot encoded.
# NOTE(review): a column of any other dtype (e.g. bool) ends up in neither
# list, and ColumnTransformer's default remainder='drop' would discard it -
# confirm that is intended.
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Preprocessing and the gradient-boosted model are fitted together on the
# training split.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor())
])

# Same 80/20 split and seed as the other model cells, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Score the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)


Mean Squared Error: 10070.250430220149
R^2 Score: 0.6349795252525547


DecisionTreeRegressor

In [10]:

# Route columns by dtype: int64/float64 columns are standardised and object
# columns are one-hot encoded.
# NOTE(review): a column of any other dtype (e.g. bool) ends up in neither
# list, and ColumnTransformer's default remainder='drop' would discard it -
# confirm that is intended.
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits can be broken differently from run to run. Seeding the
# tree with the same value as the train/test split makes the cell reproducible.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', DecisionTreeRegressor(random_state=42))])

# Same 80/20 split and seed as the other model cells, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pipeline.fit(X_train, y_train)


y_pred = pipeline.predict(X_test)


# Score the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")



Mean Squared Error: 5178.4757797858065
R^2 Score: 0.8122936762393664
