In [1]:
# Fix imports when running from notebooks/ folder
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
# It is put at the front of sys.path (once) so the `src` and `scripts` imports resolve.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

print(f"Added to path: {project_root}")

# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

Added to path: /Users/elshaday/DEV/10Academy/insurance-risk-analysis-week3


In [2]:
from src import DataManager, DataPreparer
from scripts.constants import Columns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd 

In [3]:
# Load the dataset through the project's DataManager. Running this cell also prints
# a "Basic Data Info" DataFrame summary (see the cell output).
# NOTE(review): load_clean=True presumably selects the already-cleaned data file —
# confirm against DataManager.load_csv.
dm = DataManager()
clean_df = dm.load_csv(load_clean=True)

Basic Data Info:

<class 'pandas.core.frame.DataFrame'>
Index: 846034 entries, 0 to 1000097
Data columns (total 54 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   UnderwrittenCoverID          846034 non-null  int64         
 1   PolicyID                     846034 non-null  int64         
 2   TransactionMonth             846034 non-null  datetime64[ns]
 3   IsVATRegistered              846034 non-null  bool          
 4   Citizenship                  846034 non-null  object        
 5   LegalType                    846034 non-null  object        
 6   Title                        846034 non-null  object        
 7   Language                     846034 non-null  object        
 8   Bank                         846034 non-null  object        
 9   AccountType                  846034 non-null  object        
 10  MaritalStatus                846034 non-null  object        
 11  Gender      

### 1. Claim Severity Prediction

In [6]:
# Step 1: Data Preparation
# Build the modelling inputs for claim severity from the cleaned data. When run, this
# cell reports the skewness of TotalClaims and that the target was log-transformed
# into TotalClaims_log (see the cell output).
# NOTE(review): the four returned values are presumably the feature frame, the
# log-scale target, a preprocessing transformer and the prepared frame — confirm
# against DataPreparer.prepare_for_linear_regression.

data_preparer = DataPreparer()
X_severity, y_log_severity, preprocessor_lr, severity_df = (
    data_preparer.prepare_for_linear_regression(
        df=clean_df, target_col=Columns.TotalClaims.value
    )
)

Skewness of TotalClaims: 3.8463379940832976
Transformed TotalClaims to Log: TotalClaims_log


In [7]:
# Inspect the row count, columns and dtypes of the frame returned by the preparation step.
severity_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2437 entries, 284 to 999701
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Province                  2437 non-null   object 
 1   Gender                    2437 non-null   object 
 2   make                      2437 non-null   object 
 3   Model                     2437 non-null   object 
 4   RegistrationYear          2437 non-null   int64  
 5   Cylinders                 2437 non-null   float64
 6   cubiccapacity             2437 non-null   float64
 7   kilowatts                 2437 non-null   float64
 8   NumberOfDoors             2437 non-null   float64
 9   CustomValueEstimate       2437 non-null   float64
 10  SumInsured                2437 non-null   float64
 11  CalculatedPremiumPerTerm  2437 non-null   float64
 12  TotalPremium              2437 non-null   float64
 13  TotalClaims               2437 non-null   float64
 14  TotalClai

In [8]:
# Step 2: Train/Test Data Split
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.

severity_split = train_test_split(
    X_severity,
    y_log_severity,
    test_size=0.3,
    random_state=42,
)
(
    X_train_severity,
    X_test_severity,
    y_train_log_severity,
    y_test_log_severity,
) = severity_split

A. Linear Regression Training

In [9]:
# Step 3: Chain the preprocessing transformer with a linear regressor
lr_steps = [
    ("preprocessor", preprocessor_lr),
    ("regressor", LinearRegression()),
]
pipeline_lr = Pipeline(steps=lr_steps)

# Step 4: Fit the pipeline on the training split (the target is on the log scale)
pipeline_lr.fit(X=X_train_severity, y=y_train_log_severity)

# Step 5: Predict on the held-out split, then undo the log transform on both the
# predictions and the held-out target so they can be compared as claim amounts
y_pred_log_severity = pipeline_lr.predict(X_test_severity)
y_pred_severity = np.exp(y_pred_log_severity)
y_test_severity = np.exp(y_test_log_severity)



B. Model Evaluation

In [12]:
# Step 6: Evaluation
# Score the back-transformed predictions against the actual claim amounts.
mse = mean_squared_error(y_test_severity, y_pred_severity)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_severity, y_pred_severity)
r2 = r2_score(y_test_severity, y_pred_severity)

evaluation_report = (
    f"\nLinear Regression Model Evaluation:\nRMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.3f}"
)
print(evaluation_report)

# Step 7: Show the first ten actual vs. predicted claim amounts side by side
comparison_columns = {"Actual": y_test_severity, "Predicted": y_pred_severity}
results_df = pd.DataFrame(comparison_columns)
print(f"\nActual vs Predicted:\n {results_df.head(10)}")


Linear Regression Model Evaluation:
RMSE: 52511.65, MAE: 17229.91, R2: -0.926

Actual vs Predicted:
                Actual     Predicted
659185    6140.350877   2506.130911
339580    6140.350877   3411.777030
221314     850.000000  12620.953357
887626   79578.017544  12969.004656
419806  247735.964912  30647.251031
168321     721.166667   1456.975583
86843      750.649123   1347.670692
696932   56913.052632  24054.543999
86888      750.877193   1251.146820
668153   23151.920351  28192.072433


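Linear regression performs very poorly here: the negative R² (-0.926) means its predictions on the original claim scale are worse than simply predicting the mean claim amount for every policy.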

In [None]:
# Claim Probability Prediction
#
# The quantity being modelled is whether a policy has a claim at all, so the
# continuous TotalClaims amount is turned into a binary indicator. Stratifying on
# the raw amount is not usable: train_test_split requires every stratification
# class to have at least two members, and a continuous column contains many
# one-off values. Stratifying on the binary indicator instead keeps the share of
# claim rows the same in the train and test splits.
#
# NOTE: the `porbability` spelling in the split variable names is kept as-is so
# that any cells referring to these names keep working.

probability_subset_df = clean_df.copy()

# Features: every column except the claim amount the target is derived from.
X = probability_subset_df.drop([Columns.TotalClaims.value], axis=1)
# Target: 1 when the row has a positive claim amount, 0 otherwise.
y = (probability_subset_df[Columns.TotalClaims.value] > 0).astype(int)

X_train_porbability, X_test_porbability, y_train_porbability, y_test_porbability = (
    train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
)