In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# 1. Data Preparation: 

In [16]:
# Load dataset
df = pd.read_csv('../data/dataset.csv')

  df = pd.read_csv('../data/dataset.csv')


**Handling Missing Data:**
 Impute or remove missing values based on their nature and the quantity missing.

In [17]:
# Handle Missing Data (Drop rows with missing values or apply imputation as needed)
df = df.dropna() 

**Feature Engineering:**
 Create new features that might be relevant to TotalPremium and TotalClaims.


In [18]:
# Feature Engineering: Create new features based on domain knowledge (e.g., 'Claim Ratio')
df['ClaimRatio'] = df['TotalClaims'] / (df['TotalPremium'] + 1)  # Add 1 to avoid division by zero

**Encoding Categorical Data:**
 Convert categorical data into a numeric format using one-hot encoding or label encoding to make it suitable for modeling.

In [19]:
# Encoding Categorical Data
# Using Label Encoding for binary categorical columns (Gender, LegalType)
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df['LegalType'] = le.fit_transform(df['LegalType'])

In [20]:
# Using One-Hot Encoding for multi-class categorical columns (Province, PostalCode, etc.)
df = pd.get_dummies(df, columns=['Province', 'PostalCode', 'VehicleType'], drop_first=True)


In [21]:
# Select features and target variable (e.g., predicting TotalPremium or TotalClaims)
X = df.drop(columns=['TotalPremium', 'TotalClaims'])  # Features
y = df['TotalPremium']  # Target variable (or 'TotalClaims')

In [34]:
# Check shapes again
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Train-Test Split (80:20 ratio)
if not X.empty and not y.empty:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
else:
    print("X or y is empty. Please check your data preparation.")

Shape of X: (0, 48)
Shape of y: (0,)
X or y is empty. Please check your data preparation.


**Train-Test Split:**
 Divide the data into a training set (for building the model) and a test set (for validating the model), typically using a 70:30 or 80:20 ratio.

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Define your features and target variable
X = df.drop(columns=['TotalPremium', 'TotalClaims'], errors='ignore')  # Features
y = df['TotalPremium']  # Target variable

# Check for missing values in X and y
if X.isnull().values.any():
    print("Missing values found in features. Handling them...")
    # Option 1: Fill missing values with the mean (for numerical features)
    X = X.fillna(X.mean())
    # Option 2: Drop rows with missing values (if appropriate)
    # X = X.dropna()

if y.isnull().any():
    print("Missing values found in target variable. Handling them...")
    # Fill missing values in the target variable
    y = y.fillna(y.mean())  # Or you can choose to drop rows where y is NaN
    # y = y.dropna()

# Check if the data is empty after handling missing values
if X.empty or y.empty:
    print("X or y is empty after handling missing values. Please check your data preparation.")
else:
    # Train-Test Split (80:20 ratio)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Train-Test split successful.")
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)


X or y is empty after handling missing values. Please check your data preparation.


# 2. Model Building and Evaluation

2.1 Linear Regression


In [38]:
# Define your features and target variable
X = df.drop(columns=['TotalPremium', 'TotalClaims'], errors='ignore')  # Features
y = df['TotalPremium']  # Target variable

# Check for missing values in X and y
if X.isnull().values.any():
    print("Missing values found in features. Handling them...")
    # Fill missing values with the mean for numerical features
    X = X.fillna(X.mean())

if y.isnull().any():
    print("Missing values found in target variable. Handling them...")
    # Fill missing values in the target variable
    y = y.fillna(y.mean()) 

# Check if the data is empty after handling missing values
if X.empty or y.empty:
    print("X or y is empty after handling missing values. Please check your data preparation.")
else:
    # Train-Test Split (80:20 ratio)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Linear Regression Model
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    lr_predictions = lr_model.predict(X_test)

    # Evaluation
    mse = mean_squared_error(y_test, lr_predictions)
    r2 = r2_score(y_test, lr_predictions)
    print("Linear Regression MSE:", mse)
    print("Linear Regression R²:", r2)

    # Random Forest Model
    rf_model = RandomForestRegressor()
    rf_model.fit(X_train, y_train)
    rf_predictions = rf_model.predict(X_test)

    # Evaluation
    rf_mse = mean_squared_error(y_test, rf_predictions)
    rf_r2 = r2_score(y_test, rf_predictions)
    print("Random Forest MSE:", rf_mse)
    print("Random Forest R²:", rf_r2)

X or y is empty after handling missing values. Please check your data preparation.


# 2.2 Model Building
Implement Linear Regression, Random Forests, and XGBoost models


In [None]:
from xgboost import XGBRegressor

In [None]:
# 2.3 XGBoost
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)


In [None]:
# 3. Model Evaluation
def evaluate_model(true, pred, model_name):
    mse = mean_squared_error(true, pred)
    r2 = r2_score(true, pred)
    print(f'{model_name} - Mean Squared Error: {mse:.2f}, R2 Score: {r2:.2f}')

evaluate_model(y_test, lr_predictions, "Linear Regression")
evaluate_model(y_test, rf_predictions, "Random Forest")
evaluate_model(y_test, xgb_predictions, "XGBoost")

In [39]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

NameError: name 'X_train' is not defined

In [None]:
import shap