In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import sys
sys.path.append('../src')
from load_data import data_loader
from statistical_modeling import *

In [2]:
# Location of the raw CSV, relative to the notebook's working directory.
file_path = '../data/weekthree.csv'
# data_loader is the project helper imported from load_data (made importable by the
# sys.path.append('../src') in the first cell). Presumably it returns a pandas
# DataFrame, since later cells call DataFrame methods on `df` -- TODO confirm.
df = data_loader(file_path)

  data = pd.read_csv(file_path)


In [3]:
# Cleaning the data, expressed as a single chain so the cell reads top-down:
#   1. keep only the columns that have at most 5 missing values,
#   2. discard every row that still contains a NaN,
#   3. discard exact duplicate rows.
df = (
    df.loc[:, df.isnull().sum() <= 5]
    .dropna(axis=0, how='any')
    .drop_duplicates()
)

In [4]:
# Sanity check: total number of missing values left in the frame (summed over all columns).
df.isna().sum().sum()

np.int64(0)

In [5]:
# Standardize numerical features
# NOTE(review): the scaler is fit on the whole frame here, before the train/test split
# that happens in a later cell, so rows that end up in the test split contribute to the
# scaling statistics. The target column 'TotalPremium' is scaled as well, so any error
# metric computed on it downstream is presumably in standardized units -- confirm that
# this is intended.
numeric_cols = ['TotalPremium', 'TotalClaims']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [6]:
# Re-check after standardization: total count of missing values across the whole frame.
df.isna().sum().sum()

np.int64(0)

In [7]:
# Collect every column stored with the 'object' dtype; these are treated as categorical.
categorical_cols = df.select_dtypes(include='object').columns

# One-hot encode those columns. drop_first=True removes the first level of each
# encoded column, so k levels become k-1 indicator columns.
df = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

In [8]:
# Define the target variable
y = df['TotalPremium']

# Define features (every remaining column except the target)
X = df.drop(columns=['TotalPremium'])

# Split the data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on fixed-size random subsamples of each split for faster processing.
# The requested sizes are capped at the number of rows actually available, because
# DataFrame.sample(n=...) raises ValueError when n exceeds the population size
# (sampling is done without replacement).
train_sample_size = min(10000, len(X_train))
test_sample_size = min(2000, len(X_test))

# Sample the training data; the matching targets are looked up by index label
X_train_sample = X_train.sample(n=train_sample_size, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

# Sample the test split. 2000 rows is 20% of the 10000-row training sample,
# not 20% of the original test split.
X_test_sample = X_test.sample(n=test_sample_size, random_state=42)
y_test_sample = y_test.loc[X_test_sample.index]

# Check the sizes of the sampled sets
print(f"Original Data Size: {len(df)}")
print(f"Sampled Training Set Size: {len(X_train_sample)}")  # 10000 unless the training split is smaller
print(f"Sampled Test Set Size: {len(X_test_sample)}")  # 2000 unless the test split is smaller

Original Data Size: 998652
Sampled Training Set Size: 10000
Sampled Test Set Size: 2000


I tried to train on the whole dataset, but it requires too much memory, so I used samples of the training and test data instead.


In [9]:
# Run the linear regression helper on the sampled splits.
# linear_regression is not defined in this notebook; presumably it comes from the
# star import of statistical_modeling. Its result is unpacked as (model, MSE) --
# presumably the MSE is computed on the test sample passed in; verify in that module.
lr_model, lr_mse = linear_regression(
    X_train_sample,
    X_test_sample,
    y_train_sample,
    y_test_sample,
)

# Print the Mean Squared Error
print(f"Linear Regression MSE: {lr_mse}")

Linear Regression MSE: 9.500976616544357


In [10]:
# Run the random forest helper on the sampled splits.
# random_forest is not defined in this notebook; presumably it comes from the
# star import of statistical_modeling. Its result is unpacked as (model, MSE) --
# presumably the MSE is computed on the test sample passed in; verify in that module.
rf_model, rf_mse = random_forest(
    X_train_sample,
    X_test_sample,
    y_train_sample,
    y_test_sample,
)

# Print the rf Mean Squared Error
print(f"Random Forest MSE: {rf_mse}")

Random Forest MSE: 0.23318444707650912


In [None]:
# Run the XGBoost helper on the sampled splits.
# xgboost_model is not defined in this notebook; presumably it comes from the
# star import of statistical_modeling. Its result is unpacked as (model, MSE) --
# presumably the MSE is computed on the test sample passed in; verify in that module.
xgb_model, xgb_mse = xgboost_model(
    X_train_sample,
    X_test_sample,
    y_train_sample,
    y_test_sample,
)

# Print the xgb Mean Squared Error
print(f"XGBoost MSE: {xgb_mse}")