# Insurance Premium Prediction
This notebook implements a custom Linear Regression model to predict insurance premiums based on various customer features. The implementation includes L2 regularization for better generalization and uses gradient descent for optimization.

## Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression as SklearnLinearRegression
from sklearn.feature_selection import SelectKBest, f_regression

## Custom Linear Regression Implementation
Implementation of a custom Linear Regression model with L2 regularization using gradient descent optimization.

In [None]:
class LinearRegression:
    """Linear regression with an L2 penalty, trained by batch gradient descent.

    The model is ``y_hat = X @ weights + bias``. The cost minimised is
    ``(1 / (2 * n)) * (sum((y_hat - y) ** 2) + lambda_reg * sum(weights ** 2))``;
    the bias term is not regularised.

    Attributes set by ``fit``:
        weights: 1-D array with one coefficient per feature.
        bias: scalar intercept.
        cost_history: cost recorded at the start of every iteration of the
            most recent ``fit`` call.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, lambda_reg=0.1):
        """Store the hyper-parameters; no training happens here.

        Args:
            learning_rate: step size applied to each gradient update.
            n_iterations: maximum number of gradient-descent iterations.
            lambda_reg: strength of the L2 penalty on the weights.
        """
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.lambda_reg = lambda_reg
        self.weights = None
        self.bias = None
        self.cost_history = []

    def fit(self, X, y, tolerance=1e-6):
        """Fit weights and bias to ``X`` and ``y`` with gradient descent.

        Training stops early once the cost changes by less than ``tolerance``
        between two consecutive iterations.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: array-like holding n_samples targets; a column vector of shape
                (n_samples, 1) is flattened.
            tolerance: early-stopping threshold on the absolute cost change.

        Returns:
            The fitted model itself, so calls can be chained.

        Raises:
            ValueError: if ``X`` is not 2-D, contains no samples, or its
                number of rows differs from the number of targets.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so that an (n, 1) target cannot broadcast the residual
        # into an (n, n) matrix.
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty dataset")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} targets"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0
        # Start a fresh history so refitting does not concatenate cost curves.
        self.cost_history = []
        prev_cost = float('inf')

        for i in range(self.n_iterations):
            residuals = self._predict(X) - y
            cost = (1 / (2 * n_samples)) * (
                np.sum(residuals ** 2)
                + self.lambda_reg * np.sum(self.weights ** 2)
            )
            self.cost_history.append(cost)

            if abs(prev_cost - cost) < tolerance:
                print(f"Convergence reached at iteration {i}")
                break
            prev_cost = cost

            # Gradients of the cost with respect to weights and bias.
            dw = (1 / n_samples) * (
                np.dot(X.T, residuals) + self.lambda_reg * self.weights
            )
            db = (1 / n_samples) * np.sum(residuals)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

        return self

    def _predict(self, X):
        """Return ``X @ weights + bias`` using the current parameters."""
        return np.dot(X, self.weights) + self.bias

    def predict(self, X):
        """Return predictions for ``X``; the model must have been fitted."""
        return self._predict(X)

## Data Loading and Preprocessing
Load the insurance data and perform necessary preprocessing steps including:
- Feature extraction from dates
- Handling missing values
- One-hot encoding for categorical variables
- Train/validation split (80/20), followed by feature selection and scaling fitted on the training split only

In [None]:
def load_and_preprocess_data():
    """Load the training CSV and return scaled train/validation features.

    Steps:
        1. Read ``data/train.csv`` and separate the ``Premium Amount`` target.
        2. Replace ``Policy Start Date`` with year / month / day columns.
        3. Choose the 80/20 train/validation rows (``random_state=42``).
        4. Impute missing values with medians (numeric) and modes
           (categorical) computed on the training rows only.
        5. One-hot encode the categorical features.
        6. Keep the 10 best features by ``f_regression`` and standardise them;
           both transformers are fitted on the training rows only.

    Returns:
        Tuple ``(X_train_scaled, X_val_scaled, y_train, y_val, scaler)``. The
        feature matrices are NumPy arrays, the targets are pandas Series and
        ``scaler`` is the fitted ``StandardScaler``. The fitted feature
        selector is not returned.

    Raises:
        FileNotFoundError: if ``data/train.csv`` does not exist.
    """
    try:
        train_data = pd.read_csv('data/train.csv')
    except FileNotFoundError:
        print("Error: Data file not found. Please check your file paths.")
        raise

    X = train_data.drop(['Premium Amount', 'id'], axis=1)
    y = train_data['Premium Amount']

    # Unparseable dates become NaT, so the derived parts become NaN and are
    # imputed together with the other numeric columns below.
    start_date = pd.to_datetime(X['Policy Start Date'], errors='coerce')
    X['Start_Year'] = start_date.dt.year
    X['Start_Month'] = start_date.dt.month
    X['Start_Day'] = start_date.dt.day
    X = X.drop('Policy Start Date', axis=1)

    # Choose the split rows before computing any statistics so that the
    # imputation values below never see validation data.
    train_rows, val_rows = train_test_split(
        np.arange(len(X)), test_size=0.2, random_state=42
    )
    train_view = X.iloc[train_rows]

    numeric_columns = X.select_dtypes(include='number').columns
    categorical_columns = X.select_dtypes(include=['object']).columns

    for col in numeric_columns:
        X[col] = X[col].fillna(train_view[col].median())
    for col in categorical_columns:
        modes = train_view[col].mode()
        # An all-missing training column has no mode; leave it untouched
        # instead of failing on an empty result.
        if not modes.empty:
            X[col] = X[col].fillna(modes.iloc[0])

    categorical_features = ['Gender', 'Smoking Status', 'Location', 'Marital Status',
                            'Education Level', 'Occupation', 'Policy Type',
                            'Property Type', 'Customer Feedback', 'Exercise Frequency']
    # Encode the whole frame at once so both splits share the same columns.
    X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

    X_train, X_val = X.iloc[train_rows], X.iloc[val_rows]
    y_train, y_val = y.iloc[train_rows], y.iloc[val_rows]

    selector = SelectKBest(score_func=f_regression, k=10)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_val_selected = selector.transform(X_val)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_val_scaled = scaler.transform(X_val_selected)

    return X_train_scaled, X_val_scaled, y_train, y_val, scaler

# Load the training CSV and build the scaled train/validation feature matrices
# and targets; `scaler` is the StandardScaler fitted on the training features.
X_train, X_val, y_train, y_val, scaler = load_and_preprocess_data()

## Model Training and Evaluation
Train both our custom linear regression model and scikit-learn's implementation for comparison.

In [None]:
# Fit the gradient-descent model on the training split.
custom_model = LinearRegression(learning_rate=0.01, n_iterations=1000, lambda_reg=0.1)
custom_model.fit(X_train, y_train, tolerance=1e-6)

# Score the custom model on the validation split.
y_pred = custom_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

print('Custom Model Performance:')
for metric_name, metric_value in (('MSE', mse), ('RMSE', rmse), ('R2 Score', r2)):
    print(f'{metric_name}: {metric_value:.2f}')

# Baseline: scikit-learn's LinearRegression on the same splits.
sklearn_model = SklearnLinearRegression()
sklearn_model.fit(X_train, y_train)
sklearn_pred = sklearn_model.predict(X_val)
sklearn_r2 = r2_score(y_val, sklearn_pred)

print('Scikit-learn Model Performance:')
print(f'R2 Score: {sklearn_r2:.2f}')

## Visualization
Plot the training cost per iteration, then the actual versus predicted premiums on the validation set.

In [None]:
# Training curve: cost recorded at each gradient-descent iteration.
cost_fig, cost_ax = plt.subplots(figsize=(10, 6))
cost_ax.plot(custom_model.cost_history, label='Cost')
cost_ax.set_title('Cost Function over Iterations')
cost_ax.set_xlabel('Iteration')
cost_ax.set_ylabel('Cost')
cost_ax.legend()
cost_ax.grid(True)
plt.show()

# Validation scatter: points on the dashed diagonal are perfect predictions.
pred_fig, pred_ax = plt.subplots(figsize=(10, 6))
pred_ax.scatter(y_val, y_pred, alpha=0.5)
diagonal = [y_val.min(), y_val.max()]
pred_ax.plot(diagonal, diagonal, 'r--', lw=2)
pred_ax.set_title('Actual vs Predicted Insurance Premiums')
pred_ax.set_xlabel('Actual Premiums')
pred_ax.set_ylabel('Predicted Premiums')
pred_ax.grid(True)
plt.show()