In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
import warnings
warnings.filterwarnings("ignore")


### Step 1: Understanding the Data

#### 1.1 Load and Explore the Dataset

In [None]:
# Load the raw training data, test data and sample submission from ../data/raw.
# NOTE(review): `sample_sumbission` looks like a misspelling of "submission";
# the name is left untouched here because other cells may refer to it.
train_df=pd.read_csv("../data/raw/train.csv")
test_df=pd.read_csv("../data/raw/test.csv")
sample_sumbission=pd.read_csv("../data/raw/sample_submission.csv")

In [None]:
# Quick size check: (rows, columns) of the training frame.
print(train_df.shape)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Summary statistics for the training frame.
train_df.describe()

#### 1.2 Perform Exploratory Data Analysis (EDA)


In [None]:
# Report how many rows (data points) and columns (features) each frame has.
for label, frame in (("Train Shape", train_df), ("Test Shape", test_df)):
    print(label, frame.shape)

In [None]:
# Count the missing entries in every column of the training frame.
missing_values = train_df.isna().sum()
missing_values

In [None]:
# Partition the training columns into numerical and categorical feature lists
# based on their dtype.
categorical_cols = list(train_df.select_dtypes(include=["object"]).columns)
numerical_cols = list(train_df.select_dtypes(include=["int64", "float64"]).columns)

# The target is what the models predict, so it is not a feature.
numerical_cols.remove("Premium Amount")

print("Numerical Features", numerical_cols)
print("Categorical Features", categorical_cols)

In [None]:
# Histogram grid: one panel per numerical feature (30 bins each).
numeric_frame = train_df[numerical_cols]
numeric_frame.hist(bins=30, figsize=(15, 10))
plt.suptitle("Distribution of Numerical Features", fontsize=16)
plt.show()

In [None]:
# Count plot per categorical feature, restricted to its ten most frequent values
# so that high-cardinality columns stay readable.
for col in categorical_cols:
    top_categories = train_df[col].value_counts().nlargest(10).index
    top_rows = train_df[train_df[col].isin(top_categories)]

    plt.figure(figsize=(6, 4))
    sns.countplot(x=col, data=top_rows)
    plt.title(f"Top 10 Distribution of {col}")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Histogram (50 bins) with a KDE overlay for the target column.
premium_amount = train_df["Premium Amount"]
plt.figure(figsize=(8, 5))
sns.histplot(premium_amount, bins=50, kde=True)
plt.title("Distribution of Insurance Premium")
plt.show()

In [None]:
# Pairwise correlation of the numerical features, drawn as an annotated heatmap.
correlation_matrix = train_df[numerical_cols].corr()
heatmap_options = {
    "annot": True,
    "cmap": "coolwarm",
    "fmt": ".2f",
    "annot_kws": {"size": 12, "rotation": 45},  # keep the cell labels readable
    "square": True,
}

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Heatmap", fontsize=16)
plt.tight_layout()
plt.show()


### Step 2: Data Preprocessing

#### 2.1 Handle Missing Values

In [None]:
# Fill numerical missing values with the training-set median.
# The median is computed from train_df only and reused for test_df, so the test
# data never influences the imputation value.
# The filled column is assigned back instead of calling
# `fillna(..., inplace=True)` on a column selection: the in-place form acts on
# an intermediate object and, with pandas Copy-on-Write, leaves the frame
# unchanged (and the resulting warning is suppressed in this notebook).
for col in numerical_cols:
    median_val = train_df[col].median()
    train_df[col] = train_df[col].fillna(median_val)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(median_val)

In [None]:
# Fill categorical missing values with the most frequent training-set value.
# The mode is taken from train_df only and reused for test_df.
# Columns are assigned back rather than filled with `inplace=True` on a column
# selection, which does not reliably modify the frame (pandas Copy-on-Write).
for col in categorical_cols:
    modes = train_df[col].mode()
    if modes.empty:
        # Column is entirely missing in train_df: there is no mode to fill with.
        continue
    mode_val = modes[0]
    train_df[col] = train_df[col].fillna(mode_val)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(mode_val)

In [None]:
# Sanity check: per-column count of values still missing in train_df.
print("Missing values after imputation:\n", train_df.isnull().sum())

In [None]:
# Normalise column dtypes in both frames.

# Numeric features: force a numeric dtype; unparseable entries become NaN.
for col in numerical_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    if col not in test_df.columns:
        continue
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

# Dates: parse the policy start date; unparseable entries become NaT.
if 'Policy Start Date' in train_df.columns:
    for frame in (train_df, test_df):
        frame['Policy Start Date'] = pd.to_datetime(frame['Policy Start Date'], errors='coerce')

# Categorical features: keep as plain object columns. The date column is
# skipped so the datetime conversion above is not undone.
for col in categorical_cols:
    if col == 'Policy Start Date':
        continue
    if col in train_df.columns:
        train_df[col] = train_df[col].astype('object')
    if col in test_df.columns:
        test_df[col] = test_df[col].astype('object')


In [None]:
# Reduce skew in selected numerical features with log(1 + x).
# NOTE: this rewrites the columns, so running the cell twice applies the
# transform twice.
skewed_cols = ['Annual Income', 'Health Score']
for frame in (train_df, test_df):
    for col in skewed_cols:
        if col in frame.columns:
            frame[col] = np.log1p(frame[col])

In [None]:
# Box plot of 'Previous Claims' to inspect its spread before outlier handling.
sns.boxplot(x=train_df['Previous Claims'])
plt.show()

In [None]:
# Cap 'Previous Claims' at the IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# The fences are derived from train_df and then applied to test_df as well.
if 'Previous Claims' in train_df.columns:
    previous_claims = train_df['Previous Claims']
    Q1, Q3 = previous_claims.quantile(0.25), previous_claims.quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    train_df['Previous Claims'] = previous_claims.clip(lower, upper)
    if 'Previous Claims' in test_df.columns:
        test_df['Previous Claims'] = test_df['Previous Claims'].clip(lower, upper)

#### 2.2 Convert Categorical Variables to Numerical Form

In [None]:
# Binary categorical columns (Label Encoding).
# A fresh encoder is fitted per column on train_df, and a test column is only
# transformed with the encoder fitted on that same column, so a mapping left
# over from a previous column can never be applied by accident.
binary_cols = ['Gender', 'Smoking Status']
for col in binary_cols:
    if col in train_df.columns:
        le = LabelEncoder()
        train_df[col] = le.fit_transform(train_df[col])
        if col in test_df.columns:
            test_df[col] = le.transform(test_df[col])

# Ordinal columns: map each level to its rank. Values missing from the
# mapping become NaN.
education_order = {"High School":1, "Bachelor's":2, "Master's":3, "PhD":4}
exercise_order = {"Rarely":1, "Monthly":2, "Weekly":3, "Daily":4}

if 'Education Level' in train_df.columns:
    train_df['Education Level'] = train_df['Education Level'].map(education_order)
    test_df['Education Level'] = test_df['Education Level'].map(education_order)
if 'Exercise Frequency' in train_df.columns:
    train_df['Exercise Frequency'] = train_df['Exercise Frequency'].map(exercise_order)
    test_df['Exercise Frequency'] = test_df['Exercise Frequency'].map(exercise_order)

# Multi-class categorical columns (One-Hot Encoding).
# Both frames are given the category list observed in train_df before
# encoding. Without this, `drop_first=True` could drop a different baseline
# category in each frame whenever their sets of values differ, silently
# mis-encoding the test rows.
multi_cols = ['Marital Status', 'Occupation', 'Location', 'Property Type', 'Policy Type']
train_multi_cols = [col for col in multi_cols if col in train_df.columns]
for col in train_multi_cols:
    train_categories = sorted(train_df[col].dropna().unique())
    train_df[col] = pd.Categorical(train_df[col], categories=train_categories)
    if col in test_df.columns:
        test_df[col] = pd.Categorical(test_df[col], categories=train_categories)

train_df = pd.get_dummies(train_df, columns=train_multi_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=[col for col in multi_cols if col in test_df.columns], drop_first=True)

# Align train and test columns.
# The target is left out unless test_df already has it; reindexing on every
# train column would add a fake all-zero 'Premium Amount' column to test_df.
aligned_columns = [
    col for col in train_df.columns
    if col != 'Premium Amount' or col in test_df.columns
]
test_df = test_df.reindex(columns=aligned_columns, fill_value=0)

# Drop unnecessary columns (ignored when a column is already absent).
unused_columns = ['Policy Start Date', 'Customer Feedback']
train_df = train_df.drop(columns=unused_columns, errors='ignore')
test_df = test_df.drop(columns=unused_columns, errors='ignore')


#### 2.3  Split the Data into Training and Evaluation Sets


In [None]:
# Separate the target from the features, then hold out 20% of the rows for
# evaluation (fixed seed for a repeatable split).
y = train_df['Premium Amount']
X = train_df.drop('Premium Amount', axis=1)

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

for label, features, target in (
    ("Training set:", X_train, y_train),
    ("Evaluation set:", X_eval, y_eval),
):
    print(label, features.shape, target.shape)

In [None]:
# Handle any remaining missing values in X_train / X_eval.
# Fill values are always derived from X_train (median for int64/float64
# columns, mode otherwise) and applied to both splits, so the evaluation split
# does not influence the imputation. A column is processed when either split
# still has gaps. Columns are assigned back instead of using
# `fillna(..., inplace=True)` on a column selection, which does not reliably
# modify the frame (pandas Copy-on-Write).
for col in X_train.columns:
    if not (X_train[col].isnull().any() or X_eval[col].isnull().any()):
        continue
    if X_train[col].dtype in ['float64', 'int64']:
        fill_value = X_train[col].median()
    else:
        modes = X_train[col].mode()
        if modes.empty:
            # Nothing observed in X_train for this column; no value to fill with.
            continue
        fill_value = modes[0]
    X_train[col] = X_train[col].fillna(fill_value)
    X_eval[col] = X_eval[col].fillna(fill_value)

#### 2.4  Feature Scaling

In [None]:
# Standardise the numerical features. The scaler learns its statistics from
# X_train only and is then applied unchanged to X_eval and test_df.
candidate_num_cols = [
    'Age', 'Annual Income', 'Number of Dependents', 'Health Score',
    'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
]
num_cols = [col for col in candidate_num_cols if col in X_train.columns]

scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_eval[num_cols] = scaler.transform(X_eval[num_cols])

# NOTE: test_df is overwritten here, so re-running the cell scales it again.
test_num_cols = [col for col in num_cols if col in test_df.columns]
test_df[test_num_cols] = scaler.transform(test_df[test_num_cols])

### Step 3: Model Development

#### 3.1 Choose Regression Models

##### Linear Regression

In [None]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_eval)
results.append(evaluate_model(y_eval, y_pred_lr, "Linear Regression"))


In [None]:
def evaluate_model(y_true, y_pred, model_name="Linear Regression"):
    y_true_clipped = np.maximum(y_true, 0)
    y_pred_clipped = np.maximum(y_pred, 0)
    return {
        "Model": model_name,
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": mean_squared_error(y_true, y_pred),
        "RMSE": mean_squared_error(y_true, y_pred, squared=False),
        "RMSLE": np.sqrt(mean_squared_log_error(y_true_clipped, y_pred_clipped)),
        "R2": r2_score(y_true, y_pred)
    }

results = []


##### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with default hyperparameters and a fixed seed; its metrics on
# the evaluation split are appended to `results`.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_eval)
dt_metrics = evaluate_model(y_eval, y_pred_dt, "Decision Tree")
results.append(dt_metrics)

##### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with a reduced tree count and depth; its metrics on the
# evaluation split are appended to `results`.
rf_params = {
    "n_estimators": 50,   # fewer trees
    "max_depth": 10,      # cap the depth of each tree
    "n_jobs": -1,         # use all CPU cores
    "random_state": 42,
}
rf_model = RandomForestRegressor(**rf_params).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_eval)
rf_metrics = evaluate_model(y_eval, y_pred_rf, "Random Forest")
results.append(rf_metrics)

##### XGBoost Regressor

In [None]:
import xgboost as xgb

# Gradient-boosted trees; the metrics on the evaluation split are appended to
# `results`.
xgb_params = {
    "n_estimators": 100,       # lower this if training is still slow
    "max_depth": 6,            # cap the depth of each tree
    "learning_rate": 0.1,
    "subsample": 0.8,          # 80% of the rows per tree
    "colsample_bytree": 0.8,   # 80% of the features per tree
    "tree_method": "hist",     # histogram method, faster on large datasets
    "n_jobs": -1,              # use all CPU cores
    "random_state": 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_eval)
xgb_metrics = evaluate_model(y_eval, y_pred_xgb, "XGBoost")
results.append(xgb_metrics)


In [None]:

# Tabulate the collected metrics, lowest RMSE first, for easy comparison.
results_df = pd.DataFrame(results).sort_values(by="RMSE")
print(results_df)

In [None]:
# The table is sorted by RMSE, so its first row is the best-performing model.
best_model_name = results_df["Model"].iloc[0]
print(f"\nBest-performing model: {best_model_name}")

### Step 4: ML Pipeline & MLflow Integration


#### 4.1 Build an ML Pipeline

In [None]:
#%pip install mlflow


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import joblib
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Evaluation function
def evaluate_model(y_true, y_pred, model_name=None):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.
    model_name : str, optional
        When given, the result also carries it under a leading "Model" key.
        This keeps three-argument calls made elsewhere in the notebook working
        after this definition replaces the earlier helper of the same name.
        When omitted, the result holds numeric metrics only.

    Returns
    -------
    dict
        "MAE", "MSE", "RMSE", "RMSLE" and "R2" (plus "Model" if requested).
    """
    # The log-based metric is undefined for negative inputs, so both arrays
    # are floored at zero for RMSLE only; the other metrics use raw values.
    y_true_clipped = np.clip(y_true, 0, None)
    y_pred_clipped = np.clip(y_pred, 0, None)
    mse = mean_squared_error(y_true, y_pred)
    metrics = {
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": mse,
        # Taken as sqrt(MSE) rather than `squared=False`, which newer
        # scikit-learn releases no longer accept.
        "RMSE": float(np.sqrt(mse)),
        "RMSLE": np.sqrt(mean_squared_log_error(y_true_clipped, y_pred_clipped)),
        "R2": r2_score(y_true, y_pred)
    }
    if model_name is not None:
        return {"Model": model_name, **metrics}
    return metrics

In [None]:
# Split features and target, then hold out 20% of the rows for evaluation
# (fixed seed for a repeatable split).
y = train_df["Premium Amount"]
X = train_df.drop(columns=["Premium Amount"])

X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [None]:
# Identify numerical & categorical columns by dtype.
# NOTE(review): columns of any other dtype (for example bool) end up in
# neither list; confirm that is intended.
categorical_features = list(X.select_dtypes(include=["object"]).columns)
numerical_features = list(X.select_dtypes(include=["int64", "float64"]).columns)

In [None]:
# Preprocessing pipelines
# Numerical features are standardised.
numerical_transformer = Pipeline([
    ("scaler", StandardScaler())
])

# Categorical features are one-hot encoded; categories not seen during fitting
# are encoded as all zeros instead of raising.
categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Columns in neither feature list are passed through unchanged. With the
# default remainder="drop" they would be silently discarded, which would
# remove any bool indicator columns (such as those `pd.get_dummies` creates)
# from the model input.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ],
    remainder="passthrough"
)

In [None]:
# Candidate regressors for the pipeline comparison, keyed by display name.
# The tree-based models are kept shallow and small.
linear_regression = LinearRegression()
decision_tree = DecisionTreeRegressor(max_depth=5, random_state=42)
random_forest = RandomForestRegressor(
    n_estimators=50, max_depth=5, n_jobs=-1, random_state=42
)
xgboost_regressor = xgb.XGBRegressor(
    n_estimators=50, max_depth=3, n_jobs=-1, random_state=42, verbosity=0
)

models = {
    "Linear Regression": linear_regression,
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "XGBoost": xgboost_regressor,
}

#### 4.2 Track Experiments with MLflow

In [None]:
# Train models, evaluate, and log in MLflow
# One MLflow run is opened per entry in `models`. Each run records the
# estimator's parameters, its evaluation metrics, a "Model" tag and the fitted
# pipeline. The metrics are also collected in `pipeline_results`.
pipeline_results = []

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Preprocessing and estimator are chained so they are fitted and logged
    # together. Every pipeline receives the same `preprocessor` object.
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    
    with mlflow.start_run(run_name=name):
        # Fit on the training split and predict the evaluation split
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_eval)
        
        # Evaluate: `metrics` is the dict returned by evaluate_model
        metrics = evaluate_model(y_eval, y_pred)
        pipeline_results.append({"Model": name, **metrics})
        
        # Log the estimator's hyperparameters
        if hasattr(model, "get_params"):
            mlflow.log_params(model.get_params())
        
        # Log metrics
        mlflow.log_metrics(metrics)
        
        # Tag the run with the model's display name
        mlflow.set_tag("Model", name)
        
        # Signature and input example, built from the first five evaluation
        # rows and the pipeline's predictions for them
        X_input_example = X_eval.iloc[:5]
        signature = infer_signature(X_input_example, pipe.predict(X_input_example))
        
        # Log the fitted pipeline under "<Model_Name>_pipeline"
        # NOTE(review): the `name=` keyword presumably needs a recent MLflow
        # release (older ones use `artifact_path=`); confirm the pinned version.
        mlflow.sklearn.log_model(
            sk_model=pipe,
            name=name.replace(" ", "_") + "_pipeline",
            signature=signature,
            input_example=X_input_example
        )


In [None]:
# Compare models: tabulate the pipeline metrics, lowest RMSE first.
pipeline_results_df = pd.DataFrame(pipeline_results)
pipeline_results_df = pipeline_results_df.sort_values(by="RMSE")
print("\nEvaluation metrics for all models:")
print(pipeline_results_df)


In [None]:
# Save best pipeline
# The results table is sorted by RMSE, so its first row names the best model.
best_model_name = pipeline_results_df.iloc[0]["Model"]
print(f"\nBest model selected: {best_model_name}")

# Rebuild and refit the winning pipeline on the training split, because the
# pipelines created inside the training loop are not kept.
best_model = models[best_model_name]
best_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", best_model)
])
best_pipe.fit(X_train, y_train)

# Persist the fitted pipeline (preprocessing + model) in the working directory.
joblib.dump(best_pipe, "best_model.pkl")
# The check mark was previously stored as mis-decoded bytes and printed as garbage.
print("\n✅ Best model pipeline saved as best_model.pkl")
