# Master Notebook: Predicting Settlement Values

This notebook provides a self-contained, reproducible pipeline for:
- **Data Ingestion & Preprocessing**
- **Model Training & Hyperparameter Tuning**
- **Fairness Analysis**
- **Explainability with SHAP**
- **Interactive Prediction Demo**

> Adjust file paths and column names as needed to match your project structure.


## 1. Data Ingestion & Preprocessing


In [14]:
import pandas as pd
import numpy as np

# Load dataset (update path as needed)
raw_data_path = '../data/Synthetic_Data_For_Students.csv'
df_raw = pd.read_csv(raw_data_path)
df_raw.head()


Unnamed: 0,SettlementValue,AccidentType,Injury_Prognosis,SpecialHealthExpenses,SpecialReduction,SpecialOverage,GeneralRest,SpecialAdditionalInjury,SpecialEarningsLoss,SpecialUsageLoss,...,Accident Date,Claim Date,Vehicle Age,Driver Age,Number of Passengers,Accident Description,Injury Description,Police Report Filed,Witness Present,Gender
0,520.0,Rear end,E. 5 months,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2023-11-10 11:22:24.508901,2024-06-11 11:22:24.508901,13.0,33.0,4.0,Side collision at an intersection.,Whiplash and minor bruises.,Yes,Yes,Male
1,870.0,Rear end,B. 2 months,0.0,0.0,0.0,520.0,0.0,0.0,90.0,...,2023-06-25 00:55:01.140228,2024-01-09 00:55:01.140228,4.0,45.0,2.0,Side collision at an intersection.,Minor cuts and scrapes.,Yes,Yes,Female
2,2140.0,Other side pulled out of side road,G. 7 months,0.0,0.0,0.0,1400.0,0.0,0.0,0.0,...,2020-02-23 17:43:47.805561,2020-03-01 17:43:47.805561,9.0,45.0,4.0,Lost control on a snowy road.,Whiplash and minor bruises.,Yes,No,Female
3,520.0,Rear end - Clt pushed into next vehicle,D. 4 months,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2021-10-02 04:36:32.118423,2021-10-13 04:36:32.118423,5.0,62.0,1.0,Side collision at an intersection.,Minor cuts and scrapes.,Yes,Yes,Female
4,260.0,Rear end,C. 3 months,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2023-04-02 05:13:07.117423,2023-04-14 05:13:07.117423,9.0,78.0,1.0,Lost control on a snowy road.,Concussion and bruised ribs.,Yes,Yes,Other


## 2. Feature Engineering & Preprocessing

Apply imputation, scaling, and encoding to numeric and categorical features.


In [15]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- 1. Load & inspect ---
raw_data_path = '../data/Synthetic_Data_For_Students.csv'   # ← adjust path/name as needed
df_raw = pd.read_csv(raw_data_path)

# --- 2. Clean column names & list them ---
df_raw.columns = df_raw.columns.str.strip()
print("Columns in dataset:", df_raw.columns.tolist())

# --- 3. Auto-detect & set target column ---
possible_targets = [c for c in df_raw.columns if 'settlement' in c.lower()]
if len(possible_targets) == 1:
    target_col = possible_targets[0]
    print(f"Using '{target_col}' as the target column.")
else:
    raise ValueError(f"Ambiguous/No target found. Please set `target_col` manually from: {possible_targets}")

# --- 4. Define numeric & categorical features ---
num_cols = df_raw.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove(target_col)
cat_cols = df_raw.select_dtypes(include=['object', 'category']).columns.tolist()

# --- 5. Build & apply preprocessing pipeline ---
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), num_cols),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), cat_cols)
])

X = df_raw.drop(columns=[target_col])
y = df_raw[target_col].values
X_processed = preprocessor.fit_transform(X)
print('Processed feature matrix shape:', X_processed.shape)


Columns in dataset: ['SettlementValue', 'AccidentType', 'Injury_Prognosis', 'SpecialHealthExpenses', 'SpecialReduction', 'SpecialOverage', 'GeneralRest', 'SpecialAdditionalInjury', 'SpecialEarningsLoss', 'SpecialUsageLoss', 'SpecialMedications', 'SpecialAssetDamage', 'SpecialRehabilitation', 'SpecialFixes', 'GeneralFixed', 'GeneralUplift', 'SpecialLoanerVehicle', 'SpecialTripCosts', 'SpecialJourneyExpenses', 'SpecialTherapy', 'Exceptional_Circumstances', 'Minor_Psychological_Injury', 'Dominant injury', 'Whiplash', 'Vehicle Type', 'Weather Conditions', 'Accident Date', 'Claim Date', 'Vehicle Age', 'Driver Age', 'Number of Passengers', 'Accident Description', 'Injury Description', 'Police Report Filed', 'Witness Present', 'Gender']
Using 'SettlementValue' as the target column.
Processed feature matrix shape: (5000, 8120)


## 3. Model Training & Hyperparameter Tuning


In [16]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Try to import XGBoost, otherwise fallback
try:
    from xgboost import XGBRegressor
    use_xgb = True
    print("Using XGBRegressor from xgboost")
except ModuleNotFoundError:
    XGBRegressor = GradientBoostingRegressor
    use_xgb = False
    print("xgboost not found; falling back to GradientBoostingRegressor")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# Configure models and parameter grids
models = {
    'DecisionTree': (
        DecisionTreeRegressor(random_state=42),
        {'max_depth': [3, 5, 7]}
    ),
    # Use XGB if available, otherwise GradientBoosting
    'XGB' if use_xgb else 'GradBoost': (
        XGBRegressor(random_state=42, verbosity=0) if use_xgb 
        else GradientBoostingRegressor(random_state=42),
        {'n_estimators': [50, 100], 'max_depth': [3, 5]} 
        if use_xgb else {'n_estimators': [50, 100], 'max_depth': [3, 5]}
    ),
    'RandomForest': (
        RandomForestRegressor(random_state=42),
        {'n_estimators': [50, 100], 'max_depth': [5, 10]}
    )
}

best_models = {}
for name, (model, grid) in models.items():
    search = GridSearchCV(
        model, grid,
        scoring='neg_mean_absolute_percentage_error',
        cv=5, n_jobs=-1
    )
    search.fit(X_train, y_train)
    best = search.best_estimator_
    best_models[name] = best
    
    preds = best.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, preds)
    print(f"{name}: MAPE={mape:.4f}, Best Params={search.best_params_}")


xgboost not found; falling back to GradientBoostingRegressor


ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\aasri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aasri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\aasri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\tree\_classes.py", line 1377, in fit
    super()._fit(
  File "C:\Users\aasri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\tree\_classes.py", line 252, in _fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\aasri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 648, in _validate_data
    y = check_array(y, input_name="y", **check_y_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\aasri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\utils\validation.py", line 1064, in check_array
    _assert_all_finite(
  File "C:\Users\aasri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\utils\validation.py", line 123, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "C:\Users\aasri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\utils\validation.py", line 172, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input y contains NaN.


## 4. Fairness Analysis

Compute MAPE across subgroups (age × gender).


In [None]:
# Reconstruct test DataFrame
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

# If sparse matrix, convert to dense
X_test_arr = X_test.toarray() if hasattr(X_test, 'toarray') else X_test

# Column names after preprocessing
cat_feats = preprocessor.named_transformers_['cat'] \
    .named_steps['encoder'].get_feature_names_out(cat_cols)
feature_names = num_cols + list(cat_feats)

df_test = pd.DataFrame(X_test_arr, columns=feature_names)
df_test['true_settlement'] = y_test

# Retrieve subgroup columns from original df
# (ensure indices are aligned; if not, reset/reindex appropriately)
orig_test = df_raw.iloc[y_test.index] if hasattr(y_test, 'index') else df_raw.iloc[X_test_arr.shape[0]*-1:]
df_test['age'] = orig_test['age'].values
df_test['gender'] = orig_test['gender'].values

# Compute subgroup MAPE for XGBoost
for (age_val, gender_val), grp in df_test.groupby(['age', 'gender']):
    preds = best_models['XGBoost'].predict(preprocessor.transform(grp[feature_names]))
    mape = mean_absolute_percentage_error(grp['true_settlement'], preds)
    print(f"XGBoost MAPE (age={age_val}, gender={gender_val}): {mape:.4f}")


## 5. Explainability

Generate SHAP summaries for the XGBoost model.


In [None]:
import shap

# Create an explainer
explainer = shap.Explainer(best_models['XGBoost'], X_train)
shap_values = explainer(X_test)

# Visualise
shap.summary_plot(shap_values, features=X_test, feature_names=feature_names)


## 6. Interactive Prediction Demo


In [None]:
def predict_and_explain(record: dict):
    """
    Given a dict of feature values, returns prediction and SHAP values.
    """
    df_rec = pd.DataFrame([record])
    X_rec = preprocessor.transform(df_rec)
    pred = best_models['XGBoost'].predict(X_rec)[0]
    shap_vals = explainer(X_rec)
    print(f"Prediction: {pred:.2f}")
    display(shap_vals)  # Jupyter will render SHAP output
    return pred, shap_vals

# Example usage:
# sample = {col: X_test[0, idx] for idx, col in enumerate(feature_names)}
# predict_and_explain(sample)
