## Car Insurance Claims Prediction Using Random Forest
  - Class Imbalance corrected as well

Dataset Source: https://www.kaggle.com/datasets/ifteshanajnin/carinsuranceclaimprediction-classification?select=train.csv

##### Import Necessary Libraries

In [0]:
import re

import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, TimestampType, FloatType, BooleanType

from pyspark.ml import Pipeline
from pyspark.ml.feature import VarianceThresholdSelector, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import MinMaxScaler, RobustScaler, MaxAbsScaler, StandardScaler, VarianceThresholdSelector, IndexToString

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

##### Versions of Libraries, Modules, Frameworks Used in This Project

In [0]:
print("Apache Spark version:", spark.version)

Apache Spark version: 3.3.1


#### Create Functions Used Throughout This Project

##### Create Function to Ingest Dataset

In [0]:
def ingest_dataset(file_location: str, \
                   schema: StructType \
                  ) -> pyspark.sql.dataframe.DataFrame:
    '''
    This function returns a dataset aligned with the schema
    '''
    file_type = "csv"
    infer_schema = "false"
    first_row_is_header = "true"
    delimiter = ","
    
    df = spark.read.format(file_type) \
      .option("inferSchema", infer_schema) \
      .option("header", first_row_is_header) \
      .option("sep", delimiter) \
      .schema(schema)\
      .load(file_location)
    
    return df

##### Function to Remove Class Imbalance

In [0]:
def balance_dataset(dataset, unique_label_values, new_schema, samples = 20000, label_col = "label"):
    temp_df =  spark.createDataFrame([], new_schema)
    new_df =  spark.createDataFrame([], new_schema)
    
    for ulab in unique_label_values:
        # extract df of only the desired labels
        temp_df = dataset.where(F.col(label_col).isin(ulab))
        
        ratio = round(samples/temp_df.count(), 4)
        # sample it to desired number of samples
        if ratio > 1.0:
            # Oversample
            temp_df = temp_df.sample(True, ratio, seed=42)
        elif ratio < 1.0:
            # Undersample
            temp_df = temp_df.sample(False, ratio, seed=42)
        elif ratio == 1.0:
            # Just use the existing dataframe
            pass
        # concatenate it to the new_df
        new_df = new_df.union(temp_df)
        # unpersist the temp_df
        temp_df.unpersist()
    return new_df

##### Create Function to Create Pipeline

In [0]:
def prepare_data(dataset: pyspark.sql.dataframe.DataFrame, \
                         all_features: [str], \
                         categorical_columns: [str], \
                         numerical_columns: [str], \
                         label = "label" \
                ) -> {}:
    '''
    This function creates the pipeline stages for classification analysis.
    '''
    
    # Define variables used within function
    stages = []
        
    # Prepare categorical features
    for cat in categorical_columns:
        stringIndexer = StringIndexer(inputCol = cat, outputCol = cat + "_index")
        encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],\
                                outputCols=[cat + "_class_vec"])
        stages += [stringIndexer]
        stages += [encoder]
    
    # Use the vector assembler
    assembler_inputs = [c + "_class_vec" for c in categorical_columns] + numerical_columns
    assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
    stages += [assembler]
    
    # using the pipeline created above, prepare the data for the model
    data_pipe = Pipeline().setStages(stages)
    data_fitted = data_pipe.fit(dataset)
    prepped_ds = data_fitted.transform(dataset)
    
    return prepped_ds

##### Create Metrics Evaluation Function

In [0]:
def evaluate_with_spark_metrics(metrics: [str], \
                                model_name: str \
                               ) -> None:
    '''
    Calculate & display metrics for a multiclass classification analysis.
    '''
    print("+---------------------------------------------+")
    print("|  " + model_name.center(41) + "  |")
    print("+---------------------------------------------+")
    print("|   %s  |  %s   |" % ("Metric".rjust(20), "Value".ljust(14)))
    print("+---------------------------------------------+")
    for x in metrics:
        evaluator = MulticlassClassificationEvaluator(labelCol="label", \
                                                      predictionCol="prediction", \
                                                      metricName=x) 
        score = evaluator.evaluate(predictions)
        print("|   %s  |  %s   |" % (x.rjust(20), str(round(score, 6)).ljust(14)))
        print("+---------------------------------------------+")

#### Ingest & Preprocess Datasets

##### Ingest Training Dataset

In [0]:
ds_file = "/FileStore/tables/train-1.csv"

ds_schema = StructType([
    StructField("policy_id", StringType(), True),
    StructField("policy_tenure", FloatType(), True),
    StructField("age_of_car", FloatType(), True),
    StructField("age_of_policyholder", FloatType(), True),
    StructField("area_cluster", StringType(), True),
    StructField("population_density", IntegerType(), True),
    StructField("make", IntegerType(), True),
    StructField("segment", StringType(), True),
    StructField("model", StringType(), True),
    StructField("fuel_type", StringType(), True),
    StructField("max_torque", StringType(), True),
    StructField("max_power", StringType(), True),
    StructField("engine_type", StringType(), True),
    StructField("airbags", IntegerType(), True),
    StructField("is_esc", StringType(), True),
    StructField("is_adjustable_steering", StringType(), True),
    StructField("is_tpms", StringType(), True),
    StructField("is_parking_sensors", StringType(), True),
    StructField("is_parking_camera", StringType(), True),
    StructField("rear_brakes_type", StringType(), True),
    StructField("displacement", IntegerType(), True),
    StructField("cylinder", IntegerType(), True),
    StructField("transmission_type", StringType(), True),
    StructField("gear_box", IntegerType(), True),
    StructField("steering_type", StringType(), True),
    StructField("turning_radius", FloatType(), True), 
    StructField("length", IntegerType(), True),
    StructField("width", IntegerType(), True),
    StructField("height", IntegerType(), True),
    StructField("gross_weight", IntegerType(), True),
    StructField("is_front_fog_lights", StringType(), True),
    StructField("is_rear_window_wiper", StringType(), True),
    StructField("is_rear_window_washer", StringType(), True),
    StructField("is_rear_window_defogger", StringType(), True),
    StructField("is_brake_assist", StringType(), True),
    StructField("is_power_door_locks", StringType(), True),
    StructField("is_central_locking", StringType(), True),
    StructField("is_power_steering", StringType(), True),
    StructField("is_driver_seat_height_adjustable", StringType(), True),
    StructField("is_day_night_rear_view_mirror", StringType(), True),
    StructField("is_ecw", StringType(), True),
    StructField("is_speed_alert", StringType(), True),
    StructField("ncap_rating", IntegerType(), True),
    StructField("label", IntegerType(), True),
])

ds = ingest_dataset(ds_file, ds_schema)

print(ds.printSchema())

root
 |-- policy_id: string (nullable = true)
 |-- policy_tenure: float (nullable = true)
 |-- age_of_car: float (nullable = true)
 |-- age_of_policyholder: float (nullable = true)
 |-- area_cluster: string (nullable = true)
 |-- population_density: integer (nullable = true)
 |-- make: integer (nullable = true)
 |-- segment: string (nullable = true)
 |-- model: string (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- max_torque: string (nullable = true)
 |-- max_power: string (nullable = true)
 |-- engine_type: string (nullable = true)
 |-- airbags: integer (nullable = true)
 |-- is_esc: string (nullable = true)
 |-- is_adjustable_steering: string (nullable = true)
 |-- is_tpms: string (nullable = true)
 |-- is_parking_sensors: string (nullable = true)
 |-- is_parking_camera: string (nullable = true)
 |-- rear_brakes_type: string (nullable = true)
 |-- displacement: integer (nullable = true)
 |-- cylinder: integer (nullable = true)
 |-- transmission_type: string (nullable

##### Data Preprocessing

In [0]:
all_col_names = ds.columns

columns_to_update = [x for x in all_col_names if re.search("^is_", str(x))]

replacement_values = {"Yes" : "1", "No" : "0"}
ds = ds.replace(replacement_values, subset=columns_to_update)

ds = ds.drop("policy_id")

display(ds)

policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,max_torque,max_power,engine_type,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,rear_brakes_type,displacement,cylinder,transmission_type,gear_box,steering_type,turning_radius,length,width,height,gross_weight,is_front_fog_lights,is_rear_window_wiper,is_rear_window_washer,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,label
0.5158736,0.05,0.6442308,C1,4990,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,0,0,0,1,0,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,0,0,0,0,0,0,0,1,0,0,0,1,0,0
0.6726185,0.02,0.375,C2,27003,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,0,0,0,1,0,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,0,0,0,0,0,0,0,1,0,0,0,1,0,0
0.8411102,0.02,0.3846154,C3,4076,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,0,0,0,1,0,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,0,0,0,0,0,0,0,1,0,0,0,1,0,0
0.90027654,0.11,0.43269232,C4,21622,1,C1,M2,Petrol,113Nm@4400rpm,88.50bhp@6000rpm,1.2 L K12N Dualjet,2,1,1,0,1,1,Drum,1197,4,Automatic,5,Electric,4.8,3995,1735,1515,1335,1,0,0,1,1,1,1,1,1,1,1,1,2,0
0.5964028,0.11,0.63461536,C5,34738,2,A,M3,Petrol,91Nm@4250rpm,67.06bhp@5500rpm,1.0 SCe,2,0,0,0,0,1,Drum,999,3,Automatic,5,Electric,5.0,3731,1579,1490,1155,0,0,0,0,0,1,1,1,0,1,1,1,2,0
1.0187086,0.07,0.5192308,C6,13051,3,C2,M4,Diesel,250Nm@2750rpm,113.45bhp@4000rpm,1.5 L U2 CRDi,6,1,1,1,1,1,Disc,1493,4,Automatic,6,Power,5.2,4300,1790,1635,1720,1,1,1,1,1,1,1,1,1,0,1,1,3,0
0.09799216,0.16,0.40384614,C7,6112,4,B2,M5,Diesel,200Nm@3000rpm,88.77bhp@4000rpm,1.5 Turbocharged Revotorq,2,0,1,0,1,0,Drum,1497,4,Manual,5,Electric,5.0,3990,1755,1523,1490,0,0,0,0,0,1,1,1,0,0,1,1,5,0
0.50908464,0.14,0.42307693,C8,8794,1,B2,M6,Petrol,113Nm@4400rpm,88.50bhp@6000rpm,K Series Dual jet,2,0,1,0,1,0,Drum,1197,4,Manual,5,Electric,4.8,3845,1735,1530,1335,1,0,0,0,1,1,1,1,1,1,1,1,2,0
0.28239352,0.07,0.29807693,C7,6112,3,C2,M4,Diesel,250Nm@2750rpm,113.45bhp@4000rpm,1.5 L U2 CRDi,6,1,1,1,1,1,Disc,1493,4,Automatic,6,Power,5.2,4300,1790,1635,1720,1,1,1,1,1,1,1,1,1,0,1,1,3,0
0.56625545,0.04,0.44230768,C9,17804,1,B2,M7,Petrol,113Nm@4400rpm,88.50bhp@6000rpm,1.2 L K Series Engine,6,1,1,0,1,1,Drum,1197,4,Automatic,5,Electric,4.85,3990,1745,1500,1410,1,1,1,1,1,1,1,1,1,1,1,1,0,0


Output can only be rendered in Databricks

##### Remove Class Imbalance

In [0]:
bal_schema = StructType([
    StructField("policy_tenure", FloatType(), True),
    StructField("age_of_car", FloatType(), True),
    StructField("age_of_policyholder", FloatType(), True),
    StructField("area_cluster", StringType(), True),
    StructField("population_density", IntegerType(), True),
    StructField("make", IntegerType(), True),
    StructField("segment", StringType(), True),
    StructField("model", StringType(), True),
    StructField("fuel_type", StringType(), True),
    StructField("max_torque", StringType(), True),
    StructField("max_power", StringType(), True),
    StructField("engine_type", StringType(), True),
    StructField("airbags", IntegerType(), True),
    StructField("is_esc", StringType(), True),
    StructField("is_adjustable_steering", StringType(), True),
    StructField("is_tpms", StringType(), True),
    StructField("is_parking_sensors", StringType(), True),
    StructField("is_parking_camera", StringType(), True),
    StructField("rear_brakes_type", StringType(), True),
    StructField("displacement", IntegerType(), True),
    StructField("cylinder", IntegerType(), True),
    StructField("transmission_type", StringType(), True),
    StructField("gear_box", IntegerType(), True),
    StructField("steering_type", StringType(), True),
    StructField("turning_radius", FloatType(), True), 
    StructField("length", IntegerType(), True),
    StructField("width", IntegerType(), True),
    StructField("height", IntegerType(), True),
    StructField("gross_weight", IntegerType(), True),
    StructField("is_front_fog_lights", StringType(), True),
    StructField("is_rear_window_wiper", StringType(), True),
    StructField("is_rear_window_washer", StringType(), True),
    StructField("is_rear_window_defogger", StringType(), True),
    StructField("is_brake_assist", StringType(), True),
    StructField("is_power_door_locks", StringType(), True),
    StructField("is_central_locking", StringType(), True),
    StructField("is_power_steering", StringType(), True),
    StructField("is_driver_seat_height_adjustable", StringType(), True),
    StructField("is_day_night_rear_view_mirror", StringType(), True),
    StructField("is_ecw", StringType(), True),
    StructField("is_speed_alert", StringType(), True),
    StructField("ncap_rating", IntegerType(), True),
    StructField("label", IntegerType(), True),
])

unique_label_values = [0, 1]

df = balance_dataset(ds, \
                    unique_label_values, \
                    bal_schema, \
                    10000)
                     
display(df)

policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,max_torque,max_power,engine_type,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,rear_brakes_type,displacement,cylinder,transmission_type,gear_box,steering_type,turning_radius,length,width,height,gross_weight,is_front_fog_lights,is_rear_window_wiper,is_rear_window_washer,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,label
0.50908464,0.14,0.42307693,C8,8794,1,B2,M6,Petrol,113Nm@4400rpm,88.50bhp@6000rpm,K Series Dual jet,2,0,1,0,1,0,Drum,1197,4,Manual,5,Electric,4.8,3845,1735,1530,1335,1,0,0,0,1,1,1,1,1,1,1,1,2,0
0.24142125,0.12,0.47115386,C9,17804,5,C1,M9,Diesel,200Nm@1750rpm,97.89bhp@3600rpm,i-DTEC,2,0,1,0,1,1,Drum,1498,4,Manual,5,Electric,4.9,3995,1695,1501,1051,1,0,0,1,0,1,1,1,1,1,1,1,4,0
0.30170158,0.05,0.40384614,C11,6108,3,C2,M4,Diesel,250Nm@2750rpm,113.45bhp@4000rpm,1.5 L U2 CRDi,6,1,1,1,1,1,Disc,1493,4,Automatic,6,Power,5.2,4300,1790,1635,1720,1,1,1,1,1,1,1,1,1,0,1,1,3,0
0.79657847,0.02,0.46153846,C1,4990,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,0,0,0,1,0,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,0,0,0,0,0,0,0,1,0,0,0,1,0,0
0.8447417,0.04,0.39423078,C3,4076,4,B2,M5,Diesel,200Nm@3000rpm,88.77bhp@4000rpm,1.5 Turbocharged Revotorq,2,0,1,0,1,0,Drum,1497,4,Manual,5,Electric,5.0,3990,1755,1523,1490,0,0,0,0,0,1,1,1,0,0,1,1,5,0
1.0557545,0.1,0.9230769,C8,8794,1,B2,M6,Petrol,113Nm@4400rpm,88.50bhp@6000rpm,K Series Dual jet,2,0,1,0,1,0,Drum,1197,4,Manual,5,Electric,4.8,3845,1735,1530,1335,1,0,0,0,1,1,1,1,1,1,1,1,2,0
0.16781501,0.0,0.48076922,C5,34738,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,0,0,0,1,0,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,0,0,0,0,0,0,0,1,0,0,0,1,0,0
0.9430377,0.12,0.40384614,C14,7788,1,B2,M6,Petrol,113Nm@4400rpm,88.50bhp@6000rpm,K Series Dual jet,2,0,1,0,1,0,Drum,1197,4,Manual,5,Electric,4.8,3845,1735,1530,1335,1,0,0,0,1,1,1,1,1,1,1,1,2,0
0.24536183,0.04,0.32692307,C15,290,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,0,0,0,1,0,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,0,0,0,0,0,0,0,1,0,0,0,1,0,0
0.9766327,0.04,0.39423078,C14,7788,1,B2,M6,Petrol,113Nm@4400rpm,88.50bhp@6000rpm,K Series Dual jet,2,0,1,0,1,0,Drum,1197,4,Manual,5,Electric,4.8,3845,1735,1530,1335,1,0,0,0,1,1,1,1,1,1,1,1,2,0


#### Create & Fit/Train Data Pipeline

##### Separate Features into Lists Based on Respective Data Type

In [0]:
all_features = [x for x in df.columns]
numerical_features = [x for x in df.columns if df.select(x).distinct().count() > 8]

for x in ['model', 'max_torque', 'max_power', 'engine_type', 'area_cluster']:
    numerical_features.remove(x)

numerical_features.append("ncap_rating")

categorical_features = [x for x in df.columns if x not in numerical_features]
categorical_features.remove("label")

##### Fit & Transform Data Pipeline

In [0]:
prepped_ds = prepare_data(df, all_features, categorical_features, numerical_features, label="label")

##### Split Dataset into Training & Testing Datasets (80/20)

In [0]:
train_ds, test_ds = prepped_ds.randomSplit(weights=[0.80, 0.20], seed=42)

train_ds = train_ds.persist()
test_ds = test_ds.persist()

print(f"Samples in Training Dataset: {train_ds.count()}")
print(f"Samples in Testing Dataset: {test_ds.count()}")

Samples in Training Dataset: 16321
Samples in Testing Dataset: 4029


##### Define Random Forest Classifier Model For Training

In [0]:
rf = RandomForestClassifier(featuresCol = 'features', \
                            labelCol = 'label')

##### Train Random Forest Classifier Model

In [0]:
rf_model = rf.fit(train_ds)

##### Predictions Using Testing Dataset

In [0]:
predictions = rf_model.transform(test_ds)

#### Model Evaluation

##### Prepare Predictions for Evaluation Function

In [0]:
preds_in_pandas = predictions.select(\
                     F.col('prediction').alias("prediction"), \
                     F.col('label').alias("label"))

preds_in_pandas = preds_in_pandas\
                    .withColumn("prediction", F.col("prediction").cast(IntegerType()))\
                    .withColumn("label", F.col("label").cast(IntegerType()))

display(preds_in_pandas)

prediction,label
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0


In [0]:
metrics_for_evaluation = ["accuracy", "f1", "weightedPrecision", "weightedRecall", "weightedFMeasure"]

evaluate_with_spark_metrics(metrics_for_evaluation, "Insurance Predictions with Random Forest")

+---------------------------------------------+
|   Insurance Predictions with Random Forest  |
+---------------------------------------------+
|                 Metric  |  Value            |
+---------------------------------------------+
|               accuracy  |  0.627203         |
+---------------------------------------------+
|                     f1  |  0.62587          |
+---------------------------------------------+
|      weightedPrecision  |  0.628237         |
+---------------------------------------------+
|         weightedRecall  |  0.627203         |
+---------------------------------------------+
|       weightedFMeasure  |  0.62587          |
+---------------------------------------------+


##### End Spark Session

In [0]:
train_ds = train_ds.unpersist()
test_ds = test_ds.unpersist()

spark.stop()

### Notes & Other Takeaways From This Project
****
- This is interesting. Even though the Random Forest Model on the imbalanced classes achieved far superior values for accuracy, f1, precision and recall, there was a bit of a difference between the weighted recall and weighted precision (~0.935 to ~0.875, respectively). The true question is whether that difference is enough to remove the class imbalance prior to training.
****