In [3]:
import pandas as pd 
import numpy as np 


In [5]:
# Seed NumPy's legacy global RNG so the np.random.* draws that follow are repeatable.
SEED = 42
np.random.seed(SEED)

In [7]:
# --- Simulation configuration ------------------------------------------------
# Number of synthetic policy records to generate (one per customer).
num_customers = 3000

# Categorical levels sampled for each policy.
vehicle_types = ["Hatchback", "Sedan", "SUV", "Luxury"]
policy_types = ["Third Party", "Comprehensive"]
regions = ["North", "South", "East", "West"]

# Daily calendar (both end points included) from which start dates are drawn.
start_date, end_date = "2023-01-01", "2024-12-31"
dates = pd.date_range(start_date, end_date, freq="D")




In [9]:
# Loop-invariant lookup tables and sampling weights, built once up front.
base_premium_by_vehicle = {
    "Hatchback": 8000,
    "Sedan": 10000,
    "SUV": 13000,
    "Luxury": 18000,
}
vehicle_type_weights = [0.35, 0.30, 0.25, 0.10]  # aligned with vehicle_types
policy_type_weights = [0.4, 0.6]                 # aligned with policy_types
discount_levels = [0, 5, 10, 15]                 # percent off the premium
discount_weights = [0.45, 0.25, 0.20, 0.10]      # aligned with discount_levels

policy_term_days = 365  # defined here but not used anywhere in this cell

# One record per synthetic policy. For a given seed the generated values depend
# on the order of the np.random calls below, so that order must stay as it is.
data = []
for i in range(num_customers):
    policy_id = f"POL{i+10001}"

    start_day = np.random.choice(dates)
    vehicle_type = np.random.choice(vehicle_types, p=vehicle_type_weights)
    policy_type = np.random.choice(policy_types, p=policy_type_weights)
    region = np.random.choice(regions)  # no weights given: uniform over regions

    # randint's upper bound is exclusive: vehicle_age is 0..14, driver_age is 21..64.
    vehicle_age = np.random.randint(0, 15)
    driver_age = np.random.randint(21, 65)

    # Risk adjustments added on top of the per-vehicle-type base rate.
    base_premium = base_premium_by_vehicle[vehicle_type]
    vehicle_age_loading = vehicle_age * 300
    young_driver_loading = (driver_age < 25) * 2000
    comprehensive_loading = (policy_type == "Comprehensive") * 2500
    premium = (
        base_premium
        + vehicle_age_loading
        + young_driver_loading
        + comprehensive_loading
    )

    # Percentage discount applied to the risk-adjusted premium.
    discount_pct = np.random.choice(discount_levels, p=discount_weights)
    final_premium = premium * (1 - discount_pct / 100)

    # Expected claims: a uniformly drawn 40-80% share of the discounted premium.
    claim_share = np.random.uniform(0.4, 0.8)
    expected_claim_cost = final_premium * claim_share

    record = [
        policy_id,
        start_day,
        vehicle_type,
        policy_type,
        region,
        vehicle_age,
        driver_age,
        premium,
        discount_pct,
        final_premium,
        expected_claim_cost,
    ]
    data.append(record)




In [11]:
# Column labels, listed in the same order as the fields of each record in `data`.
# NOTE(review): the generation loop puts the risk-adjusted `premium` (base rate
# plus loadings) in the slot labelled "base_premium" -- confirm the label is intended.
policy_columns = [
    "policy_id",
    "policy_start_date",
    "vehicle_type",
    "policy_type",
    "region",
    "vehicle_age",
    "driver_age",
    "base_premium",
    "discount_pct",
    "final_premium",
    "expected_claim_cost",
]

# Build the policy table from the accumulated records in a single step.
df = pd.DataFrame(data, columns=policy_columns)

df.head()

Unnamed: 0,policy_id,policy_start_date,vehicle_type,policy_type,region,vehicle_age,driver_age,base_premium,discount_pct,final_premium,expected_claim_cost
0,POL10001,2023-04-13,SUV,Third Party,West,12,41,16600,0,16600.0,7675.803615
1,POL10002,2023-11-27,Sedan,Third Party,West,7,23,14100,0,14100.0,11110.291566
2,POL10003,2024-05-06,Luxury,Third Party,West,4,53,19200,0,19200.0,11710.129395
3,POL10004,2024-07-14,Sedan,Third Party,West,11,36,13300,0,13300.0,5802.026232
4,POL10005,2024-07-16,Sedan,Comprehensive,North,2,57,13100,5,12445.0,5826.869088


In [13]:
# Print the frame's column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   policy_id            3000 non-null   object        
 1   policy_start_date    3000 non-null   datetime64[ns]
 2   vehicle_type         3000 non-null   object        
 3   policy_type          3000 non-null   object        
 4   region               3000 non-null   object        
 5   vehicle_age          3000 non-null   int64         
 6   driver_age           3000 non-null   int64         
 7   base_premium         3000 non-null   int64         
 8   discount_pct         3000 non-null   int32         
 9   final_premium        3000 non-null   float64       
 10  expected_claim_cost  3000 non-null   float64       
dtypes: datetime64[ns](1), float64(2), int32(1), int64(3), object(4)
memory usage: 246.2+ KB


In [15]:
# Count missing values column by column.
missing_per_column = df.isna().sum()
missing_per_column

policy_id              0
policy_start_date      0
vehicle_type           0
policy_type            0
region                 0
vehicle_age            0
driver_age             0
base_premium           0
discount_pct           0
final_premium          0
expected_claim_cost    0
dtype: int64

In [17]:
# Keep only rows that pass both sanity checks:
#   * driver_age within 18-75 inclusive (drops unrealistic ages)
#   * final_premium strictly positive (drops zero as well as negative values)
plausible_age = df["driver_age"].between(18, 75)
positive_premium = df["final_premium"] > 0

# .copy() makes the filtered result an independent frame, so the columns added
# in later cells are written to it directly rather than to a slice of the
# unfiltered data (avoids pandas' chained-assignment pitfalls).
df = df.loc[plausible_age & positive_premium].copy()
df.head()

Unnamed: 0,policy_id,policy_start_date,vehicle_type,policy_type,region,vehicle_age,driver_age,base_premium,discount_pct,final_premium,expected_claim_cost
0,POL10001,2023-04-13,SUV,Third Party,West,12,41,16600,0,16600.0,7675.803615
1,POL10002,2023-11-27,Sedan,Third Party,West,7,23,14100,0,14100.0,11110.291566
2,POL10003,2024-05-06,Luxury,Third Party,West,4,53,19200,0,19200.0,11710.129395
3,POL10004,2024-07-14,Sedan,Third Party,West,11,36,13300,0,13300.0,5802.026232
4,POL10005,2024-07-16,Sedan,Comprehensive,North,2,57,13100,5,12445.0,5826.869088


In [19]:
# Derive the calendar year in which each policy starts.
start_dates = df["policy_start_date"]
df["policy_year"] = start_dates.dt.year
df.head()

Unnamed: 0,policy_id,policy_start_date,vehicle_type,policy_type,region,vehicle_age,driver_age,base_premium,discount_pct,final_premium,expected_claim_cost,policy_year
0,POL10001,2023-04-13,SUV,Third Party,West,12,41,16600,0,16600.0,7675.803615,2023
1,POL10002,2023-11-27,Sedan,Third Party,West,7,23,14100,0,14100.0,11110.291566,2023
2,POL10003,2024-05-06,Luxury,Third Party,West,4,53,19200,0,19200.0,11710.129395,2024
3,POL10004,2024-07-14,Sedan,Third Party,West,11,36,13300,0,13300.0,5802.026232,2024
4,POL10005,2024-07-16,Sedan,Comprehensive,North,2,57,13100,5,12445.0,5826.869088,2024


In [21]:
# Underwriting margin = premium actually charged minus the expected claim cost.
df["underwriting_margin"] = df["final_premium"].sub(df["expected_claim_cost"])
df.head()

Unnamed: 0,policy_id,policy_start_date,vehicle_type,policy_type,region,vehicle_age,driver_age,base_premium,discount_pct,final_premium,expected_claim_cost,policy_year,underwriting_margin
0,POL10001,2023-04-13,SUV,Third Party,West,12,41,16600,0,16600.0,7675.803615,2023,8924.196385
1,POL10002,2023-11-27,Sedan,Third Party,West,7,23,14100,0,14100.0,11110.291566,2023,2989.708434
2,POL10003,2024-05-06,Luxury,Third Party,West,4,53,19200,0,19200.0,11710.129395,2024,7489.870605
3,POL10004,2024-07-14,Sedan,Third Party,West,11,36,13300,0,13300.0,5802.026232,2024,7497.973768
4,POL10005,2024-07-16,Sedan,Comprehensive,North,2,57,13100,5,12445.0,5826.869088,2024,6618.130912


In [23]:
# Save RAW (reference)
# NOTE(review): by the time this cell runs, `df` has already been filtered and
# given its derived columns, so this "raw" file has the same content as the
# processed export -- confirm whether a pre-cleaning snapshot was intended.
raw_csv_path = "C:/Users/Abhi/Desktop/Python_Data_Analytics_Projects/car_insurance_pricing_analytics/data/raw/car_insurance_policy_data.csv"
df.to_csv(raw_csv_path, index=False)

In [27]:
# Save the processed dataset; the row index is left out of the file.
processed_csv_path = "C:/Users/Abhi/Desktop/Python_Data_Analytics_Projects/car_insurance_pricing_analytics/data/processed/car_insurance_policy_cleaned.csv"
df.to_csv(processed_csv_path, index=False)

In [29]:
# Summary statistics for the columns pandas describes by default.
summary_stats = df.describe()
summary_stats

Unnamed: 0,policy_start_date,vehicle_age,driver_age,base_premium,discount_pct,final_premium,expected_claim_cost,policy_year,underwriting_margin
count,3000,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,2023-12-30 17:56:38.400000256,6.991,42.621667,14558.633333,4.86,13856.385,8344.786701,2023.507667,5511.598299
min,2023-01-01 00:00:00,0.0,21.0,8000.0,0.0,6800.0,3059.87787,2023.0,1570.831302
25%,2023-06-29 00:00:00,3.0,32.0,12000.0,0.0,11340.0,6360.249763,2023.0,3947.262872
50%,2024-01-06 00:00:00,7.0,42.0,14000.0,5.0,13410.0,7955.509018,2024.0,5194.147604
75%,2024-06-29 06:00:00,11.0,53.0,16600.0,10.0,15980.0,9930.798722,2024.0,6783.747891
max,2024-12-31 00:00:00,14.0,64.0,26700.0,15.0,26100.0,20228.025523,2024.0,13336.512742
std,,4.362866,12.401255,3557.525937,5.117532,3486.813428,2701.824251,0.500025,2106.307093
