In [1]:
pip install faker

Collecting fakerNote: you may need to restart the kernel to use updated packages.

  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.0.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
    --------------------------------------- 0.0/1.9 MB 487.6 kB/s eta 0:00:04
   ------ --------------------------------- 0.3/1.9 MB 2.8 MB/s eta 0:00:01
   ---------------------- ----------------- 1.1/1.9 MB 6.7 MB/s eta 0:00:01
   ---------------------------------------  1.9/1.9 MB 10.1 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 9.4 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.0.0


In [156]:
import random
from datetime import datetime, timedelta
from random import choice, choices, randint, uniform

import numpy as np
import pandas as pd
from faker import Faker

In [157]:
# Reproducibility: the generators below draw from three independent random sources
# (stdlib `random`, NumPy's global RNG and Faker), so all three are seeded here.
# Without this, Restart Kernel -> Run All writes different CSVs on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Faker instance used to draw random policy issue dates
fake = Faker()

# European countries used as sales locations
europe_countries = ["Sweden", "Portugal", "Switzerland", "Italy", "Denmark", "Germany", "France", "Netherlands", "Norway"]

# Lines of business and, for each, the (min, max) amount range for premiums, claims and expenses
line_of_business = {
    "Auto": {"premium": (2000, 50000), "claim": (10000, 50000), "expense": (500, 5000)},
    "Engineering": {"premium": (50000, 500000), "claim": (150000, 1500000), "expense": (5000, 35000)},
    "Residence": {"premium": (2000, 100000), "claim": (10000, 200000), "expense": (1000, 10000)},
    "Liability": {"premium": (50000, 500000), "claim": (150000, 1000000), "expense": (10000, 50000)},
    "Transportation": {"premium": (15000, 250000), "claim": (50000, 350000), "expense": (5000, 35000)}
}

# Per-year probability of each line of business, so the product mix varies from year to year
lob_probabilities_by_year = {
    2020: {"Auto": 0.25, "Residence": 0.32, "Transportation": 0.17, "Engineering": 0.13, "Liability": 0.13},
    2021: {"Auto": 0.32, "Residence": 0.28, "Transportation": 0.18, "Engineering": 0.12, "Liability": 0.10},
    2022: {"Auto": 0.30, "Residence": 0.26, "Transportation": 0.20, "Engineering": 0.14, "Liability": 0.10},
    2023: {"Auto": 0.35, "Residence": 0.30, "Transportation": 0.15, "Engineering": 0.10, "Liability": 0.10},
    2024: {"Auto": 0.28, "Residence": 0.25, "Transportation": 0.22, "Engineering": 0.15, "Liability": 0.10}
}

# Per-year sales channel split as (Digital %, Non-Digital %)
sales_channel_distribution = {
    2020: (50, 50),
    2021: (40, 60),
    2022: (35, 65),
    2023: (55, 45),
    2024: (70, 30)
}

# Sanity checks on the tables above: every yearly LOB mix must sum to 1 and only name known
# lines of business, and every channel split must sum to 100 (np.random.choice rejects
# probabilities that do not sum to 1).
for _year, _mix in lob_probabilities_by_year.items():
    assert abs(sum(_mix.values()) - 1.0) < 1e-9, f"LOB probabilities for {_year} do not sum to 1"
    assert set(_mix) <= set(line_of_business), f"Unknown line of business configured for {_year}"
for _year, _split in sales_channel_distribution.items():
    assert sum(_split) == 100, f"Sales channel split for {_year} does not sum to 100"

# Policy IDs handed out so far; shared by every call so IDs stay unique across all years
used_policy_ids = set()

In [158]:
# Processing Policy Dataset
def generate_policy_data(n, year):
    """Generate ``n`` synthetic policy rows issued during ``year``.

    Each row is a list in this order:
    ``[policy_id, issue_date, effective_date, end_of_effective_date, reference_date,
    sales_location, line_of_business_type, sales_channel, premium_amount]``.

    Args:
        n: Number of policies to generate.
        year: Calendar year of issue; must be a key of ``lob_probabilities_by_year``
            and ``sales_channel_distribution``.

    Returns:
        A list of ``n`` rows as described above.

    Raises:
        ValueError: If fewer than ``n`` unused six-digit policy IDs remain.
        KeyError: If ``year`` is missing from either per-year configuration table.

    Side effects:
        Every generated ID is added to the module-level ``used_policy_ids`` set.
    """
    policy_id_min, policy_id_max = 100000, 999999

    # The rejection-sampling loop below never terminates once every ID is taken, so fail
    # fast instead of hanging. Assumes used_policy_ids only holds IDs from this range
    # (this function is the only writer visible in the notebook).
    free_ids = (policy_id_max - policy_id_min + 1) - len(used_policy_ids)
    if n > free_ids:
        raise ValueError(
            f"Cannot generate {n} unique policy IDs: only {free_ids} of the "
            f"{policy_id_max - policy_id_min + 1} six-digit IDs are still unused"
        )

    start_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)
    reference_date = end_date  # reference date is always 31 December of the issue year

    # Per-year settings do not change inside the loop, so resolve them once.
    lob_shares = lob_probabilities_by_year[year]
    lob_keys = list(lob_shares.keys())
    lob_weights = list(lob_shares.values())
    digital_weight, non_digital_weight = sales_channel_distribution[year]
    channel_probabilities = [digital_weight / 100, non_digital_weight / 100]

    policies = []
    for _ in range(n):
        # Draw until we hit an ID that no earlier policy (in any year) has used.
        policy_id = randint(policy_id_min, policy_id_max)
        while policy_id in used_policy_ids:
            policy_id = randint(policy_id_min, policy_id_max)
        used_policy_ids.add(policy_id)

        # Cover starts on the issue date and runs for 365 days.
        issue_date = fake.date_between(start_date=start_date, end_date=end_date)
        effective_date = issue_date
        end_of_effective_date = effective_date + timedelta(days=365)

        sales_location = choice(europe_countries)

        # Line of business and sales channel follow the year's configured distribution.
        line_of_business_type = choices(lob_keys, weights=lob_weights)[0]
        sales_channel = np.random.choice(["Digital", "Non-Digital"], p=channel_probabilities)

        premium_amount = round(uniform(*line_of_business[line_of_business_type]["premium"]), 2)

        policies.append([
            policy_id, issue_date, effective_date, end_of_effective_date, reference_date,
            sales_location, line_of_business_type, sales_channel, premium_amount
        ])

    return policies

In [159]:
# Processing Claim Dataset
def generate_claim_data(policies, year):
    """Generate synthetic claims for a subset of ``policies``.

    Each claim row is a list in this order:
    ``[claim_id, policy_id, occurrence_date, claim_status, claim_amount, payment_date,
    expense_amount]``.

    Args:
        policies: Policy rows as produced by ``generate_policy_data``; index 0 is read as
            the policy ID, index 2 as the effective date and index 6 as the line of business.
        year: Year embedded in the claim IDs (``C<year>-<counter>``).

    Returns:
        A list of claim rows, at most one per policy. ``payment_date`` is ``None`` for
        pending claims. The counter in the claim ID is zero-padded to at least four digits.
    """
    claims = []

    # Claim frequency for this dataset, drawn ONCE per call. Re-drawing the threshold for
    # every policy is statistically the same as a fixed 45% rate, which made the 0.3-0.6
    # range pointless; a single draw lets the claim frequency differ between datasets.
    claim_probability = np.random.uniform(0.3, 0.6)

    for policy in policies:
        if np.random.rand() >= claim_probability:
            continue  # this policy has no claim

        policy_id, effective_date, lob = policy[0], policy[2], policy[6]

        # Loss occurs 30-360 days after the policy becomes effective.
        occurrence_date = effective_date + timedelta(days=randint(30, 360))

        # Engineering and Liability claims are less likely to be settled already.
        if lob in ["Engineering", "Liability"]:
            claim_status = np.random.choice(["Settled", "Pending"], p=[0.30, 0.70])
        else:
            claim_status = np.random.choice(["Settled", "Pending"], p=[0.40, 0.60])

        # Settled claims: 80% are paid 30-180 days after the loss, the rest exactly one
        # year after it. Pending claims have no payment date yet.
        if claim_status == "Settled":
            if np.random.rand() < 0.80:
                payment_date = occurrence_date + timedelta(days=randint(30, 180))
            else:
                payment_date = occurrence_date + timedelta(days=365)
        else:
            payment_date = None

        claim_low, claim_high = line_of_business[lob]["claim"]
        expense_low, expense_high = line_of_business[lob]["expense"]

        # 90% of claims fall between 120% of the range minimum and 80% of the range maximum;
        # the remaining 10% are large losses in the top 20% of the range (up to the maximum).
        if np.random.rand() < 0.90:
            claim_amount = round(uniform(claim_low * 1.2, claim_high * 0.8), 2)
            expense_amount = round(uniform(expense_low * 1.2, expense_high * 0.8), 2)
        else:
            claim_amount = round(uniform(claim_high * 0.8, claim_high), 2)
            expense_amount = round(uniform(expense_high * 0.8, expense_high), 2)

        # Sequential claim number within this call, starting at 1.
        claim_id = f"C{year}-{len(claims) + 1:04d}"

        claims.append([claim_id, policy_id, occurrence_date, claim_status, claim_amount, payment_date, expense_amount])

    return claims
            

In [160]:
# Build and export one policy dataset and one claim dataset per calendar year
years = list(range(2020, 2025))

# Column headers, in the same order as the rows returned by the two generator functions
policy_columns = [
    "Policy_ID", "Issue_Date", "Effective_Date", "End_Of_Effective_Date", "Reference_Date",
    "Sales_Location", "Line_Of_Business", "Sales_Channel", "Premium_Amount",
]
claim_columns = [
    "Claim_ID", "Policy_ID", "Occurrence_Date", "Claim_Status", "Claim_Amount",
    "Payment_Date", "Expense_Amount",
]

for year in years:
    # Claims are derived from the policies generated for the same year
    policy_data = generate_policy_data(20000, year)
    claim_data = generate_claim_data(policy_data, year)

    # Wrap the raw rows in labelled DataFrames
    policy_df = pd.DataFrame(policy_data, columns=policy_columns)
    claim_df = pd.DataFrame(claim_data, columns=claim_columns)

    # Write both frames to CSV in the working directory, without the index column
    policy_filename = f"Policy_Dataset_{year}.csv"
    claim_filename = f"Claim_Dataset_{year}.csv"
    policy_df.to_csv(policy_filename, index=False)
    claim_df.to_csv(claim_filename, index=False)

    print(f"Processed datasets and saved {policy_filename} and {claim_filename}")

Processed datasets and saved Policy_Dataset_2020.csv and Claim_Dataset_2020.csv
Processed datasets and saved Policy_Dataset_2021.csv and Claim_Dataset_2021.csv
Processed datasets and saved Policy_Dataset_2022.csv and Claim_Dataset_2022.csv
Processed datasets and saved Policy_Dataset_2023.csv and Claim_Dataset_2023.csv
Processed datasets and saved Policy_Dataset_2024.csv and Claim_Dataset_2024.csv
