In [1]:
import pandas as pd
import random
from sqlalchemy import create_engine

# PostgreSQL Connection
import os
from urllib.parse import quote

# Connection settings are read from the environment so that real credentials
# never have to be committed with the notebook. The fallbacks reproduce the
# previous hard-coded local-development values, so the notebook still runs
# unchanged when no LIBERTY_DB_* variable is set.
# NOTE(review): drop the password fallback once every user exports
# LIBERTY_DB_PASSWORD.
db_config = {
    'host': os.environ.get('LIBERTY_DB_HOST', 'localhost'),
    'database': os.environ.get('LIBERTY_DB_NAME', 'Liberty'),
    'user': os.environ.get('LIBERTY_DB_USER', 'postgres'),
    'password': os.environ.get('LIBERTY_DB_PASSWORD', 'abc'),
    'port': os.environ.get('LIBERTY_DB_PORT', '5432'),
}

# User and password are percent-encoded: characters such as '@', ':' or '/'
# would otherwise be parsed as URL delimiters and corrupt the connection URL.
# For the default values the resulting URL is identical to the original one.
engine = create_engine(
    f"postgresql://{quote(db_config['user'], safe='')}:"
    f"{quote(db_config['password'], safe='')}@"
    f"{db_config['host']}:{db_config['port']}/{db_config['database']}"
)

# Load Insurance Data
# Pulls the whole source table into memory as a single DataFrame.
df = pd.read_sql("SELECT * FROM public.insurancedata_with_feedback_and_callcentre", engine)

In [2]:
import re

# Function to clean names
def clean_name(name):
    """Return `name` as a lowercase string containing only ASCII letters and digits.

    The value is first converted with ``str()``, so non-string inputs are
    accepted. Note that missing values are stringified too: ``None`` becomes
    ``"none"`` and a float NaN becomes ``"nan"``.
    """
    text = str(name)
    alnum_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    return alnum_only.lower()

# Add normalised make/model key columns to df (the raw columns are left as-is).
for cleaned_col, source_col in (
    ("make_clean", "manufacturer/make"),
    ("model_clean", "model"),
):
    df[cleaned_col] = df[source_col].apply(clean_name)

# Grouping keys of the catalog and the statistics computed per group.
dims = ["age", "make_clean", "model_clean", "vehicle idv", "Cleaned State2", "Start Year"]
agg_dict = {
    premium_col: ["min", "mean", "max"]
    for premium_col in ("total od premium", "total tp premium")
}

# dropna=False keeps groups whose key contains a missing value instead of
# silently discarding those rows.
pricing_catalog = df.groupby(dims, dropna=False).agg(agg_dict).reset_index()

# Flatten the two-level column labels produced by the multi-function agg:
# ("total od premium", "min") -> "total od premium_min". Empty levels (as on
# the grouping keys after reset_index) are dropped, so those names stay bare.
flat_names = []
for col in pricing_catalog.columns:
    non_empty_parts = [part for part in col if part]
    flat_names.append("_".join(non_empty_parts).rstrip("_"))
pricing_catalog.columns = flat_names

In [3]:
# Export the catalog to a UTF-8 CSV in the working directory, without the index.
pricing_catalog.to_csv(
    path_or_buf="pricing_catalog_output.csv",
    encoding="utf-8",
    index=False,
)

In [3]:
# Write the catalog to the database. if_exists="replace" drops and recreates
# the target table on every run; rows are sent in batches of 100,000.
pricing_catalog.to_sql(
    name="dim_pricing_catalog",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=100_000,
)

17907

In [1]:
import pandas as pd
import re
from sqlalchemy import create_engine

# PostgreSQL Connection
import os
from urllib.parse import quote

# Connection settings are read from the environment so that real credentials
# never have to be committed with the notebook. The fallbacks reproduce the
# previous hard-coded local-development values, so the notebook still runs
# unchanged when no LIBERTY_DB_* variable is set.
# NOTE(review): drop the password fallback once every user exports
# LIBERTY_DB_PASSWORD.
db_config = {
    'host': os.environ.get('LIBERTY_DB_HOST', 'localhost'),
    'database': os.environ.get('LIBERTY_DB_NAME', 'Liberty'),
    'user': os.environ.get('LIBERTY_DB_USER', 'postgres'),
    'password': os.environ.get('LIBERTY_DB_PASSWORD', 'abc'),
    'port': os.environ.get('LIBERTY_DB_PORT', '5432'),
}

# User and password are percent-encoded: characters such as '@', ':' or '/'
# would otherwise be parsed as URL delimiters and corrupt the connection URL.
# For the default values the resulting URL is identical to the original one.
engine = create_engine(
    f"postgresql://{quote(db_config['user'], safe='')}:"
    f"{quote(db_config['password'], safe='')}@"
    f"{db_config['host']}:{db_config['port']}/{db_config['database']}"
)

# Load Insurance Data
# Pulls the whole source table into memory as a single DataFrame.
df = pd.read_sql("SELECT * FROM public.insurancedata_with_feedback_and_callcentre", engine)

In [2]:
# Clean make/model columns
def clean_name(name):
    """Normalise a make/model value into a comparison key.

    Converts `name` to ``str``, removes every character that is not an ASCII
    letter or digit, and lowercases the remainder. Because of the ``str()``
    conversion, ``None`` yields ``"none"`` and a float NaN yields ``"nan"``.
    """
    as_text = str(name)
    stripped = re.sub(r'[^a-zA-Z0-9]', '', as_text)
    return stripped.lower()

# Add normalised make/model key columns to df (the raw columns are left as-is).
for cleaned_col, source_col in (
    ("make_clean", "manufacturer/make"),
    ("model_clean", "model"),
):
    df[cleaned_col] = df[source_col].apply(clean_name)

# Create pricing catalog: min/mean/max of both premium columns per key group.
group_cols = ["age", "make_clean", "model_clean", "vehicle idv", "Cleaned State2", "Start Year"]
agg_dict = {
    premium_col: ["min", "mean", "max"]
    for premium_col in ("total od premium", "total tp premium")
}

# dropna=False keeps groups whose key contains a missing value instead of
# silently discarding those rows.
pricing_catalog = df.groupby(group_cols, dropna=False).agg(agg_dict).reset_index()

# Flatten the two-level column labels produced by the multi-function agg:
# ("total od premium", "min") -> "total od premium_min". Empty levels (as on
# the grouping keys after reset_index) are dropped, so those names stay bare.
flat_names = []
for col in pricing_catalog.columns:
    non_empty_parts = [part for part in col if part]
    flat_names.append("_".join(non_empty_parts).rstrip("_"))
pricing_catalog.columns = flat_names

# Attach the per-group premium statistics to every raw row. The join keys are
# exactly the catalog's grouping columns; how="left" keeps every row of df.
df_merged = df.merge(pricing_catalog, how="left", on=group_cols)

In [3]:
# Write the enriched data set to the database. if_exists="replace" drops and
# recreates the target table on every run; rows are sent in batches of 100,000.
df_merged.to_sql(
    name="insurancedata_with_fb_cc_pc",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=100_000,
)

830