In [None]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

# Database Connection Setup
# Connection settings are read from environment variables so that no credentials are
# stored in the notebook. Non-secret settings fall back to the local defaults; the
# password has no default and must be provided through PGPASSWORD.
db_password = os.environ.get('PGPASSWORD')
if db_password is None:
    raise RuntimeError(
        "Set the PGPASSWORD environment variable to the PostgreSQL password before running this notebook."
    )

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': db_password,
    'port': os.environ.get('PGPORT', '5432')
}

# User name and password are percent-encoded so that characters such as '@', ':' or '/'
# inside them cannot break the structure of the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Load the source table from PostgreSQL.
# Every column is listed explicitly (identifiers are double-quoted because they contain
# spaces, mixed case and punctuation).
query = """
SELECT
    "policy no",
    "total premium payable",
    "product name",
    "biztype",
    "road side assistance",
    "gst",
    "chassis number",
    "before gst add-on gwp",
    "age",
    "total od premium",
    "new vertical",
    "zone 2",
    "month",
    "enginenumber",
    "total tp premium",
    "policy end date",
    "applicable discount with ncb",
    "policy start date",
    "rto location",
    "vehicle idv",
    "state2",
    "tie up",
    "new branch name 2",
    "manufacturer/make",
    "model",
    "data",
    "ncb % previous year",
    "insured name",
    "fuel type",
    "reg no",
    "variant",
    "Trim Policy No",
    "Policy Tenure(check)",
    "null_count",
    "Cleaned insured name",
    "Cleaned Branch Name 2",
    "Cleaned State2",
    "Cleaned Zone 2",
    "Cleaned Chassis Number",
    "Cleaned Engine Number",
    "Cleaned Reg no",
    "Cleaned insured name_filled",
    "chassis_engine_key",
    "corrected_name",
    "name_similarity",
    "next_policy_start_date",
    "upd_booked",
    "updated_old_policy_no",
    "Policy Status",
    "customerid_base",
    "customerid",
    "Policy Tenure Month",
    "Policy Tenure",
    "Start Year",
    "End Year",
    "Cumulative Tenure (Months)",
    "Tenure Decimal",
    "Customer Tenure",
    "firstpolicyyear",
    "new_customer_id",
    "New Customers",
    "Churn Label",
    "Claim Happaned/Not",
    "Renewal Rate Status",
    "new_chain_flag",
    "chain_group",
    "first_initial_policy_no",
    "policy_wise_purchase",
    "cleaned new vertical",
    "Overall Churned",
    "approved",
    "denied",
    "Claim Status",
    "Number of claims",
    "vehicle segment"
FROM overall_cleaned_data;
"""

# The connection is opened only for the duration of the read and closed by the
# context manager afterwards.
with engine.connect() as db_connection:
    df = pd.read_sql(sql=text(query), con=db_connection)

# Handle NULL Values

# For 'before gst add-on gwp': fill nulls with the median value per 'Cleaned Branch Name 2'.
# transform("median") broadcasts each branch's median back onto its rows. Rows whose
# branch name is itself null belong to no group and receive NaN from the transform, so the
# result is only used as the *fill value* for fillna: existing premiums on such rows are
# kept instead of being overwritten with NaN.
branch_median_per_row = df.groupby("Cleaned Branch Name 2")["before gst add-on gwp"].transform("median")
df["before gst add-on gwp"] = df["before gst add-on gwp"].fillna(branch_median_per_row)
print("\nMedian values for 'before gst add-on gwp' by 'Cleaned Branch Name 2':")
# Summary table of the per-branch medians (reused later for the Excel export).
branch_medians = df.groupby("Cleaned Branch Name 2")["before gst add-on gwp"].median()
print(branch_medians)

# For 'rto location': Fill nulls with the mode per 'Cleaned Branch Name 2'
def fill_mode(x):
    """Fill the missing entries of a Series with its most frequent value.

    Args:
        x: pandas Series (one group's values when used with ``groupby().transform``).

    Returns:
        ``x`` itself, unchanged, when it holds no non-null value (there is no mode to
        fill with); otherwise a new Series in which nulls are replaced by the first
        entry of ``x.mode()``.
    """
    if not x.notna().any():
        return x
    return x.fillna(x.mode().iloc[0])

def _first_mode(values):
    """Return the most frequent value of ``values`` (first of ``mode()``), or NaN if it has none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Mode-based imputation.
# In each case the group-wise filled values are used only as the fill source for fillna.
# Rows whose grouping key is null are not part of any group and come back as NaN from
# transform; going through fillna keeps their existing values instead of blanking them.

# 'rto location': fill nulls with the mode per 'Cleaned Branch Name 2'
df["rto location"] = df["rto location"].fillna(
    df.groupby("Cleaned Branch Name 2")["rto location"].transform(fill_mode)
)
print("\nMode values for 'rto location' by 'Cleaned Branch Name 2':")
branch_modes = df.groupby("Cleaned Branch Name 2")["rto location"].agg(_first_mode)
print(branch_modes)

# For 'variant': Fill nulls with the mode per 'model'
df["variant"] = df["variant"].fillna(
    df.groupby("model")["variant"].transform(fill_mode)
)
print("\nMode values for 'variant' by 'model':")
variant_modes = df.groupby("model")["variant"].agg(_first_mode)
print(variant_modes)

# For 'vehicle segment': Fill nulls with the mode per 'model'
df["vehicle segment"] = df["vehicle segment"].fillna(
    df.groupby("model")["vehicle segment"].transform(fill_mode)
)
print("\nMode values for 'vehicle segment' by 'model':")
vehicle_seg_modes = df.groupby("model")["vehicle segment"].agg(_first_mode)
print(vehicle_seg_modes)

# Claim-related columns: a missing value is replaced with 0
claim_cols = ["approved", "denied", "Claim Status", "Number of claims"]
df[claim_cols] = df[claim_cols].fillna(0)

# Create New Columns

# tie_up_category
# Category -> exact 'tie up' values that belong to it (matching is case-sensitive).
tie_up_map = {
    'OEM': [
        "FORD", "HONDA PV", "HYUNDAI", "JAWA", "JEEP",
        "MARUTI", "SKODA", "TATA PV", "VOLKSWAGEN", "MIBL OEM",
    ],
    'Non-OEM / Other Auto Industry Partnerships': [
        "EM Non OE Dealership", "Non-OEM",
    ],
    'Insurance & Financial Brokers': [
        "EM Agency", "EM Broker", "MIBL Others",
    ],
}


def map_tie_up(x):
    """Return the tie-up category whose value list contains ``x`` exactly.

    Categories are checked in the order they appear in ``tie_up_map``; anything that
    matches none of them (including missing values) is labelled "Other".
    """
    return next(
        (category for category, members in tie_up_map.items() if x in members),
        "Other",
    )

# Classify every policy's 'tie up' value element by element.
df["tie_up_category"] = df["tie up"].map(map_tie_up)

# vertical_category (from 'cleaned new vertical')
# Category -> vertical codes that belong to it (compared case-insensitively).
vertical_map = {
    'Retail Sales & Broking': [
        "liretailbroking", "liretailsales", "liretailsalescommon",
        "liprimebroker", "liprimebrokers",
    ],
    'Institutional & Corporate Sales': [
        "liinstitutionalsales", "liinstitutionalsalescom", "licorporatebroker",
    ],
    'Digital & Virtual Market': [
        "lionline", "livirtualmarket", "virtualmarket",
    ],
    'Other Operations & Emerging Markets': [
        "limotor", "lioperations", "liothercommon", "liotheroperations", "liagency",
        "libancassurance", "liaffinity", "liemergingmarket", "emergingmarket",
    ],
}


def map_vertical(x):
    """Return the vertical category for ``x``, ignoring case.

    Missing values and codes that appear in no list of ``vertical_map`` yield "Other".
    Categories are tried in the order they are declared.
    """
    if pd.isna(x):
        return "Other"
    needle = x.lower()
    return next(
        (
            category
            for category, codes in vertical_map.items()
            if any(needle == code.lower() for code in codes)
        ),
        "Other",
    )

df["vertical_category"] = df["cleaned new vertical"].apply(map_vertical)

# Area Mapping using an external CSV file
# The CSV file should have columns: "Cleaned Branch Name 2" and "area" (with values like 'rural' or 'urban')
area_mapping = pd.read_csv("Branch_Area_Type.csv")  # <-- update the path to your CSV file
# validate="many_to_one" makes the merge raise if a branch name occurs more than once in
# the mapping file; without the check a repeated branch would silently duplicate every
# policy row of that branch in the left join.
df = df.merge(area_mapping, on="Cleaned Branch Name 2", how="left", validate="many_to_one")

# make_category (from 'manufacturer/make')
# Category -> manufacturer names that belong to it (compared case-insensitively).
make_map = {
    'Indian': ["ATUL AUTO", "FORCE MOTORS", "MAHINDRA & MAHINDRA", "MAHINDRA and MAHINDRA", "MAHINDRA ELECTRIC MOBILITY LIMITED", "MAHINDRA RENAULT", "MARUTI", "TATA MOTORS"],
    'Foreign - Mass Market': ["FORD", "HONDA", "HYUNDAI", "NISSAN", "RENAULT", "SKODA", "TOYOTA", "VOLKSWAGEN", "MG MOTOR", "CHEVROLET", "DATSUN", "FIAT", "ISUZU", "KIA", "CITROEN", "BYD", "JEEP"],
    'Foreign - Luxury': ["AUDI", "BMW", "JAGUAR", "LAND ROVER", "LEXUS", "MERCEDES-BENZ", "MINI", "MITSUBISHI", "PORSCHE", "SSANGYONG", "VOLVO"]
}

def map_make(x):
    """Return the make category for manufacturer name ``x``, ignoring case.

    Args:
        x: manufacturer name, or a missing value (None / NaN).

    Returns:
        The first category in ``make_map`` whose list contains ``x`` (case-insensitive),
        or "Other" when ``x`` is missing or matches no category.
    """
    # A missing manufacturer has no .upper(); treat it as "Other", the same way
    # map_vertical and map_vehicle_type treat missing input.
    if pd.isna(x):
        return "Other"
    x_upper = x.upper()  # computed once instead of once per category
    for category, values in make_map.items():
        if any(x_upper == val.upper() for val in values):
            return category
    return "Other"

# Classify every policy's manufacturer element by element.
df["make_category"] = df["manufacturer/make"].map(map_make)

# vehicle_type (from 'vehicle segment')
def map_vehicle_type(x):
    """Collapse a 'vehicle segment' value into a coarser vehicle type.

    The comparison ignores case. Missing values and segments that are not listed
    below are returned as "Other".
    """
    if pd.isna(x):
        return "Other"
    segment_to_type = {
        "COMPACT": "Compact",
        "MID SIZE": "Mid-Size",
        "EXECUTIVE": "Mid-Size",
        "SUV 1": "SUV",
        "SUV 2": "SUV",
        "HIGH END": "Luxury/High-End",
    }
    return segment_to_type.get(x.upper(), "Other")

df["vehicle_type"] = df["vehicle segment"].apply(map_vehicle_type)

# Low / Medium / High levels: each source column is split into three equal-frequency
# bins with pd.qcut. New column name -> column it is derived from.
level_sources = {
    "payement_level": "total premium payable",
    "discount_level": "applicable discount with ncb",
    "idv_level": "vehicle idv",
    "add-on_level": "before gst add-on gwp",
}
level_labels = ["Low", "Medium", "High"]
# qcut with 3 bins cuts at the 1/3 and 2/3 quantiles, so the same probabilities are
# reported here (0.33 / 0.66 would show slightly different values than the real edges).
tercile_points = [0, 1 / 3, 2 / 3, 1]

for level_col, source_col in level_sources.items():
    df[level_col] = pd.qcut(df[source_col], 3, labels=level_labels)
    print(f"\nQuantiles for '{source_col}':")
    print(df[source_col].quantile(tercile_points))

# purchase_type based on 'policy_wise_purchase': the value 1 is labelled "New",
# every other value (including missing) is labelled "Renewal".
df["purchase_type"] = np.where(df["policy_wise_purchase"] == 1, "New", "Renewal")


Median values for 'before gst add-on gwp' by 'Cleaned Branch Name 2':
Cleaned Branch Name 2
agartala         5582.0
ahmedabad        4864.0
ahmednagar       3248.0
ambala           5281.0
amravati         3302.0
                  ...  
vellore          6060.0
vijayapura       5045.0
vijayawada       5881.5
vishakapatnam    4825.5
warangal         5062.5
Name: before gst add-on gwp, Length: 131, dtype: float64

Mode values for 'rto location' by 'Cleaned Branch Name 2':
Cleaned Branch Name 2
agartala           AGARTALA
ahmedabad         AHMEDABAD
ahmednagar       AHMEDNAGAR
ambala                DELHI
amravati           AMRAVATI
                    ...    
vellore             VELLORE
vijayapura       VIJAYAPURA
vijayawada       VIJAYAWADA
vishakapatnam    VIJAYAWADA
warangal           WARANGAL
Name: rto location, Length: 131, dtype: object

Mode values for 'variant' by 'model':
model
2 SERIES                      GRAN COUPE 220I M SPORT
3 SERIES            GRAN LIMOUSINE 320LD LUXYURY L

In [None]:
# Write the transformed frame to PostgreSQL. if_exists="replace" drops and recreates
# the target table on every run; the DataFrame index is not stored.
df.to_sql(
    name="updated_overall_cleaned_data",
    con=engine,
    if_exists="replace",
    index=False,
)

print("\nData cleaning and transformation complete. The updated table is saved to the database.")


Data cleaning and transformation complete. The updated table is saved to the database.


In [None]:
# One worksheet per imputation summary: (summary Series, column header, sheet name).
summary_sheets = [
    (branch_medians, "Median before gst add-on gwp", "Branch Medians"),
    (branch_modes, "Mode of rto location", "Branch Modes"),
    (variant_modes, "Mode of variant", "Variant Modes"),
    (vehicle_seg_modes, "Mode of vehicle segment", "Vehicle Segment Modes"),
]

# The writer is used as a context manager so the workbook is saved and closed on exit.
with pd.ExcelWriter("median_mode_values.xlsx", engine="xlsxwriter") as writer:
    for summary, header, sheet in summary_sheets:
        summary.to_frame(header).to_excel(writer, sheet_name=sheet)

print("Median and mode values have been saved to 'median_mode_values.xlsx'")

Median and mode values have been saved to 'median_mode_values.xlsx'


In [None]:
pip install XlsxWriter