In [1]:
import getpass
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Function to standardize column names (strip spaces, lowercase, remove extra spaces)
def clean_columns(df):
    """Normalize the column labels of ``df`` in place and return ``df``.

    Every label is converted to text, stripped of leading/trailing
    whitespace, lower-cased, and has each run of whitespace collapsed to a
    single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (not a copy).
    """
    # Cast to str first: the .str accessor raises on an all-numeric header and
    # turns individual non-string labels into NaN on a mixed header.
    labels = df.columns.astype(str)
    df.columns = labels.str.strip().str.lower().str.replace(r'\s+', ' ', regex=True)
    return df

In [2]:
##############################
# Load and Clean Base Datasets
##############################
# Each yearly Base workbook is read, its headers are normalized, and its rows
# are tagged in a "data" column so they remain identifiable after stacking.
# The tag is written AFTER cleaning so that a pre-existing header such as
# "Data" is overwritten rather than producing two columns both named "data".

# --- For Base 2022 ---
base_2022 = clean_columns(pd.read_excel("Pvt Car Jan to Dec 22(22 Base).xlsx"))
base_2022["data"] = "2022_base"

# --- For Base 2023 ---
base_2023 = clean_columns(pd.read_excel("Pvt Car Jan'23 to  Dec 23 base Final.xlsx"))
base_2023["data"] = "2023_base"

# --- For Base 2024 ---
base_2024 = clean_columns(pd.read_excel("cleaned_2024_Base_dataset.xlsx"))
base_2024["data"] = "2024_base"

# Columns present in all three Base frames, listed in the order they appear in
# the 2022 frame. Building the list straight from a set intersection would
# give an arbitrary order that can change between kernel runs, and that order
# becomes the column order of the merged frame.
_shared_base = set(base_2023.columns) & set(base_2024.columns)
common_base_columns = [
    col for col in dict.fromkeys(base_2022.columns) if col in _shared_base
]

# Stack the three Base frames, keeping only the common columns.
base_merged = pd.concat(
    [frame[common_base_columns] for frame in (base_2022, base_2023, base_2024)],
    ignore_index=True,
)


In [3]:
##############################
# Load and Clean PR Datasets
##############################
# Each PR file is read, its headers are normalized, and its rows are tagged in
# a "data" column (written after cleaning so an existing "Data" header cannot
# produce a duplicate "data" label).

# --- For PR 2022 ---
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding mixed types.
pr_2022 = clean_columns(pd.read_csv("cleaned_2022_PR.csv", low_memory=False))
pr_2022["data"] = "2022_pr"

# --- For PR 2023 ---
pr_2023 = clean_columns(pd.read_excel("cleaned_PR dataset.xlsx"))  # This file contains "old policy no"
pr_2023["data"] = "2023_pr"

# --- For PR 2024 ---
pr_2024 = clean_columns(pd.read_excel("cleaned_2024_PR_tie_up.xlsx"))
pr_2024["data"] = "2024_pr"

# Columns present in all three PR frames, in the order of the 2022 frame.
# (A bare set intersection would yield an order that can vary between runs.)
_shared_pr = set(pr_2023.columns) & set(pr_2024.columns)
common_pr_columns = [
    col for col in dict.fromkeys(pr_2022.columns) if col in _shared_pr
]

# Stack the three PR frames, keeping only the common columns.
pr_merged = pd.concat(
    [frame[common_pr_columns] for frame in (pr_2022, pr_2023, pr_2024)],
    ignore_index=True,
)

# Bring "old policy no" over from the 2023 PR file, matched on "policy no".
# This runs BEFORE the placeholder fill below and is skipped when the column
# is already present; otherwise the merge would emit "old policy no_x" /
# "old policy no_y" instead of a single column.
# NOTE(review): the match is applied to PR rows of every year, not only 2023 -
# confirm that is intended.
if (
    "old policy no" in pr_2023.columns
    and "policy no" in pr_merged.columns
    and "old policy no" not in pr_merged.columns
):
    # One row per key: a repeated "policy no" in the lookup would multiply the
    # matching PR rows in a left merge, and NaN keys would match each other.
    old_policy_lookup = (
        pr_2023[["policy no", "old policy no"]]
        .dropna(subset=["policy no"])
        .drop_duplicates(subset="policy no")
    )
    pr_merged = pr_merged.merge(old_policy_lookup, on="policy no", how="left")

# Ensure the PR dataset follows the Base structure (add missing columns as empty).
for col in common_base_columns:
    if col not in pr_merged.columns:
        pr_merged[col] = None

# Ensure the Base dataset has an "old policy no" column as well.
if "old policy no" not in base_merged.columns:
    base_merged["old policy no"] = None

# Stack Base and PR rows into the final frame.
final_merged = pd.concat([base_merged, pr_merged], ignore_index=True)

  pr_2022 = pd.read_csv("cleaned_2022_PR.csv")
  final_merged = pd.concat([base_merged, pr_merged], ignore_index=True)


In [5]:
##############################
# Write the Final Data to PostgreSQL
##############################

# Database connection details are read from the environment so that no secret
# is stored in the notebook. Non-secret settings fall back to the values this
# notebook used previously; the password has no default and is prompted for
# when PGPASSWORD is unset.
# NOTE(review): a password was previously committed in this cell - treat it as
# exposed and rotate it.
db_username = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD") or getpass.getpass("PostgreSQL password: ")
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "postgres")

# Create a connection to PostgreSQL.
# User name and password are URL-escaped so characters such as "@" or "/" in
# them cannot corrupt the connection URL.
engine = create_engine(
    f'postgresql://{quote_plus(db_username)}:{quote_plus(db_password)}'
    f'@{db_host}:{db_port}/{db_name}'
)

# Write the merged DataFrame to PostgreSQL.
# if_exists='replace' drops and recreates the table on every run;
# chunksize bounds how many rows are sent per batch.
final_merged.to_sql('appended_base_and_pr', engine, if_exists='replace', index=False, chunksize=100000)

print("Merged data successfully written to PostgreSQL.")


Merged data successfully written to PostgreSQL.
