In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM cleaned_test_data_06_12;"
df = pd.read_sql(query, con=engine)

# Step 2: Create a `Match` column for all rows in the dataset
policy_no_with_b = df[df['Type'] == 'B']['Policy No'].unique()
df['Match'] = df['Policy No'].isin(policy_no_with_b).apply(lambda x: 'Matches B' if x else None)

# Step 3: Clean column names
df.rename(columns=lambda x: x.strip(), inplace=True)

# Step 4: Separate Numeric and Non-Numeric Rows in `Total Premium Payable`
# Fill nulls with empty strings for processing
df['Total Premium Payable'] = df['Total Premium Payable'].fillna('').astype(str)

# Identify non-numeric rows
non_numeric_mask = ~df['Total Premium Payable'].str.replace('.', '', 1).str.isdigit()

# Create separate DataFrames for numeric and non-numeric rows
non_numeric_df = df[non_numeric_mask]
numeric_df = df[~non_numeric_mask]

# Step 5: Save Numeric and Non-Numeric Data Separately
numeric_df.to_sql('cleanedprem_test_data_06_12', con=engine, if_exists='replace', index=False)

non_numeric_df.to_csv('non_numeric_total_premium_with_match.csv', index=False)

In [None]:
import pandas as pd
import numpy as np

# Creating DataFrame
df = pd.read_csv('non_numeric_total_premium_with_match.csv')

# Step 1: Remove rows where all three (Total OD Premium, Total TP Premium, gst) are zero or null
df = df[~((df["Total OD Premium"].fillna(0) == 0) &
          (df["Total TP Premium"].fillna(0) == 0) &
          (df["gst"].fillna(0) == 0))]

# Step 2: Calculate gst if it is zero and round the value
df["gst"] = df["gst"].fillna(0)  
df.loc[df["gst"] == 0, "gst"] = ((df["Total OD Premium"].fillna(0) + df["Total TP Premium"].fillna(0)) * 0.18).round()

# Step 3: Calculate Total Premium Payable as the sum of Total OD Premium, Total TP Premium, and gst
df["Total Premium Payable"] = (
    df["Total OD Premium"].fillna(0) + df["Total TP Premium"].fillna(0) + df["gst"]
).round()

df.to_csv('cleaned_non_numeric_total_premium.csv', index=False)

In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL (existing table)
existing_data_query = "SELECT * FROM cleanedprem_test_data_06_12;"
existing_data = pd.read_sql(existing_data_query, con=engine)

# Step 2: Load cleaned data from CSV
cleaned_data = pd.read_csv('cleaned_non_numeric_total_premium.csv')

# Step 3: Ensure columns match
# Add missing columns to cleaned_data
for column in existing_data.columns:
    if column not in cleaned_data.columns:
        cleaned_data[column] = None

# Align column order to match the existing table
cleaned_data = cleaned_data[existing_data.columns]

# Step 4: Append the cleaned data to the existing table
merged_data = pd.concat([existing_data, cleaned_data], ignore_index=True)

# Step 5: Save the merged data back to PostgreSQL
merged_data.to_sql('overall_cleaned_policy_level_data', con=engine, if_exists='replace', index=False)

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Step 2: Ensure relevant columns are numeric
numeric_columns = ["Total OD Premium", "Total TP Premium", "gst", "Total Premium Payable"]
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, set errors to NaN

# Step 3: Remove rows where all specified columns are zero or null
df = df[~((df["Total OD Premium"].fillna(0) == 0) &
          (df["Total TP Premium"].fillna(0) == 0) &
          (df["gst"].fillna(0) == 0) &
          (df["Total Premium Payable"].fillna(0) == 0))]

# Step 4: Identify rows where Total Premium Payable is 0
rows_with_zero_premium = df["Total Premium Payable"].fillna(0) == 0

# Step 5: Check and calculate gst for those rows where gst is also 0
df.loc[rows_with_zero_premium & (df["gst"].fillna(0) == 0), "gst"] = (
    (df["Total OD Premium"].fillna(0) + df["Total TP Premium"].fillna(0)) * 0.18
).round()

# Step 6: Calculate Total Premium Payable for rows where Total Premium Payable is 0
df.loc[rows_with_zero_premium, "Total Premium Payable"] = (
    df["Total OD Premium"].fillna(0) +
    df["Total TP Premium"].fillna(0) +
    df["gst"]
).round()

# Step 7: Save the cleaned DataFrame to PostgreSQL
table_name = "overall_cleaned_policy_level_data(with prem)"  # Specify the table name where you want to save the data
df.to_sql(table_name, con=engine, if_exists='replace', index=False)

359

In [1]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Retrieve Policy No from PostgreSQL
sql_query = 'SELECT DISTINCT "Policy No" FROM cleanedprem_merged_base_data;'
sql_data = pd.read_sql(sql_query, con=engine)

# Step 2: Read the Excel file
excel_data = pd.read_excel("cleaned_PR dataset - Copy.xlsx", usecols=["Policy No"])

# Step 3: Find matching policies
# Convert to sets for easy comparison
sql_policy_set = set(sql_data["Policy No"])
excel_policy_set = set(excel_data["Policy No"])

# Find intersection (matching policies)
matching_policies = sql_policy_set.intersection(excel_policy_set)

# Step 4: Count matches
matching_count = len(matching_policies)

# Display the result
print(f"The number of matching Policy Nos between both sources is: {matching_count}")

The number of matching Policy Nos between both sources is: 156406


In [1]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Retrieve all Policy No from PostgreSQL
sql_query = 'SELECT "Policy No" FROM cleanedprem_merged_base_data;'
sql_data = pd.read_sql(sql_query, con=engine)

# Step 2: Read the Excel file
excel_data = pd.read_excel("cleaned_PR dataset - Copy.xlsx", usecols=["Policy No"])

# Step 3: Find matching policies (including duplicates)
# Convert to sets for matching unique Policy No
sql_policy_set = set(sql_data["Policy No"])
excel_policy_set = set(excel_data["Policy No"])

# Find intersection (matching Policy Nos)
matching_policies = sql_policy_set.intersection(excel_policy_set)

# Filter rows from Excel data that match the Policy Nos
matching_rows = excel_data[excel_data["Policy No"].isin(matching_policies)]

# Get the total count of matching rows (including duplicates)
matching_count = matching_rows.shape[0]

# Step 4: Retrieve the total row count in the cleanedprem_merged_base_data table
total_row_query = 'SELECT COUNT(*) FROM cleanedprem_merged_base_data;'
existing_row_count = pd.read_sql(total_row_query, con=engine).iloc[0, 0]

# Step 5: Calculate the total data points (existing + matching count)
total_data_points = existing_row_count + matching_count

# Display the results
print(f"The total number of matching Policy Nos (including duplicates) is: {matching_count}")
print(f"The total number of data points after accounting for matching rows would be: {total_data_points}")

The total number of matching Policy Nos (including duplicates) is: 156406
The total number of data points after accounting for matching rows would be: 1307854


In [2]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Load the data from PostgreSQL table
query = 'SELECT "Policy No", "Type" FROM test_data_12_12;'
df = pd.read_sql(query, con=engine)

# Filter Policy No by Type
type_a_policies = set(df.loc[df["Type"] == 'A', "Policy No"])
type_b_policies = set(df.loc[df["Type"] == 'B', "Policy No"])

# Check if any Policy No for Type B is missing in Type A
if type_b_policies - type_a_policies:
    result = 'Some Policy No for Type B are missing in Type A'
else:
    result = 'All Policy No for Type B are present in Type A'

# Display the result
print(result)

All Policy No for Type B are present in Type A


In [2]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Retrieve Policy No from PostgreSQL
# Query all data and store it in a DataFrame
sql_query = 'SELECT * FROM "overall_policy_level_data_EF";'
sql_data = pd.read_sql(sql_query, con=engine)

# Step 2: Read the Excel file
# Read all data from the Excel file
excel_data = pd.read_excel("2024 PR.xlsx")

# Step 3: Find matching policies
# Convert Policy No columns to sets for easy comparison
sql_policy_set = set(sql_data["Policy No"])
excel_policy_set = set(excel_data["Policy No"])

# Find intersection (matching policies)
matching_policies = sql_policy_set.intersection(excel_policy_set)

# Filter matching data from both sources
matched_sql_data = sql_data[sql_data["Policy No"].isin(matching_policies)]
matched_excel_data = excel_data[excel_data["Policy No"].isin(matching_policies)]

# Step 4: Save matched data to separate CSV files
matched_sql_data.to_csv("matched_policies_from_db.csv", index=False)
matched_excel_data.to_csv("matched_policies_from_excel.csv", index=False)

# Step 5: Count matches
matching_count = len(matching_policies)

# Display the result
print(f"The number of matching Policy Nos between both sources is: {matching_count}")
print("Matched policy data from the database saved to 'matched_policies_from_db.csv'.")
print("Matched policy data from the Excel file saved to 'matched_policies_from_excel.csv'.")

The number of matching Policy Nos between both sources is: 11455
Matched policy data from the database saved to 'matched_policies_from_db.csv'.
Matched policy data from the Excel file saved to 'matched_policies_from_excel.csv'.


In [4]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Retrieve Policy Data from PostgreSQL
sql_query = 'SELECT * FROM "overall_policy_level_data_EF";'
sql_data = pd.read_sql(sql_query, con=engine)

# Step 2: Read the Excel File
excel_data = pd.read_csv("unique_rows(claim).csv")

# Step 3: Ensure Column Consistency
sql_data.columns = sql_data.columns.str.strip()
excel_data.columns = excel_data.columns.str.strip()

# Step 4: Standardize Date Formats to 'YYYY-MM-DD'
sql_data["Policy Start Date"] = pd.to_datetime(sql_data["Policy Start Date"], errors='coerce').dt.strftime('%Y-%m-%d')
sql_data["Policy End Date"] = pd.to_datetime(sql_data["Policy End Date"], errors='coerce').dt.strftime('%Y-%m-%d')
excel_data["Policy Start Date"] = pd.to_datetime(excel_data["Policy Start Date"], errors='coerce').dt.strftime('%Y-%m-%d')
excel_data["Policy End Date"] = pd.to_datetime(excel_data["Policy End Date"], errors='coerce').dt.strftime('%Y-%m-%d')

# Step 5: Concatenate Columns for Matching
sql_data["Policy_Key"] = sql_data["Policy No"] + "_" + sql_data["Policy Start Date"] + "_" + sql_data["Policy End Date"]
excel_data["Policy_Key"] = excel_data["Policy No"] + "_" + excel_data["Policy Start Date"] + "_" + excel_data["Policy End Date"]

# Step 6: Find Matches
matched_keys = set(sql_data["Policy_Key"]).intersection(set(excel_data["Policy_Key"]))

# Step 7: Filter Matched Rows
matched_sql_data = sql_data[sql_data["Policy_Key"].isin(matched_keys)].drop(columns=["Policy_Key"])
matched_excel_data = excel_data[excel_data["Policy_Key"].isin(matched_keys)].drop(columns=["Policy_Key"])

# Step 8: Save Matched Data to Separate CSV Files
matched_sql_data.to_csv("matched_policies_from_db_claim.csv", index=False)
matched_excel_data.to_csv("matched_policies_from_excel_claim.csv", index=False)

# Step 9: Count Matches
matching_count = len(matched_sql_data)

# Display the Result
print(f"The number of exact matches (Policy No, Policy Start Date, and Policy End Date) between both sources is: {matching_count}")
print("Matched policy data from the database saved to 'matched_policies_from_db_claim.csv'.")
print("Matched policy data from the Excel file saved to 'matched_policies_from_excel_claim.csv'.")

  excel_data = pd.read_csv("unique_rows(claim).csv")
  excel_data["Policy Start Date"] = pd.to_datetime(excel_data["Policy Start Date"], errors='coerce').dt.strftime('%Y-%m-%d')
  excel_data["Policy End Date"] = pd.to_datetime(excel_data["Policy End Date"], errors='coerce').dt.strftime('%Y-%m-%d')


The number of exact matches (Policy No, Policy Start Date, and Policy End Date) between both sources is: 183737
Matched policy data from the database saved to 'matched_policies_from_db_claim.csv'.
Matched policy data from the Excel file saved to 'matched_policies_from_excel_claim.csv'.


In [None]:
import pandas as pd
from sqlalchemy import create_engine

sql_data = pd.read_excel(" ")

# Step 2: Read the Excel file
excel_data = pd.read_excel("cleaned_PR dataset - Copy.xlsx", usecols=["Policy No"])

# Step 3: Find matching policies
# Convert to sets for easy comparison
sql_policy_set = set(sql_data["Policy No"])
excel_policy_set = set(excel_data["Policy No"])

# Find intersection (matching policies)
matching_policies = sql_policy_set.intersection(excel_policy_set)

# Step 4: Count matches
matching_count = len(matching_policies)

# Display the result
print(f"The number of matching Policy Nos between both sources is: {matching_count}")