In [1]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Retrieve Policy Data from PostgreSQL
sql_query = 'SELECT * FROM "overall_policy_level_data_EF";'
sql_data = pd.read_sql(sql_query, con=engine)

# Step 2: Read the Excel File
excel_data = pd.read_excel("2024 PR.xlsx")

# Step 3: Ensure Column Consistency
sql_data.columns = sql_data.columns.str.strip()
excel_data.columns = excel_data.columns.str.strip()

# Step 4: Convert Date Columns to Datetime Format
# Only convert the dates we need for comparison
sql_data["Policy End Date"] = pd.to_datetime(sql_data["Policy End Date"], errors='coerce')
sql_data["Policy Start Date"] = pd.to_datetime(sql_data["Policy Start Date"], errors='coerce')

excel_data["Policy Issue date"] = pd.to_datetime(excel_data["Policy Issue date"], format='%d-%m-%Y', errors='coerce')
excel_data["Policy End Date"] = pd.to_datetime(excel_data["Policy End Date"], format='%d-%m-%Y', errors='coerce')
excel_data["Policy Start Date"] = pd.to_datetime(excel_data["Policy Start Date"], format='%d-%m-%Y', errors='coerce')

# Step 5: Compare Records Without Merging
matched_sql_records = []
matched_excel_records = []

# Iterate over each row in the Excel dataset
for _, excel_row in excel_data.iterrows():
    policy_no_excel = excel_row["Policy No"]
    policy_issue_date_excel = excel_row["Policy Issue date"]
    
    # Compare with rows in the SQL dataset
    for _, sql_row in sql_data.iterrows():
        policy_no_db = sql_row["Policy No"]
        policy_end_date_db = sql_row["Policy End Date"]
        
        # Check for matching Policy No and year-based date conditions
        if pd.notna(policy_issue_date_excel) and pd.notna(policy_end_date_db):  # Ensure dates are valid
            if (policy_no_excel == policy_no_db and 
                (policy_issue_date_excel.year == policy_end_date_db.year or
                 policy_issue_date_excel.year == policy_end_date_db.year + 1)):
                matched_sql_records.append(sql_row.to_dict())
                matched_excel_records.append(excel_row.to_dict())

# Convert matched records to DataFrames
matched_sql_data = pd.DataFrame(matched_sql_records)
matched_excel_data = pd.DataFrame(matched_excel_records)

# Step 6: Save Matched Records to Separate CSV Files
matched_sql_data.to_csv("matched_policies_from_db_claim.csv", index=False)
matched_excel_data.to_csv("matched_policies_from_excel_claim.csv", index=False)

# Step 7: Count Matches
matching_count = len(matched_sql_data)

# Display the Result
print(f"The number of matched records based on the year logic is: {matching_count}")
print("Matched policy data from the database saved to 'matched_policies_from_db_claim.csv'.")
print("Matched policy data from the Excel file saved to 'matched_policies_from_excel_claim.csv'.")

KeyboardInterrupt: 

In [2]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Retrieve Policy Data from PostgreSQL
sql_query = 'SELECT * FROM "overall_policy_level_data_EF";'
sql_data = pd.read_sql(sql_query, con=engine)

# Step 2: Read the Excel File
excel_data = pd.read_excel("2024 PR.xlsx")

# Step 3: Ensure Column Consistency
sql_data.columns = sql_data.columns.str.strip()
excel_data.columns = excel_data.columns.str.strip()

# Step 4: Convert Date Columns to Datetime Format
sql_data["Policy End Date"] = pd.to_datetime(sql_data["Policy End Date"], errors='coerce')
sql_data["Policy Start Date"] = pd.to_datetime(sql_data["Policy Start Date"], errors='coerce')

excel_data["Policy Issue date"] = pd.to_datetime(excel_data["Policy Issue date"], format='%d-%m-%Y', errors='coerce')
excel_data["Policy End Date"] = pd.to_datetime(excel_data["Policy End Date"], format='%d-%m-%Y', errors='coerce')
excel_data["Policy Start Date"] = pd.to_datetime(excel_data["Policy Start Date"], format='%d-%m-%Y', errors='coerce')

# Step 5: Perform Vectorized Matching
# Expand the SQL dataset for a cross-join
sql_data_expanded = sql_data.assign(key=1)
excel_data_expanded = excel_data.assign(key=1)

# Perform a cross-join between SQL and Excel datasets
merged_data = pd.merge(sql_data_expanded, excel_data_expanded, on="key").drop(columns="key")

# Apply vectorized logic for matching
matched_data = merged_data[
    (merged_data["Policy No_x"] == merged_data["Policy No_y"]) &  # Match Policy No
    (
        (merged_data["Policy Issue date"].dt.year == merged_data["Policy End Date_x"].dt.year) |  # Same year
        (merged_data["Policy Issue date"].dt.year == merged_data["Policy End Date_x"].dt.year + 1)  # Next year
    )
]

# Step 6: Separate Matches into SQL and Excel DataFrames
matched_sql_data = matched_data[
    ["Policy No_x", "Policy Start Date_x", "Policy End Date_x"]
].drop_duplicates().rename(
    columns={"Policy No_x": "Policy No", "Policy Start Date_x": "Policy Start Date", "Policy End Date_x": "Policy End Date"}
)

matched_excel_data = matched_data[
    ["Policy No_y", "Policy Start Date_y", "Policy End Date_y", "Policy Issue date"]
].drop_duplicates().rename(
    columns={"Policy No_y": "Policy No", "Policy Start Date_y": "Policy Start Date", "Policy End Date_y": "Policy End Date"}
)

# Step 7: Save Matched Records to Separate CSV Files
matched_sql_data.to_csv("matched_policies_from_db_claim.csv", index=False)
matched_excel_data.to_csv("matched_policies_from_excel_claim.csv", index=False)

# Step 8: Count Matches
matching_count = len(matched_sql_data)

# Display the Result
print(f"The number of matched records based on the year logic is: {matching_count}")
print("Matched policy data from the database saved to 'matched_policies_from_db_claim.csv'.")
print("Matched policy data from the Excel file saved to 'matched_policies_from_excel_claim.csv'.")

MemoryError: Unable to allocate 9.11 TiB for an array with shape (1251525430064,) and data type int64