In [1]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
from sqlalchemy import create_engine
from fuzzywuzzy import fuzz, process  # Ensure you have fuzzywuzzy installed: pip install fuzzywuzzy

# Standard-library helpers for credential handling (kept in this cell so it runs on its own).
import os
from getpass import getpass
from urllib.parse import quote_plus

# ---------------------------
# Step 0: Database Connection
# ---------------------------
# Connection settings come from the conventional PostgreSQL environment variables.
# Non-secret settings fall back to the local defaults this notebook has always used.
# The password is never stored in the notebook: it is read from PGPASSWORD, or
# prompted for interactively when that variable is unset or empty.
db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}

# URL-escape the credentials so characters such as '@', '/' or ':' inside them
# cannot be mistaken for URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# ---------------------------
# Step 1: Load Data and Create Key Column
# ---------------------------
# Pull the whole cleaned table, ordered by chassis number, engine number,
# insured name and policy start date.
query = """
SELECT * FROM public.cleancus_overallcleaned_chessis_engine
ORDER BY "Cleaned Chassis Number", "Cleaned Engine Number", "Cleaned insured name_filled", "policy start date";
"""
df = pd.read_sql(query, con=engine)



In [2]:
# Build one grouping key per row from the chassis and engine numbers.
# NOTE(review): astype(str) turns missing values into literal text, so rows that lack
# both identifiers end up sharing a single key — confirm upstream cleaning rules this out.
df['chassis_engine_key'] = (
    df['Cleaned Chassis Number'].astype(str) + '_' + df['Cleaned Engine Number'].astype(str)
)

# Order rows by key and then by policy start date so each key's records are
# contiguous and chronological. Rebinding (rather than inplace=True) keeps the
# step explicit and chain-friendly.
df = df.sort_values(["chassis_engine_key", "policy start date"])

# ---------------------------
# Step 2: Sequential Name Correction
# ---------------------------
# Minimum fuzz.ratio score (0-100) at which the current name is treated as a
# variant of the previous corrected name within the same chassis_engine_key.
NAME_SIMILARITY_THRESHOLD = 80

# State carried from one row to the next.
prev_name = None
prev_chassis = None

corrected_names = []
similarity_scores = []

# Iterate only over the two columns that are needed. df.iterrows() would build a
# full Series for every row, which is needlessly slow on large tables.
for chassis_engine_key, current_name in zip(
    df["chassis_engine_key"], df["Cleaned insured name_filled"]
):
    # Treat a missing name as an empty string so it can be scored.
    if pd.isnull(current_name):
        current_name = ""

    # A name is replaced only when the previous row belongs to the same key and
    # the two names are at least NAME_SIMILARITY_THRESHOLD similar; otherwise
    # (new key, or a clearly different name) the current name is kept.
    same_key = prev_name is not None and prev_chassis == chassis_engine_key
    if same_key and fuzz.ratio(prev_name, current_name) >= NAME_SIMILARITY_THRESHOLD:
        corrected = prev_name
    else:
        corrected = current_name

    corrected_names.append(corrected)
    # Score of the chosen (corrected) name against the name as it appeared in the row.
    similarity_scores.append(fuzz.ratio(corrected, current_name))

    # The corrected name, not the raw one, is what the next row is compared against.
    prev_name = corrected
    prev_chassis = chassis_engine_key

# Attach the results; the lists are in the same order as the sorted rows.
df["corrected_name"] = corrected_names
df["name_similarity"] = similarity_scores

In [3]:

# ---------------------------
# Step 3: Persist the corrected records to a new table
# ---------------------------
# Destination table for the processed DataFrame.
new_table_name = "corrected_cleancus_overallcleaned_chessis_engine"

# if_exists="replace" overwrites the table when it already exists;
# index=False keeps the DataFrame index out of the written columns.
df.to_sql(
    name=new_table_name,
    con=engine,
    if_exists="replace",
    index=False,
)

print(f"Data successfully loaded into {new_table_name}")

Data successfully loaded into corrected_cleancus_overallcleaned_chessis_engine


In [None]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
from sqlalchemy import create_engine
from fuzzywuzzy import fuzz, process  # Ensure you have fuzzywuzzy installed: pip install fuzzywuzzy

# NOTE(review): this cell repeats the connection/load steps found in the first cell of
# this notebook — consider keeping only one copy.
# Standard-library helpers for credential handling (kept in this cell so it runs on its own).
import os
from getpass import getpass
from urllib.parse import quote_plus

# ---------------------------
# Step 0: Database Connection
# ---------------------------
# Connection settings come from the conventional PostgreSQL environment variables.
# Non-secret settings fall back to the local defaults this notebook has always used.
# The password is never stored in the notebook: it is read from PGPASSWORD, or
# prompted for interactively when that variable is unset or empty.
db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}

# URL-escape the credentials so characters such as '@', '/' or ':' inside them
# cannot be mistaken for URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# ---------------------------
# Step 1: Load Data and Create Key Column
# ---------------------------
# Pull the whole cleaned table, ordered by chassis number, engine number,
# insured name and policy start date.
query = """
SELECT * FROM public.cleancus_overallcleaned_chessis_engine
ORDER BY "Cleaned Chassis Number", "Cleaned Engine Number", "Cleaned insured name_filled", "policy start date";
"""
df = pd.read_sql(query, con=engine)

In [None]:
# NOTE(review): this cell repeats the name-correction steps found earlier in this
# notebook — consider keeping only one copy.
# Build one grouping key per row from the chassis and engine numbers.
# NOTE(review): astype(str) turns missing values into literal text, so rows that lack
# both identifiers end up sharing a single key — confirm upstream cleaning rules this out.
df['chassis_engine_key'] = (
    df['Cleaned Chassis Number'].astype(str) + '_' + df['Cleaned Engine Number'].astype(str)
)

# Order rows by key and then by policy start date so each key's records are
# contiguous and chronological. Rebinding (rather than inplace=True) keeps the
# step explicit and chain-friendly.
df = df.sort_values(["chassis_engine_key", "policy start date"])

# ---------------------------
# Step 2: Sequential Name Correction
# ---------------------------
# Minimum fuzz.ratio score (0-100) at which the current name is treated as a
# variant of the previous corrected name within the same chassis_engine_key.
NAME_SIMILARITY_THRESHOLD = 80

# State carried from one row to the next.
prev_name = None
prev_chassis = None

corrected_names = []
similarity_scores = []

# Iterate only over the two columns that are needed. df.iterrows() would build a
# full Series for every row, which is needlessly slow on large tables.
for chassis_engine_key, current_name in zip(
    df["chassis_engine_key"], df["Cleaned insured name_filled"]
):
    # Treat a missing name as an empty string so it can be scored.
    if pd.isnull(current_name):
        current_name = ""

    # A name is replaced only when the previous row belongs to the same key and
    # the two names are at least NAME_SIMILARITY_THRESHOLD similar; otherwise
    # (new key, or a clearly different name) the current name is kept.
    same_key = prev_name is not None and prev_chassis == chassis_engine_key
    if same_key and fuzz.ratio(prev_name, current_name) >= NAME_SIMILARITY_THRESHOLD:
        corrected = prev_name
    else:
        corrected = current_name

    corrected_names.append(corrected)
    # Score of the chosen (corrected) name against the name as it appeared in the row.
    similarity_scores.append(fuzz.ratio(corrected, current_name))

    # The corrected name, not the raw one, is what the next row is compared against.
    prev_name = corrected
    prev_chassis = chassis_engine_key

# Attach the results; the lists are in the same order as the sorted rows.
df["corrected_name"] = corrected_names
df["name_similarity"] = similarity_scores