In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',         
    'database': 'postgres',
    'user': 'postgres',        
    'password': 'abc', 
    'port': '5432'               
}

# Create connection string
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Fetch data from the database
query = "SELECT * FROM merged_2023_2024_base;" 
df = pd.read_sql(query, con=engine)

In [None]:
# Step 2: Convert relevant columns to datetime format
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'])

# Step 3: Identify duplicates based on specific columns
duplicates = df[df.duplicated(subset=['Policy Number', 'Policy Start Date', 'Policy End Date'], keep=False)]

# Step 4: Prioritize duplicates based on null values and BOOKED column
def prioritize_rows(group):
    # Add a null count column and sort by it and BOOKED
    group = group.assign(null_count=group.isnull().sum(axis=1))
    group = group.sort_values(by=['null_count', 'BOOKED'], ascending=[True, False])
    return group.iloc[0]

cleaned_duplicates = duplicates.groupby(['Policy Number', 'Policy Start Date', 'Policy End Date']).apply(prioritize_rows).reset_index(drop=True)

# Step 5: Remove duplicates from the original dataframe
df_cleaned = pd.concat([df, cleaned_duplicates]).drop_duplicates(keep=False)

# Step 6: Combine removed duplicates for the output file
removed_rows = pd.concat([df, df_cleaned]).drop_duplicates(keep=False)

# Step 7: Save results to CSV files (optional)
removed_rows.to_csv('duplicates_data (Merged data).csv', index=False)
df_cleaned.to_csv('cleaned_Merged_Base_dataset.csv', index=False)

# Optional Step: Save cleaned data back to the database
df_cleaned.to_sql('cleaned_Merged_Base_Data', con=engine, if_exists='replace', index=False)  

In [4]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',         
    'database': 'postgres',
    'user': 'postgres',        
    'password': 'abc', 
    'port': '5432'               
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Fetch data
query = "SELECT * FROM merged_2023_2024_base;" 
df = pd.read_sql(query, con=engine)

# Step 2: Convert dates
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'])

# Step 3: Identify duplicates
duplicates = df[df.duplicated(subset=['Policy No', 'Policy Start Date', 'Policy End Date'], keep=False)]

# Step 4: Prioritize duplicates
def prioritize_rows(group):
    group = group.assign(null_count=group.isnull().sum(axis=1))
    group = group.sort_values(by=['null_count', 'BOOKED'], ascending=[True, False])
    return group.iloc[0]

cleaned_duplicates = duplicates.groupby(['Policy No', 'Policy Start Date', 'Policy End Date']).apply(prioritize_rows).reset_index(drop=True)

# Step 5: Remove duplicates from the original dataframe
df_cleaned = df.loc[~df.index.isin(cleaned_duplicates.index)]

# Step 6: Combine removed duplicates for the output
removed_rows = pd.concat([duplicates, cleaned_duplicates]).drop_duplicates(keep=False)

# Debug final counts
print(f"Initial row count: {len(df)}")
print(f"Duplicate row count: {len(duplicates)}")
print(f"Cleaned row count: {len(df_cleaned)}")
print(f"Removed row count: {len(removed_rows)}")
# Step 7: Save results to CSV files (optional)
# removed_rows.to_csv('duplicates_data (Merged data) 1.csv', index=False)
# df_cleaned.to_csv('cleaned_Merged_Base_dataset 1.csv', index=False)

# Optional Step: Save cleaned data back to the database
df_cleaned.to_sql('cleaned_merged_base_data_check1', con=engine, if_exists='replace', index=False)

  cleaned_duplicates = duplicates.groupby(['Policy No', 'Policy Start Date', 'Policy End Date']).apply(prioritize_rows).reset_index(drop=True)


Initial row count: 1529265
Duplicate row count: 737497
Cleaned row count: 1160517
Removed row count: 1106243


53

In [5]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',         
    'database': 'postgres',
    'user': 'postgres',        
    'password': 'abc', 
    'port': '5432'               
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Fetch data
query = "SELECT * FROM merged_2023_2024_base;" 
df = pd.read_sql(query, con=engine)

# Step 2: Convert dates to datetime format
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'])

# Step 3: Identify duplicates
duplicates = df[df.duplicated(subset=['Policy No', 'Policy Start Date', 'Policy End Date'], keep=False)]

# Step 4: Prioritize duplicates
def prioritize_rows(group):
    # Add null count column for prioritization
    group = group.assign(null_count=group.isnull().sum(axis=1))
    # Sort by null_count (ascending) and BOOKED (descending)
    group = group.sort_values(by=['null_count', 'BOOKED'], ascending=[True, False])
    return group.iloc[0]  # Return the first row after sorting

# Apply prioritization logic to each duplicate group
cleaned_duplicates = (
    duplicates.groupby(['Policy No', 'Policy Start Date', 'Policy End Date'])
    .apply(prioritize_rows)
    .reset_index(drop=True)
)

# Step 5: Ensure uniqueness in the cleaned dataset
df_cleaned = pd.concat([df, cleaned_duplicates]).drop_duplicates(
    subset=['Policy No', 'Policy Start Date', 'Policy End Date'], keep='last'
)

# Step 6: Combine removed duplicates for the output
removed_rows = pd.concat([df, df_cleaned]).drop_duplicates(keep=False)

# Debug row counts
print(f"Initial row count: {len(df)}")
print(f"Duplicate row count: {len(duplicates)}")
print(f"Cleaned unique row count: {len(df_cleaned)}")
print(f"Removed row count: {len(removed_rows)}")

# Optional Step: Save cleaned data back to database
df_cleaned.to_sql('cleaned_merged_base_data_check', con=engine, if_exists='replace', index=False)

  .apply(prioritize_rows)


Initial row count: 1529265
Duplicate row count: 737497
Cleaned unique row count: 1160516
Removed row count: 1106243


356