In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Connection settings are read from environment variables (named after the
# standard libpq variables) so the database password is not stored in the
# notebook. Host, database, user and port fall back to the values this
# notebook previously hard-coded; PGPASSWORD has no fallback and must be set
# before running this cell (a KeyError is raised otherwise).
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', '10.10.10.71'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ['PGPASSWORD'],
    'port': os.environ.get('PGPORT', '5432')
}
# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise break the URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = 'SELECT * FROM public.cleaned_overall_merged_base_pr_data;'
df = pd.read_sql(query, con=engine)

# Step 2: Define a function to clean text
def clean_name(name):
    """Reduce a raw value to a lower-case, ASCII-alphanumeric comparison key.

    Args:
        name: Any scalar; non-string values are converted with ``str``.

    Returns:
        ``''`` when ``name`` is missing (``pd.isna`` is true); otherwise the
        text with every character outside ``a-z``, ``A-Z`` and ``0-9``
        removed, in lower case.
    """
    if not pd.isna(name):
        alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', str(name))
        return alphanumeric_only.lower()
    return ''

# Step 3: Apply cleaning function to the necessary columns
# Maps each raw column to the name of the cleaned column derived from it.
# The raw names are reproduced exactly, including their irregular spacing.
cleaned_column_for = {
    'Insured name ': 'Cleaned_Insured name',
    'New Branch Name  2': 'Cleaned_New Branch Name 2',
    'state2': 'Cleaned_state2',
}
for raw_column, cleaned_column in cleaned_column_for.items():
    df[cleaned_column] = df[raw_column].apply(clean_name)

# Step 4: Identify anomalies
# A row is anomalous when its policy number carries more than one distinct
# value in any of the checked (cleaned) columns.
group_cols = ['Policy No']
columns_to_check = ['Cleaned_Insured name', 'Cleaned_New Branch Name 2']

# Start with no offending column recorded and no row flagged
df['Non_unique_columns'] = ''
mask = pd.Series(False, index=df.index)
grouped = df.groupby(group_cols)

for col in columns_to_check:
    # Number of distinct values of `col` within each row's policy group
    distinct_in_group = grouped[col].transform('nunique')
    conflicting = distinct_in_group.gt(1)
    mask = mask | conflicting
    # Record the offending column name on every conflicting row
    df.loc[conflicting, 'Non_unique_columns'] = (
        df.loc[conflicting, 'Non_unique_columns'] + (col + ', ')
    )

# Drop the separator left behind after the last recorded column name
df['Non_unique_columns'] = df['Non_unique_columns'].str.rstrip(', ')

# Step 5: Separate anomalous and correct data
anomalous_data = df.loc[mask]
correct_data = df.loc[~mask]

# Step 6: Save the results
anomalies_csv = 'anomalous_data.csv'
corrected_table = 'corrected_merged_data'

# Anomalous rows go to a CSV file for review
anomalous_data.to_csv(anomalies_csv, index=False)

# Consistent rows replace the contents of the database table
correct_data.to_sql(corrected_table, engine, if_exists='replace', index=False)

# Display summary
anomalous_count = len(anomalous_data)
correct_count = len(correct_data)
print(f"Anomalous data saved to '{anomalies_csv}' with {anomalous_count} rows.")
print(f"Correct data saved to '{corrected_table}' table in the database with {correct_count} rows.")

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import Text, Integer, Float, DateTime

# Database connection setup
# Connection settings are read from environment variables (named after the
# standard libpq variables) so the database password is not stored in the
# notebook. Host, database, user and port fall back to the values this
# notebook previously hard-coded; PGPASSWORD has no fallback and must be set
# before running this cell (a KeyError is raised otherwise).
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', '10.10.10.71'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ['PGPASSWORD'],
    'port': os.environ.get('PGPORT', '5432')
}
# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise break the URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load policy data from PostgreSQL
policy_query = 'SELECT * FROM public.corrected_merged_data;'
policy_data = pd.read_sql(policy_query, con=engine)

# Step 2: Load claim data from Excel
claim_data = pd.read_excel('unique_rows(claim).xlsx')

# Step 3: Convert date columns to datetime format
# Values that cannot be parsed become NaT (errors='coerce') instead of raising.
date_columns_by_frame = (
    (policy_data, ['Policy Start Date', 'Policy End Date']),
    (claim_data, ['Policy Start Date_claim', 'Policy End Date_claim']),
)
for frame, date_columns in date_columns_by_frame:
    for date_column in date_columns:
        frame[date_column] = pd.to_datetime(frame[date_column], errors='coerce')

# Step 4: Merge the datasets on the specified columns
# Left join: every policy row is kept; claim columns are filled in where the
# policy number and both policy dates match.
policy_keys = ['Policy No', 'Policy Start Date', 'Policy End Date']
claim_keys = ['Policy No (Str)', 'Policy Start Date_claim', 'Policy End Date_claim']
merged_data = policy_data.merge(
    claim_data,
    how='left',
    left_on=policy_keys,
    right_on=claim_keys,
    suffixes=('_policy', '_claim'),  # Disambiguate columns present in both frames
)

# Step 5: Map pandas dtypes to SQLAlchemy types dynamically
def map_column_types(df):
    """Build a column-name -> SQLAlchemy type mapping for ``DataFrame.to_sql``.

    Integer columns map to ``BigInteger`` rather than ``Integer``: pandas
    integer columns are 64-bit by default, while ``Integer`` creates a 32-bit
    column in PostgreSQL, so any value above 2,147,483,647 would be rejected
    on insert.

    Args:
        df: DataFrame whose columns are to be written to the database.

    Returns:
        dict keyed by column name, in column order. Integer dtypes map to
        ``BigInteger``, float dtypes to ``Float``, datetime dtypes to
        ``DateTime`` and everything else (object, string, bool, ...) to
        ``Text``.
    """
    # Imported locally because the notebook's import cell does not bring in
    # BigInteger.
    from sqlalchemy.types import BigInteger

    dtype_mapping = {}
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_integer_dtype(column):
            dtype_mapping[col] = BigInteger
        elif pd.api.types.is_float_dtype(column):
            dtype_mapping[col] = Float
        elif pd.api.types.is_datetime64_any_dtype(column):
            dtype_mapping[col] = DateTime
        else:
            # Object/string columns and any other dtype are stored as text
            dtype_mapping[col] = Text
    return dtype_mapping

# Column-name -> SQL type mapping derived from the merged frame's dtypes
dtype_mapping = map_column_types(merged_data)

# Step 6: Save the merged data to PostgreSQL
merged_table_name = 'corrected_merged_claim_data'
to_sql_options = {
    'con': engine,
    'if_exists': 'replace',  # overwrite the table left by any previous run
    'index': False,
    'dtype': dtype_mapping,
}
merged_data.to_sql(merged_table_name, **to_sql_options)

# Step 7: Save the merged data to a CSV file (optional)
# merged_data.to_csv('merged_policy_claim_data(Liberty).csv', index=False)

# Display success message
print(f"Merged data has been saved to the '{merged_table_name}' table in the database.")


Merged data has been saved to the 'corrected_merged_claim_data' table in the database.


In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
# Connection settings are read from environment variables (named after the
# standard libpq variables) so the database password is not stored in the
# notebook. Host, database, user and port fall back to the values this
# notebook previously hard-coded; PGPASSWORD has no fallback and must be set
# before running this cell (a KeyError is raised otherwise).
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', '10.10.10.71'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ['PGPASSWORD'],
    'port': os.environ.get('PGPORT', '5432')
}
# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise break the URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = 'SELECT * FROM public.corrected_merged_claim_data;'
db_data = pd.read_sql(query, con=engine)

# Load the state-to-zone mapping from CSV
zone_mapping = pd.read_csv('Indian_States_Direction_Final.csv')  # Replace with your file path


def _normalize_state(names):
    """Lower-case a Series of state names and drop every non-alphanumeric character."""
    return names.str.replace(r'[^a-zA-Z0-9]', '', regex=True).str.lower()


# Normalise state names the same way on both sides. 'Cleaned_state2' was built
# by removing every non-alphanumeric character (spaces included), so the
# mapping keys must have inner spaces and punctuation removed as well;
# lower-casing and trimming the ends alone would leave multi-word names such
# as "tamil nadu" unable to match "tamilnadu".
db_data['Cleaned_state2'] = _normalize_state(db_data['Cleaned_state2'])
zone_mapping['State and UT'] = _normalize_state(zone_mapping['State and UT'])

# Step 2: Fill NULL values in Zone 2 based on Cleaned_state2
state_zone_dict = dict(zip(zone_mapping['State and UT'], zone_mapping['Zone']))
# Vectorised lookup: existing zones are kept, missing ones take the zone mapped
# from the state, and rows whose state is not in the mapping stay null.
db_data['Zone 2'] = db_data['Zone 2'].fillna(db_data['Cleaned_state2'].map(state_zone_dict))

# Step 3: Resolve conflicts for the same Policy No with different Zone 2 values
# Unparseable dates become NaT (errors='coerce') instead of raising.
db_data['Policy Start Date'] = pd.to_datetime(db_data['Policy Start Date'], errors='coerce')

# Sort by Policy No and Policy Start Date (ascending)
db_data = db_data.sort_values(by=['Policy No', 'Policy Start Date'])

# Identify and resolve conflicting Zone 2 values
def resolve_zone_conflicts(group):
    """Give all rows of one policy the same 'Zone 2' value.

    When the group holds more than one distinct non-null 'Zone 2', the zone of
    the row with the earliest 'Policy Start Date' is written to every row.
    Only rows that carry a zone are considered, so a missing zone on the
    oldest row cannot wipe out the known values. If none of those rows has a
    usable start date, the first zone in the group's current row order is
    used instead of failing.

    Args:
        group: DataFrame with 'Zone 2' and 'Policy Start Date' columns.

    Returns:
        ``group`` itself when there is no conflict; otherwise a new DataFrame
        with 'Zone 2' overwritten. The input frame is never modified.
    """
    if group['Zone 2'].nunique() <= 1:
        return group

    # nunique() > 1 guarantees at least two rows with a non-null zone
    candidates = group[group['Zone 2'].notna()]
    dated = candidates[candidates['Policy Start Date'].notna()]
    if not dated.empty:
        # mergesort is stable: among equal start dates the first row wins
        candidates = dated.sort_values('Policy Start Date', kind='mergesort')
    oldest_zone = candidates['Zone 2'].iloc[0]
    return group.assign(**{'Zone 2': oldest_zone})

# Resolve each policy's rows separately. The groups are iterated explicitly
# instead of going through groupby(...).apply(...): apply drops rows whose
# 'Policy No' is null and relies on passing the grouping column to the
# function, which newer pandas versions deprecate.
has_policy_no = db_data['Policy No'].notna()
resolved_groups = [
    resolve_zone_conflicts(policy_rows)
    for _, policy_rows in db_data[has_policy_no].groupby('Policy No')
]

# Rows without a policy number cannot conflict with one another; keep them
# unchanged rather than losing them.
rows_without_policy_no = db_data[~has_policy_no]
if not rows_without_policy_no.empty:
    resolved_groups.append(rows_without_policy_no)

# An empty table yields no pieces at all; db_data is then left as it is
if resolved_groups:
    db_data = pd.concat(resolved_groups)

# Step 4: Save the corrected data back to PostgreSQL
db_data.to_sql('corrected_merged_claim_data_final', con=engine, if_exists='replace', index=False)

print("The corrected data has been saved to the 'corrected_merged_claim_data_final' table in the database.")