In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Connection settings are read from the standard libpq environment variables
# (PGHOST, PGDATABASE, PGUSER, PGPASSWORD, PGPORT) so that no secret is stored
# in the notebook. If PGPASSWORD is unset the password is prompted for.
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = 'SELECT * FROM public.pre_table;'
df = pd.read_sql(query, con=engine)

# Step 4: Identify anomalies
# A row is anomalous when, among all rows sharing its 'policy no', at least one
# of the checked columns holds more than one distinct value.
group_cols = ['policy no']
columns_to_check = ['cleaned insured name', 'Cleaned Branch Name 2']

# Single source of truth for the output table, used by both the write and the
# summary message below (the message previously named a different table).
corrected_table_name = 'corrected_pre_table'

# Column listing, per row, which checked columns are inconsistent in its group
df['Non_unique_columns'] = ''

# Boolean mask: True for every row that belongs to an anomalous group
mask = pd.Series(False, index=df.index)
grouped = df.groupby(group_cols)

for col in columns_to_check:
    # Number of distinct values of `col` within the row's group, broadcast back
    # to every row of that group
    unique_within_group = grouped[col].transform('nunique')
    col_mask = unique_within_group > 1
    mask |= col_mask
    df.loc[col_mask, 'Non_unique_columns'] += col + ', '

# Remove the trailing ', ' separator left by the loop above
df['Non_unique_columns'] = df['Non_unique_columns'].str.rstrip(', ')

# Step 5: Separate anomalous and correct data
anomalous_data = df[mask]
correct_data = df[~mask]

# Step 6: Save the results
# Anomalous rows go to a CSV file for manual review
anomalous_data.to_csv('anomalous_data.csv', index=False)

# Consistent rows go back to the database, replacing any existing table
correct_data.to_sql(corrected_table_name, engine, if_exists='replace', index=False)

# Display summary
print(f"Anomalous data saved to 'anomalous_data.csv' with {len(anomalous_data)} rows.")
print(f"Correct data saved to '{corrected_table_name}' table in the database with {len(correct_data)} rows.")

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import BigInteger, DateTime, Float, Integer, Text

# Database connection setup
# Connection settings are read from the standard libpq environment variables
# (PGHOST, PGDATABASE, PGUSER, PGPASSWORD, PGPORT) so that no secret is stored
# in the notebook. If PGPASSWORD is unset the password is prompted for.
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Pull the policy table from PostgreSQL
policy_query = 'SELECT * FROM mapoldpolicy_handled_bookedcase_base_pr;'
policy_data = pd.read_sql(policy_query, con=engine)

# Step 2: Read the claim extract (a CSV file)
claim_data = pd.read_csv('unique_rows(claim).csv')

# Step 3: Parse the date columns used as join keys; values that cannot be
# parsed become NaT because of errors='coerce'
for date_col in ('policy start date', 'policy end date'):
    policy_data[date_col] = pd.to_datetime(policy_data[date_col], errors='coerce')
for date_col in ('Policy Start Date_claim', 'Policy End Date_claim'):
    claim_data[date_col] = pd.to_datetime(claim_data[date_col], errors='coerce')

# Step 4: Left-join claims onto policies so every policy row is kept
policy_keys = ['policy no', 'policy start date', 'policy end date']
claim_keys = ['Policy No (Str)', 'Policy Start Date_claim', 'Policy End Date_claim']
merged_data = policy_data.merge(
    claim_data,
    how='left',
    left_on=policy_keys,
    right_on=claim_keys,
    suffixes=('_policy', '_claim'),  # disambiguate columns present on both sides
)

# Step 5: Map pandas dtypes to SQLAlchemy types dynamically
def map_column_types(df):
    """Build a ``dtype`` mapping for ``DataFrame.to_sql`` from a frame's pandas dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are about to be written to a SQL table.

    Returns
    -------
    dict
        Maps each column name to a SQLAlchemy type class:

        * integer columns -> ``Integer``, or ``BigInteger`` when the dtype is
          wider than 32 bits or is unsigned 32-bit;
        * float columns -> ``Float``;
        * datetime columns -> ``DateTime``;
        * everything else (object, string, bool, ...) -> ``Text``.
    """
    dtype_mapping = {}
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            col_dtype = df[col].dtype
            # PostgreSQL INTEGER is a signed 32-bit type; int64 (the pandas
            # default) and uint32 values can exceed its range and make the
            # insert fail, so those dtypes are stored as BIGINT instead.
            exceeds_int32 = col_dtype.itemsize > 4 or (
                col_dtype.kind == 'u' and col_dtype.itemsize == 4
            )
            dtype_mapping[col] = BigInteger if exceeds_int32 else Integer
        elif pd.api.types.is_float_dtype(df[col]):
            dtype_mapping[col] = Float
        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            dtype_mapping[col] = DateTime
        else:
            # Object/string columns and any dtype not handled above
            dtype_mapping[col] = Text
    return dtype_mapping

# Column name -> SQLAlchemy type, derived from the merged frame's dtypes
dtype_mapping = map_column_types(merged_data)

# Step 6: Write the merged frame to PostgreSQL, replacing any existing table
merged_table_name = 'overallcorrected_base_pr_claim'
merged_data.to_sql(
    name=merged_table_name,
    con=engine,
    dtype=dtype_mapping,
    index=False,
    if_exists='replace',
)

# Step 7 (optional): also keep a CSV copy of the merged data
# merged_data.to_csv('merged_policy_claim_data(Liberty).csv', index=False)

# Report where the result was written
print(f"Merged data has been saved to the '{merged_table_name}' table in the database.")

  claim_data = pd.read_csv('unique_rows(claim).csv')
  claim_data['Policy Start Date_claim'] = pd.to_datetime(claim_data['Policy Start Date_claim'], errors='coerce')
  claim_data['Policy End Date_claim'] = pd.to_datetime(claim_data['Policy End Date_claim'], errors='coerce')


Merged data has been saved to the 'overallcorrected_base_pr_claim' table in the database.


In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Connection settings are read from the standard libpq environment variables
# (PGHOST, PGDATABASE, PGUSER, PGPASSWORD, PGPORT) so that no secret is stored
# in the notebook. If PGPASSWORD is unset the password is prompted for.
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = 'SELECT * FROM public.corrected_pre_table_claim;'
df = pd.read_sql(query, con=engine)

# Build a customer key from insured name + branch, then number the distinct
# keys from 1000001 upwards (stored as strings).
# NOTE(review): astype(str) turns missing names/branches into text, so rows
# missing both fields may end up sharing one CustomerID - confirm this is intended.
df['CustomerID_Base'] = (df['cleaned insured name'].astype(str) + '_' +
                         df['Cleaned Branch Name 2'].astype(str))
df['CustomerID'] = (df.groupby('CustomerID_Base').ngroup() + 1000001).astype(str)

# Convert dates to datetime; values that cannot be parsed become NaT
df['policy start date'] = pd.to_datetime(df['policy start date'], errors='coerce')
df['policy end date'] = pd.to_datetime(df['policy end date'], errors='coerce')


# policy_status_map = {'0.0': 'Not Renewed', '1.0': 'Renewed', '2.0' : 'Open'}
# df['Policy Status'] = df['upd_booked'].map(policy_status_map)

# Step 2: Policy tenure of each individual policy, in calendar months
# (year difference * 12 + month difference; the day of month is ignored)
df['Policy Tenure Month'] = ((df['policy end date'].dt.year - df['policy start date'].dt.year) * 12 +
                             (df['policy end date'].dt.month - df['policy start date'].dt.month))

# Policy tenure in years, rounded to a whole number
df['Policy Tenure'] = (df['Policy Tenure Month'] / 12).round(0)

# Calendar years of the start and end dates, used for grouping below and for
# later per-year calculations
df['Start Year'] = df['policy start date'].dt.year
df['End Year'] = df['policy end date'].dt.year

# Step 3: Per customer and start year, take the earliest start date and the
# latest end date
yearly_tenure = (
    df.groupby(['CustomerID', 'Start Year'])
    .agg({'policy start date': 'min', 'policy end date': 'max'})
    .reset_index()
)

# Months spanned by that earliest-start / latest-end window
yearly_tenure['Yearly Tenure (Months)'] = (
    (yearly_tenure['policy end date'].dt.year - yearly_tenure['policy start date'].dt.year) * 12 +
    (yearly_tenure['policy end date'].dt.month - yearly_tenure['policy start date'].dt.month)
)

# Step 4: Running total of the yearly spans per customer. groupby sorts its
# keys by default, so rows are accumulated in ascending Start Year order.
yearly_tenure['Cumulative Tenure (Months)'] = (
    yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)']
    .cumsum()
)

# Cumulative tenure expressed in years (exact and rounded)
yearly_tenure['Tenure Decimal'] = yearly_tenure['Cumulative Tenure (Months)'] / 12
yearly_tenure['Customer Tenure'] = yearly_tenure['Tenure Decimal'].round(0)

# Columns to attach to the policy-level rows
tenure_mapping = yearly_tenure[['CustomerID', 'Start Year', 'Cumulative Tenure (Months)', 'Tenure Decimal', 'Customer Tenure']]

# Step 5: Map back to the original data
df = df.merge(tenure_mapping, on=['CustomerID', 'Start Year'], how='left')

# Step 8: Add New Customers column
# A row counts as "new customer" when its start year is the customer's first
# policy year. Computed column-wise rather than with a row-wise df.apply: the
# resulting strings are the same, it avoids a Python call per row, and it does
# not rely on how df.apply(axis=1) behaves for an empty frame.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
is_first_year = df['Start Year'] == df['FirstPolicyYear']
df['New_Customer_ID'] = (
    df['FirstPolicyYear'].astype(str) + '_' + df['CustomerID']
).where(is_first_year, '')
df['New Customers'] = is_first_year.map({True: 'Yes', False: 'No'})

# Step 10: Calculate year-wise churn status
def calculate_churn_status(group):
    """Label a group of policy statuses as churned or not.

    Parameters
    ----------
    group : pandas.Series
        Policy status values of one group.

    Returns
    -------
    str
        'Yes' when the group has exactly one distinct value and that value is
        'Not Renewed'; 'No' in every other case (including an empty group).
    """
    distinct = group.unique()
    only_not_renewed = len(distinct) == 1 and distinct[0] == 'Not Renewed'
    return 'Yes' if only_not_renewed else 'No'

# Churn label per (customer, end year): 'Yes' only when every status in that
# group is 'Not Renewed'.
# NOTE(review): 'Policy Status' is expected to already be a column of the
# loaded table - confirm.
df['Churn Label'] = df.groupby(['CustomerID', 'End Year'])['Policy Status'].transform(calculate_churn_status)

from sqlalchemy import text

# Save the processed data into PostgreSQL
processed_table_name = 'overall_cleaned_base_and_pr_ef'  # Target table name

# Run the DROP and the load on ONE connection inside ONE transaction.
# Previously the DROP ran on a connection that was never committed while
# to_sql used a second connection from the engine; under SQLAlchemy 2.x the
# uncommitted DROP keeps its table lock, which can block the load indefinitely.
# engine.begin() commits when the block succeeds and rolls back on error.
with engine.begin() as connection:
    # Drop the table if it exists
    drop_query = f"DROP TABLE IF EXISTS {processed_table_name};"
    connection.execute(text(drop_query))  # Execute the drop statement
    print(f"Table {processed_table_name} dropped successfully.")

    # Load the new data into the table using the same connection
    df.to_sql(processed_table_name, con=connection, if_exists='replace', index=False)
    print(f"Data loaded into {processed_table_name} successfully.")

In [1]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Connection settings are read from the standard libpq environment variables
# (PGHOST, PGDATABASE, PGUSER, PGPASSWORD, PGPORT) so that no secret is stored
# in the notebook. If PGPASSWORD is unset the password is prompted for.
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = 'SELECT * FROM public.overallcorrected_base_pr_claim;'
df = pd.read_sql(query, con=engine)

# Build a customer key from corrected name + branch, then number the distinct
# keys from 1000001 upwards (stored as strings).
# NOTE(review): astype(str) turns missing names/branches into text, so rows
# missing both fields may end up sharing one CustomerID - confirm this is intended.
df['CustomerID_Base'] = (df['corrected_name'].astype(str) + '_' +
                         df['Cleaned Branch Name 2'].astype(str))
df['CustomerID'] = (df.groupby('CustomerID_Base').ngroup() + 1000001).astype(str)

# Convert dates to datetime; values that cannot be parsed become NaT
df['policy start date'] = pd.to_datetime(df['policy start date'], errors='coerce')
df['policy end date'] = pd.to_datetime(df['policy end date'], errors='coerce')


# policy_status_map = {'0.0': 'Not Renewed', '1.0': 'Renewed', '2.0' : 'Open'}
# df['Policy Status'] = df['upd_booked'].map(policy_status_map)

# Step 2: Policy tenure of each individual policy, in calendar months
# (year difference * 12 + month difference; the day of month is ignored)
df['Policy Tenure Month'] = ((df['policy end date'].dt.year - df['policy start date'].dt.year) * 12 +
                             (df['policy end date'].dt.month - df['policy start date'].dt.month))

# Policy tenure in years, rounded to a whole number
df['Policy Tenure'] = (df['Policy Tenure Month'] / 12).round(0)

# Calendar years of the start and end dates, used for grouping below and for
# later per-year calculations
df['Start Year'] = df['policy start date'].dt.year
df['End Year'] = df['policy end date'].dt.year

# Step 3: Per customer and start year, take the earliest start date and the
# latest end date
yearly_tenure = (
    df.groupby(['CustomerID', 'Start Year'])
    .agg({'policy start date': 'min', 'policy end date': 'max'})
    .reset_index()
)

# Months spanned by that earliest-start / latest-end window
yearly_tenure['Yearly Tenure (Months)'] = (
    (yearly_tenure['policy end date'].dt.year - yearly_tenure['policy start date'].dt.year) * 12 +
    (yearly_tenure['policy end date'].dt.month - yearly_tenure['policy start date'].dt.month)
)

# Step 4: Running total of the yearly spans per customer. groupby sorts its
# keys by default, so rows are accumulated in ascending Start Year order.
yearly_tenure['Cumulative Tenure (Months)'] = (
    yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)']
    .cumsum()
)

# Cumulative tenure expressed in years (exact and rounded)
yearly_tenure['Tenure Decimal'] = yearly_tenure['Cumulative Tenure (Months)'] / 12
yearly_tenure['Customer Tenure'] = yearly_tenure['Tenure Decimal'].round(0)

# Columns to attach to the policy-level rows
tenure_mapping = yearly_tenure[['CustomerID', 'Start Year', 'Cumulative Tenure (Months)', 'Tenure Decimal', 'Customer Tenure']]

# Step 5: Map back to the original data
df = df.merge(tenure_mapping, on=['CustomerID', 'Start Year'], how='left')

# Step 8: Add New Customers column
# A row counts as "new customer" when its start year is the customer's first
# policy year. Computed column-wise rather than with a row-wise df.apply: the
# resulting strings are the same, it avoids a Python call per row, and it does
# not rely on how df.apply(axis=1) behaves for an empty frame.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
is_first_year = df['Start Year'] == df['FirstPolicyYear']
df['New_Customer_ID'] = (
    df['FirstPolicyYear'].astype(str) + '_' + df['CustomerID']
).where(is_first_year, '')
df['New Customers'] = is_first_year.map({True: 'Yes', False: 'No'})

# Step 10: Calculate year-wise churn status
def calculate_churn_status(group):
    """Label a group of policy statuses as churned or not.

    Parameters
    ----------
    group : pandas.Series
        Policy status values of one group.

    Returns
    -------
    str
        'Yes' when the group has exactly one distinct value and that value is
        'Not Renewed'; 'No' in every other case (including an empty group).
    """
    distinct = group.unique()
    only_not_renewed = len(distinct) == 1 and distinct[0] == 'Not Renewed'
    return 'Yes' if only_not_renewed else 'No'

# Churn label per (customer, end year), broadcast back onto every row of the group
status_by_customer_year = df.groupby(['CustomerID', 'End Year'])['Policy Status']
df['Churn Label'] = status_by_customer_year.transform(calculate_churn_status)

from sqlalchemy import text

# Persist the processed frame. if_exists='replace' drops and recreates the
# target table, so no separate DROP TABLE statement is issued here.
df.to_sql(
    name='overall_cleaned_base_and_pr_ef',
    con=engine,
    index=False,
    if_exists='replace',
)

150