In [3]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# Define database connection parameters
db_params = {
    'dbname': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'host': 'localhost',
    'port': '5432'
}

excel_df = pd.read_excel('cleaned_PR dataset.xlsx')

# Rename column to match with database for easier comparison
excel_df.rename(columns={'Initial Policy No': 'Policy No'}, inplace=True)

# Create a database connection
try:
    engine = create_engine(f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}")
    with engine.connect() as conn:
        # Load Policy No from the database
        db_query = "SELECT DISTINCT \"Policy No\" FROM public.check_test_data_06_12;"
        db_df = pd.read_sql(db_query, conn)

        # Find matching Policy Nos
        matching_policies = excel_df['Policy No'].isin(db_df['Policy No'])
        match_count = matching_policies.sum()

        print(f"Number of matching Policy Nos: {match_count}")
except Exception as e:
    print(f"Error: {e}")
finally:
    engine.dispose()

Number of matching Policy Nos: 160852


In [2]:
pip install psycopg2

Collecting psycopg2
  Downloading psycopg2-2.9.10-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Downloading psycopg2-2.9.10-cp312-cp312-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
    --------------------------------------- 0.0/1.2 MB 682.7 kB/s eta 0:00:02
   ------- -------------------------------- 0.2/1.2 MB 3.5 MB/s eta 0:00:01
   ---------------- ----------------------- 0.5/1.2 MB 4.2 MB/s eta 0:00:01
   ---------------------- ----------------- 0.7/1.2 MB 3.8 MB/s eta 0:00:01
   ------------------------------ --------- 0.9/1.2 MB 4.0 MB/s eta 0:00:01
   ---------------------------------------  1.2/1.2 MB 4.3 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 3.9 MB/s eta 0:00:00
Installing collected packages: psycopg2
Successfully installed psycopg2-2.9.10
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Load data
query = "SELECT * FROM test_data_06_12;"
df = pd.read_sql(query, con=engine)

# Step 2: Convert dates to datetime format
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'])

# Step 3: Deduplicate and prioritize
duplicates = df[df.duplicated(subset=['Policy No', 'Policy Start Date', 'Policy End Date'], keep=False)]

def prioritize_rows(group):
    group = group.assign(null_count=group.isnull().sum(axis=1))
    group = group.sort_values(by=['null_count', 'booked'], ascending=[True, False])
    return group.iloc[0]

cleaned_duplicates = (
    duplicates.groupby(['Policy No', 'Policy Start Date', 'Policy End Date'])
    .apply(prioritize_rows)
    .reset_index(drop=True)
)

df_cleaned = pd.concat([df, cleaned_duplicates]).drop_duplicates(subset=['Policy No', 'Policy Start Date', 'Policy End Date'], keep='last')

# Step 4: Handle NULL values in BOOKED
today = datetime.now().date()
df_cleaned['booked'] = df_cleaned.apply(
    lambda row: 0 if pd.isnull(row['booked']) and row['Policy End Date'].date() < today else ('-' if pd.isnull(row['booked']) else row['booked']),
    axis=1
)

# Step 5: Correct BOOKED values based on Type
correction_count = 0

def correct_booked(group):
    global correction_count
    type_a = group[group['Type'] == 'A']
    type_b = group[group['Type'] == 'B']
    
    # Check if both Type A and Type B exist for the same Policy No
    if not type_a.empty and not type_b.empty:
        # Check if BOOKED is 0 for Type A
        if (type_a.iloc[0]['booked'] == 0):
            correction_count += 1
            # Update BOOKED to 1 for Type A
            group.loc[group['Type'] == 'A', 'booked'] = 1
    return group

# Apply the correction function to ensure Type A rows are properly updated
type_a_b = df_cleaned[df_cleaned['Type'].isin(['A', 'B'])]
type_a_b_grouped = type_a_b.groupby('Policy No')

df_cleaned = type_a_b_grouped.apply(correct_booked).reset_index(drop=True)

# Output the number of corrections made
print(f"Number of corrections made: {correction_count}")

# Optional Step: Save the cleaned dataset back to the database
df_cleaned.to_sql('cleaned_test_data_06_12', con=engine, if_exists='replace', index=False)

  .apply(prioritize_rows)
  df_cleaned = type_a_b_grouped.apply(correct_booked).reset_index(drop=True)


Number of corrections made: 49


404

In [7]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM cleaned_test_data_06_12;"
df = pd.read_sql(query, con=engine)

# Step 2: Create a `Match` column for all rows in the dataset
# Identify `Policy No` that have corresponding rows with `Type = B`
policy_no_with_b = df[df['Type'] == 'B']['Policy No'].unique()

# Add the `Match` column to the original DataFrame
df['Match'] = df['Policy No'].apply(lambda x: 'Matches B' if x in policy_no_with_b else None)

# Step 3: Separate rows with non-numeric `Total Premium Payable`
# Function to identify non-numeric values in the `Total Premium Payable` column
def is_non_numeric(value):
    try:
        float(value)  # Attempt to convert to a float
        return False  # It's numeric
    except ValueError:
        return True  # It's non-numeric

# Add a helper column to identify non-numeric rows
df['is_non_numeric'] = df['Total Premium Payable '].apply(is_non_numeric)

# Create a DataFrame for non-numeric rows
non_numeric_df = df[df['is_non_numeric']].drop(columns=['is_non_numeric'])

# Step 4: Save the data back to PostgreSQL and export as CSV
# Save the full dataset with the `Match` column
# df.to_sql('updated_data_with_match', con=engine, if_exists='replace', index=False)

# Save the non-numeric rows to a separate table
# non_numeric_df.to_sql('non_numeric_total_premium_with_match', con=engine, if_exists='replace', index=False)

# Export to CSV for validation (optional)
# df.to_csv('updated_data_with_match.csv', index=False)
non_numeric_df.to_csv('non_numeric_total_premium_with_match.csv', index=False)

KeyboardInterrupt: 

In [10]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123', 
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM cleaned_test_data_06_12;"
df = pd.read_sql(query, con=engine)

# Step 2: Create a `Match` column for all rows in the dataset
# Identify `Policy No` that have corresponding rows with `Type = B`
policy_no_with_b = df[df['Type'] == 'B']['Policy No'].unique()

# Add the `Match` column to the original DataFrame
df['Match'] = df['Policy No'].isin(policy_no_with_b).apply(lambda x: 'Matches B' if x else None)

# Step 3: Efficiently separate rows with non-numeric `Total Premium Payable`
# Ensure column name is stripped of extra spaces
df.rename(columns=lambda x: x.strip(), inplace=True)

# Use vectorized operation to identify non-numeric values
non_numeric_mask = ~df['Total Premium Payable'].str.replace('.', '', 1).str.isdigit()

# Create a DataFrame for non-numeric rows
non_numeric_df = df[non_numeric_mask]

# Step 4: Save the data back to PostgreSQL and export as CSV
# Save the full dataset with the `Match` column
# df.to_sql('updated_data_with_match', con=engine, if_exists='replace', index=False)

# Save the non-numeric rows to a separate table
# non_numeric_df.to_sql('non_numeric_total_premium_with_match', con=engine, if_exists='replace', index=False)

# Export to CSV for validation (optional)
# df.to_csv('updated_data_with_match.csv', index=False)
non_numeric_df.to_csv('non_numeric_total_premium_with_match.csv', index=False)

TypeError: bad operand type for unary ~: 'NoneType'

In [11]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM cleaned_test_data_06_12;"
df = pd.read_sql(query, con=engine)

# Step 2: Create a `Match` column for all rows in the dataset
policy_no_with_b = df[df['Type'] == 'B']['Policy No'].unique()
df['Match'] = df['Policy No'].isin(policy_no_with_b).apply(lambda x: 'Matches B' if x else None)

# Step 3: Clean column names
df.rename(columns=lambda x: x.strip(), inplace=True)

# Step 4: Handle missing values and identify non-numeric rows
# Fill missing values with a placeholder string
df['Total Premium Payable'] = df['Total Premium Payable'].fillna('')

# Use vectorized operations to identify non-numeric rows
non_numeric_mask = ~df['Total Premium Payable'].str.replace('.', '', 1).str.isdigit()

# Create a DataFrame for non-numeric rows
non_numeric_df = df[non_numeric_mask]

# Step 5: Save the data back to PostgreSQL and export as CSV
# df.to_sql('updated_data_with_match', con=engine, if_exists='replace', index=False)
# non_numeric_df.to_sql('non_numeric_total_premium_with_match', con=engine, if_exists='replace', index=False)

# df.to_csv('updated_data_with_match.csv', index=False)
non_numeric_df.to_csv('non_numeric_total_premium_with_match.csv', index=False)

In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Retrieve Policy No from PostgreSQL
sql_query = 'SELECT DISTINCT "Policy No" FROM public.check_test_data_06_12;'
sql_data = pd.read_sql(sql_query, con=engine)

# Step 2: Read the Excel file
excel_data = pd.read_excel("cleaned_PR dataset - Copy.xlsx", usecols=["Policy No"])

# Step 3: Combine and find distinct Policy Nos
combined_data = pd.concat([sql_data, excel_data]).drop_duplicates()

# Step 4: Count distinct Policy Nos
distinct_count = combined_data["Policy No"].nunique()

# Display the result
print(f"The distinct count of Policy No across both sources is: {distinct_count}")

The distinct count of Policy No across both sources is: 1164060


In [3]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Retrieve Policy No from PostgreSQL
sql_query = 'SELECT DISTINCT "Policy No" FROM cleaned_merged_base_data_check;'
sql_data = pd.read_sql(sql_query, con=engine)

# Step 2: Read the Excel file
excel_data = pd.read_excel("cleaned_PR dataset - Copy.xlsx", usecols=["Policy No"])

# Step 3: Combine and find distinct Policy Nos
combined_data = pd.concat([sql_data, excel_data]).drop_duplicates()

# Step 4: Count distinct Policy Nos
distinct_count = combined_data["Policy No"].nunique()

# Display the result
print(f"The distinct count of Policy No across both sources is: {distinct_count}")

The distinct count of Policy No across both sources is: 1164060


In [4]:
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Retrieve Policy No from PostgreSQL
sql_query = 'SELECT DISTINCT "Policy No" FROM cleaned_merged_base_data_check;'
sql_data = pd.read_sql(sql_query, con=engine)

# Step 2: Read the Excel file
excel_data = pd.read_excel("cleaned_PR dataset - Copy.xlsx", usecols=["Policy No"])

# Step 3: Find matching policies
# Convert to sets for easy comparison
sql_policy_set = set(sql_data["Policy No"])
excel_policy_set = set(excel_data["Policy No"])

# Find intersection (matching policies)
matching_policies = sql_policy_set.intersection(excel_policy_set)

# Step 4: Count matches
matching_count = len(matching_policies)

# Display the result
print(f"The number of matching Policy Nos between both sources is: {matching_count}")

The number of matching Policy Nos between both sources is: 157233
