In [1]:
import os
from datetime import datetime, timedelta
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from dateutil.relativedelta import relativedelta
from sqlalchemy import create_engine

# ---------------------------
# Step 0: Database Connection
# ---------------------------
# Connection settings come from the environment (variable names follow the
# libpq convention) so that no credential is stored in the notebook. Every
# setting except the password falls back to the previous hardcoded value.
# If PGPASSWORD is unset or empty, the password is prompted for interactively;
# set PGPASSWORD when running the notebook unattended.
db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# User and password are URL-escaped: characters such as '@', ':' or '/' in a
# password would otherwise be parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# ---------------------------
# Step 1: Load Data
# ---------------------------
# Reads the whole source table into memory as a DataFrame.
SOURCE_TABLE = "basiccleaned_appended_base_and_pr"
query = f"SELECT * FROM {SOURCE_TABLE};"
df = pd.read_sql(query, con=engine)
print(f"Loaded {len(df)} records from '{SOURCE_TABLE}'.")

Loaded 2112217 records from 'basiccleaned_appended_base_and_pr'.


In [2]:
# ---------------------------
# Step 1.1: Separate Data into Valid and Invalid Groups
# ---------------------------
REG_NO_COLUMN = "Cleaned Reg no"
CHASSIS_COLUMN = "Cleaned Chassis Number"
ENGINE_COLUMN = "Cleaned Engine Number"
MODEL_COLUMN = "model"

# A row is "valid" when Cleaned Reg no is NOT null and does NOT contain 'new'
# (case-insensitive). The mask is computed once and reused for both subsets so
# the two selections are guaranteed to be exact complements.
is_valid = df[REG_NO_COLUMN].notnull() & ~df[REG_NO_COLUMN].str.contains("new", case=False, na=False)

# Valid rows are cleaned below; invalid rows are carried through unchanged.
valid_df = df[is_valid].copy()
invalid_df = df[~is_valid].copy()

print(f"Valid records for cleaning: {len(valid_df)}")
print(f"Invalid records (unchanged): {len(invalid_df)}")

# ---------------------------
# Step 2: Clean Valid Data
# ---------------------------
# Blank out missing values BEFORE casting to str. The reverse order
# (astype(str) then fillna) turns NaN/None into the literal text "nan"/"None",
# which fillna can no longer detect and which could then be selected as the
# "longest" value below and written to the output table.
valid_df[CHASSIS_COLUMN] = valid_df[CHASSIS_COLUMN].fillna("").astype(str)
valid_df[ENGINE_COLUMN] = valid_df[ENGINE_COLUMN].fillna("").astype(str)

print("Creating lookup dictionaries for chassis & engine numbers (valid records only)...")

# Map each (Cleaned Reg no, model) pair to the longest chassis / engine number
# seen for that pair. Ties keep the first value encountered (behaviour of max).
# groupby drops pairs whose key contains a null, so such rows have no entry.
chassis_lookup = (
    valid_df.groupby([REG_NO_COLUMN, MODEL_COLUMN])[CHASSIS_COLUMN]
      .apply(lambda values: max(values, key=len))
      .to_dict()
)
engine_lookup = (
    valid_df.groupby([REG_NO_COLUMN, MODEL_COLUMN])[ENGINE_COLUMN]
      .apply(lambda values: max(values, key=len))
      .to_dict()
)

# Replace each row's chassis / engine number with the value looked up for its
# (reg no, model) pair. Rows without a lookup entry (e.g. null model) keep
# their own value instead of being blanked. Building the keys once with zip
# avoids a slow row-wise DataFrame.apply(axis=1).
group_keys = list(zip(valid_df[REG_NO_COLUMN], valid_df[MODEL_COLUMN]))
valid_df[CHASSIS_COLUMN] = [
    chassis_lookup.get(key, current)
    for key, current in zip(group_keys, valid_df[CHASSIS_COLUMN])
]
valid_df[ENGINE_COLUMN] = [
    engine_lookup.get(key, current)
    for key, current in zip(group_keys, valid_df[ENGINE_COLUMN])
]

# ---------------------------
# Step 3: Combine Cleaned Valid Data with Unchanged Invalid Data
# ---------------------------
# Valid rows come first, followed by invalid rows; the index is renumbered.
final_df = pd.concat([valid_df, invalid_df], ignore_index=True)
assert len(final_df) == len(df), "row count changed while cleaning"
print(f"Total records in final output: {len(final_df)}")

Valid records for cleaning: 1645945
Invalid records (unchanged): 466272
Creating lookup dictionaries for chassis & engine numbers (valid records only)...
Total records in final output: 2112217


In [3]:
# ---------------------------
# Step 4: Write Cleaned Data to the Database
# ---------------------------
# An existing table with this name is replaced (if_exists="replace"); rows are
# inserted in batches of 10,000 and the DataFrame index is not written.
TARGET_TABLE = "cleanchassisengine_basiccleaned_appended_base_and_pr"

print(f"Loading cleaned data into '{TARGET_TABLE}'...")
final_df.to_sql(
    name=TARGET_TABLE,
    con=engine,
    index=False,
    chunksize=10000,
    if_exists="replace",
)
print(f"Cleaned data successfully loaded into '{TARGET_TABLE}'.")

Loading cleaned data into 'cleanchassisengine_basiccleaned_appended_base_and_pr'...
Cleaned data successfully loaded into 'cleanchassisengine_basiccleaned_appended_base_and_pr'.


In [4]:
import os
from datetime import datetime, timedelta
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from dateutil.relativedelta import relativedelta
from sqlalchemy import create_engine

# ---------------------------
# Step 0: Database Connection
# ---------------------------
# Connection settings come from the environment (variable names follow the
# libpq convention) so that no credential is stored in the notebook. Every
# setting except the password falls back to the previous hardcoded value.
# If PGPASSWORD is unset or empty, the password is prompted for interactively;
# set PGPASSWORD when running the notebook unattended.
db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# User and password are URL-escaped: characters such as '@', ':' or '/' in a
# password would otherwise be parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# ---------------------------
# Step 1: Load Data
# ---------------------------
# Reads the whole source table into memory as a DataFrame.
SOURCE_TABLE = "public.samechassisno_differregno"
query = f"SELECT * FROM {SOURCE_TABLE};"
df = pd.read_sql(query, con=engine)
print(f"Loaded {len(df)} records from '{SOURCE_TABLE}'.")

Loaded 224371 records from 'public.samechassisno_differregno'.


In [5]:
# ---------------------------
# Step 1.1: Separate Data into Valid and Invalid Groups
# ---------------------------
REG_NO_COLUMN = "Cleaned Reg no"
CHASSIS_COLUMN = "Cleaned Chassis Number"
ENGINE_COLUMN = "Cleaned Engine Number"
MODEL_COLUMN = "model"

# A row is "valid" when Cleaned Reg no is NOT null and does NOT contain 'new'
# (case-insensitive). The mask is computed once and reused for both subsets so
# the two selections are guaranteed to be exact complements.
is_valid = df[REG_NO_COLUMN].notnull() & ~df[REG_NO_COLUMN].str.contains("new", case=False, na=False)

# Valid rows are cleaned below; invalid rows are carried through unchanged.
valid_df = df[is_valid].copy()
invalid_df = df[~is_valid].copy()

print(f"Valid records for cleaning: {len(valid_df)}")
print(f"Invalid records (unchanged): {len(invalid_df)}")

# ---------------------------
# Step 2: Clean Valid Data
# ---------------------------
# Blank out missing values BEFORE casting to str. The reverse order
# (astype(str) then fillna) turns NaN/None into the literal text "nan"/"None",
# which fillna can no longer detect and which could then be selected as the
# "longest" value below and written to the output table.
valid_df[CHASSIS_COLUMN] = valid_df[CHASSIS_COLUMN].fillna("").astype(str)
valid_df[ENGINE_COLUMN] = valid_df[ENGINE_COLUMN].fillna("").astype(str)

print("Creating lookup dictionaries for chassis & engine numbers (valid records only)...")

# Map each (Cleaned Reg no, model) pair to the longest chassis / engine number
# seen for that pair. Ties keep the first value encountered (behaviour of max).
# groupby drops pairs whose key contains a null, so such rows have no entry.
chassis_lookup = (
    valid_df.groupby([REG_NO_COLUMN, MODEL_COLUMN])[CHASSIS_COLUMN]
      .apply(lambda values: max(values, key=len))
      .to_dict()
)
engine_lookup = (
    valid_df.groupby([REG_NO_COLUMN, MODEL_COLUMN])[ENGINE_COLUMN]
      .apply(lambda values: max(values, key=len))
      .to_dict()
)

# Replace each row's chassis / engine number with the value looked up for its
# (reg no, model) pair. Rows without a lookup entry (e.g. null model) keep
# their own value instead of being blanked. Building the keys once with zip
# avoids a slow row-wise DataFrame.apply(axis=1).
group_keys = list(zip(valid_df[REG_NO_COLUMN], valid_df[MODEL_COLUMN]))
valid_df[CHASSIS_COLUMN] = [
    chassis_lookup.get(key, current)
    for key, current in zip(group_keys, valid_df[CHASSIS_COLUMN])
]
valid_df[ENGINE_COLUMN] = [
    engine_lookup.get(key, current)
    for key, current in zip(group_keys, valid_df[ENGINE_COLUMN])
]

# ---------------------------
# Step 3: Combine Cleaned Valid Data with Unchanged Invalid Data
# ---------------------------
# Valid rows come first, followed by invalid rows; the index is renumbered.
final_df = pd.concat([valid_df, invalid_df], ignore_index=True)
assert len(final_df) == len(df), "row count changed while cleaning"
print(f"Total records in final output: {len(final_df)}")

Valid records for cleaning: 125956
Invalid records (unchanged): 98415
Creating lookup dictionaries for chassis & engine numbers (valid records only)...
Total records in final output: 224371


In [6]:
# ---------------------------
# Step 4: Write Cleaned Data to the Database
# ---------------------------
# An existing table with this name is replaced (if_exists="replace"); rows are
# inserted in batches of 10,000 and the DataFrame index is not written.
TARGET_TABLE = "cleanchassisengine_samechassisno_differregno"

print(f"Loading cleaned data into '{TARGET_TABLE}'...")
final_df.to_sql(
    name=TARGET_TABLE,
    con=engine,
    index=False,
    chunksize=10000,
    if_exists="replace",
)
print(f"Cleaned data successfully loaded into '{TARGET_TABLE}'.")

Loading cleaned data into 'cleanchassisengine_samechassisno_differregno'...
Cleaned data successfully loaded into 'cleanchassisengine_samechassisno_differregno'.
