# Initializing MinIO and PostgreSQL

In [None]:
import os, sys

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import List as mapping_list
import psycopg2
from io import BytesIO
from minio import Minio
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())


True

In [None]:
# MinIO connection settings are read from the environment (populated from .env
# by load_dotenv in the first cell) so credentials do not have to live in the
# notebook; the defaults are the local-development values.
minio_client = Minio(
    os.getenv("MINIO_ENDPOINT", "localhost:9100"),
    access_key=os.getenv("MINIO_ACCESS_KEY", "minioadmin"),
    secret_key=os.getenv("MINIO_SECRET_KEY", "minioadmin"),
    secure=os.getenv("MINIO_SECURE", "false").strip().lower() in ("1", "true", "yes"),
)

# Bucket that holds the raw input files.
bucket_name = os.getenv("MINIO_BUCKET", "pulse-bucket-1")

In [None]:
# S3A settings for the MinIO instance. Endpoint and credentials are read from
# the environment and fall back to the local-development values.
# NOTE(review): "inferSchema" / "mergeSchema" are normally DataFrameReader
# options rather than session settings - confirm they have any effect here.
spark = (
    SparkSession.builder.appName("NormalizeData")
    .config(
        "spark.hadoop.fs.s3a.endpoint",
        os.getenv("MINIO_S3A_ENDPOINT", "http://localhost:9100"),
    )
    .config(
        "spark.hadoop.fs.s3a.access.key",
        os.getenv("MINIO_ACCESS_KEY", "minioadmin"),
    )
    .config(
        "spark.hadoop.fs.s3a.secret.key",
        os.getenv("MINIO_SECRET_KEY", "minioadmin"),
    )
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("inferSchema", "true")
    .config("mergeSchema", "true")
    .getOrCreate()
)

# Function to load all files from MinIO

In [154]:
# Maps each dataframe key (the value normalize_name returns) to a table name.
# NOTE(review): the values look like the PostgreSQL table names in the
# "public" schema - confirm against the database.
df_to_table = {
    "customers_df": "customers",
    "addresses_df": "addresses",
    "products_df": "products",
    "inventories_df": "inventory",
    "orders_df": "orders",
    "reviews_df": "reviews",
    "categories_df": "categories",
    "wishlists_df": "wishlist",
    "payments_df": "payments",
    "order_items_df": "order_items",
    "shopping_carts_df": "shopping_cart",
    "customer_sessions_df": "customer_sessions",
    "marketing_campaigns_df": "marketing_campaigns",
    "suppliers_df": "suppliers"
}

In [155]:
import os
from difflib import get_close_matches
def normalize_name(name):
    """Resolve a file or dataframe name to the closest key of ``df_to_table``.

    The name is lowercased, every ``"_df"`` substring is removed and a
    trailing file extension is dropped before it is fuzzy-matched (difflib,
    cutoff 0.6) against the known dataframe keys.

    Returns the best-matching key, or ``None`` when nothing is close enough.
    """
    stem, _extension = os.path.splitext(name.lower().replace("_df", ""))
    candidates = get_close_matches(stem, list(df_to_table), n=1, cutoff=0.6)
    return candidates[0] if candidates else None

def get_table_name(df_name):
    """Return the table name registered for ``df_name``, or ``None`` if unknown."""
    key = normalize_name(df_name)
    if key in df_to_table:
        return df_to_table[key]
    return None

In [156]:
import os
import tempfile
import pandas as pd
from io import BytesIO
from pyspark.sql import SparkSession
import re

# File extensions the loader knows how to parse.
_SUPPORTED_EXTENSIONS = (".csv", ".xlsx", ".parquet", ".json")


def load_all_files_from_minio(minio_client, bucket_name, spark):
    """Load every supported object in a bucket as a cached Spark DataFrame.

    Parameters
    ----------
    minio_client : Minio
        Client used to list and download the objects.
    bucket_name : str
        Bucket to scan (recursively).
    spark : SparkSession
        Session used to build the DataFrames.

    Returns
    -------
    dict
        ``{dataframe_key: DataFrame}`` where the key is whatever
        ``normalize_name`` resolves the file's base name to. Files with an
        unsupported extension or an unrecognised name are skipped, and a file
        that fails to load is reported and skipped rather than aborting the run.
    """
    dataframes = {}

    objects = minio_client.list_objects(bucket_name, recursive=True)
    print("Listing available files in the bucket...")

    for obj in objects:
        file_name = obj.object_name
        print(f"Processing file: {file_name}")

        if not file_name.endswith(_SUPPORTED_EXTENSIONS):
            print(f"Skipping {file_name} - unsupported format")
            continue

        try:
            base_name = os.path.splitext(os.path.basename(file_name))[0]
            clean_name = re.sub(r"[^0-9a-zA-Z_]+", "_", base_name)
            norm_name = normalize_name(clean_name)
            if norm_name is None:
                print(f"No match found for {base_name}, skipping.")
                continue
            # Forward the caller's session instead of relying on a global.
            df = load_file_from_minio(minio_client, bucket_name, file_name, spark)
            if norm_name in dataframes:
                # Two files resolved to the same key; the later one wins.
                print(f"Warning: {file_name} replaces an earlier file loaded as {norm_name}")
            dataframes[norm_name] = df
            print(f"✅ Successfully loaded {file_name} as {norm_name}")
        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")

    print(f"Loaded {len(dataframes)} dataframes: {', '.join(dataframes.keys())}")
    return dataframes

import uuid

def load_file_from_minio(minio_client, bucket_name, file_name, spark_session=None):
    """Download one object and return it as a cached Spark DataFrame.

    The object is parsed with pandas according to its extension, written to a
    temporary CSV, and read back by Spark with header and schema inference.

    Parameters
    ----------
    minio_client : Minio
        Client used to download the object.
    bucket_name : str
        Bucket containing the object.
    file_name : str
        Object name; its extension selects the pandas reader.
    spark_session : SparkSession, optional
        Session to read with. Defaults to the module-level ``spark`` so
        existing three-argument calls keep working.

    Raises
    ------
    ValueError
        If the extension is not one of csv / xlsx / parquet / json.
    """
    session = spark if spark_session is None else spark_session

    response = minio_client.get_object(bucket_name, file_name)
    try:
        data = response.read()
    finally:
        # Always hand the connection back, even when the read fails.
        response.close()
        response.release_conn()

    if file_name.endswith(".csv"):
        pdf = pd.read_csv(BytesIO(data))
    elif file_name.endswith(".xlsx"):
        pdf = pd.read_excel(BytesIO(data))
    elif file_name.endswith(".parquet"):
        pdf = pd.read_parquet(BytesIO(data))
    elif file_name.endswith(".json"):
        pdf = pd.read_json(BytesIO(data))
    else:
        raise ValueError(f"Unsupported file format: {file_name}")

    temp_csv = os.path.join(
        tempfile.gettempdir(),
        f"temp_{uuid.uuid4().hex}_{os.path.basename(file_name)}.csv",
    )
    try:
        pdf.to_csv(temp_csv, index=False)
        spark_df = session.read.csv(temp_csv, header=True, inferSchema=True).cache()
        # Force evaluation so the cache is populated before the file is deleted.
        _ = spark_df.count()
    finally:
        # Best-effort cleanup; a missing or locked temp file must not mask the
        # real outcome of the load.
        try:
            os.remove(temp_csv)
        except OSError:
            pass

    return spark_df

In [157]:
# Load every supported, recognised file in the bucket into a dict of cached
# Spark DataFrames keyed by normalized dataframe name.
all_dataframes = load_all_files_from_minio(minio_client, bucket_name, spark)

Listing available files in the bucket...
Processing file: addresses.xlsx
✅ Successfully loaded addresses.xlsx as addresses_df
Processing file: categories.xlsx
✅ Successfully loaded categories.xlsx as categories_df
Processing file: customer_sessions.xlsx
✅ Successfully loaded customer_sessions.xlsx as customer_sessions_df
Processing file: customers.xlsx
✅ Successfully loaded customers.xlsx as customers_df
Processing file: inventory.xlsx
✅ Successfully loaded inventory.xlsx as inventories_df
Processing file: marketing_campaigns.xlsx
✅ Successfully loaded marketing_campaigns.xlsx as marketing_campaigns_df
Processing file: order_items.xlsx
✅ Successfully loaded order_items.xlsx as order_items_df
Processing file: orders.xlsx
✅ Successfully loaded orders.xlsx as orders_df
Processing file: payments.xlsx
✅ Successfully loaded payments.xlsx as payments_df
Processing file: products.xlsx
✅ Successfully loaded products.xlsx as products_df
Processing file: reviews.xlsx
✅ Successfully loaded reviews

In [158]:
all_dataframes

{'addresses_df': DataFrame[addr_id: string, customer_ref: string, address_category: string, street_line1: string, city_name: string, state_region: string, zip_postal: string, country_name: string, default_flag: string, record_created: string, latitude: string, longitude: string, street_line2: string, address_verified: string, last_modified: string],
 'categories_df': DataFrame[cat_id: string, cat_name: string, parent_cat_id: string, active_flag: string, date_created: string, category_desc: string, product_count: string, breadcrumb_path: string, display_sequence: string, hierarchy_level: string, url_slug: string],
 'customer_sessions_df': DataFrame[session_ref: string, user_id: string, start_timestamp: string, end_timestamp: string, device_category: string, traffic_source: string, page_views: string, products_browsed: string, purchase_made: string, cart_abandoned: string, bounce_session: string, entry_page: string, exit_page: string, visitor_type: string, session_duration_sec: string, c

In [159]:
# Connect to PostgreSQL on localhost; database name and credentials are read
# from environment variables.
conn = psycopg2.connect(
    host="localhost",
    database=os.getenv("POSTGRES_DATABASE_NAME"),
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD"),
)

In [160]:
# Ask information_schema for (table, column, data type) of every column in the
# "public" schema; the rows are fetched in the next cell.
cur = conn.cursor()
cur.execute(
    """
    SELECT table_name, column_name, data_type
    FROM information_schema.columns
    WHERE table_schema = 'public'
"""
)

In [161]:
# Materialize the result as a list of (table_name, column_name, data_type) tuples.
columns_info = cur.fetchall()
columns_info

[('products', 'cost_price', 'numeric'),
 ('products', 'selling_price', 'numeric'),
 ('products', 'launch_date', 'date'),
 ('orders', 'order_date', 'timestamp without time zone'),
 ('products', 'weight', 'numeric'),
 ('orders', 'subtotal', 'numeric'),
 ('orders', 'tax_amount', 'numeric'),
 ('orders', 'shipping_cost', 'numeric'),
 ('orders', 'discount_amount', 'numeric'),
 ('orders', 'total_amount', 'numeric'),
 ('customer_sessions', 'session_end', 'timestamp without time zone'),
 ('customers', 'date_of_birth', 'date'),
 ('customers', 'customers_created_at', 'timestamp without time zone'),
 ('orders', 'shipped_date', 'timestamp without time zone'),
 ('orders', 'delivered_date', 'timestamp without time zone'),
 ('orders', 'order_created_at', 'timestamp without time zone'),
 ('customer_sessions', 'pages_viewed', 'integer'),
 ('customer_sessions', 'products_viewed', 'integer'),
 ('products', 'is_digital', 'boolean'),
 ('order_items', 'quantity', 'integer'),
 ('order_items', 'unit_price', 'n

In [162]:
# The schema rows are already in columns_info, so release the cursor and connection.
cur.close()
conn.close()

# Mapping with Predefined Lists

In [163]:
import importlib
# Re-import the mapping module so edits made to it on disk take effect without
# restarting the kernel.
importlib.reload(mapping_list)

<module 'List' from '/mnt/01DBA8B8279979A0/Pulse - E-Commerce Data Analytics Engine/mapping/List.py'>

In [164]:
def normalize_dataframe(df, column_variants, mapped_cols):
    """Rename known column variants to standard names and align ``df`` to the schema.

    Parameters
    ----------
    df : DataFrame
        Incoming Spark DataFrame.
    column_variants : dict
        ``{standard_name: iterable of accepted spellings}``; matching is
        case-insensitive.
    mapped_cols : dict
        Updated in place with ``{standard_name: original column name}`` for
        every column renamed here.

    Returns
    -------
    tuple
        ``(new_df, df_extra, extra_cols, missing_cols, mapped_cols)`` where
        ``new_df`` holds exactly the schema columns (missing ones filled with
        nulls), ``df_extra`` additionally keeps the unrecognised columns,
        ``extra_cols`` lists those unrecognised columns and ``missing_cols``
        lists schema columns that had to be created.

    Notes
    -----
    A standard name is assigned to at most one source column. When several
    source columns would map to the same standard name, a column that already
    carries that exact name wins, otherwise the first one in column order wins.
    The others keep their original name and are reported in ``extra_cols``,
    which avoids duplicate column names and an ambiguous ``select``.
    """
    variant_to_standard = {
        variant.lower(): std_col
        for std_col, variants in column_variants.items()
        for variant in variants
    }

    source_columns = list(df.columns)
    present = set(source_columns)
    assigned = set()

    new_columns = []
    for col in source_columns:
        std_col = variant_to_standard.get(col.lower())
        # Renaming would collide if the target is already taken or is the
        # literal name of a different source column.
        collides = std_col in assigned or (std_col != col and std_col in present)
        if std_col is None or collides:
            new_columns.append(col)
        else:
            assigned.add(std_col)
            new_columns.append(std_col)
            mapped_cols[std_col] = col

    # Positional rename in a single projection.
    df = df.toDF(*new_columns)

    schema_cols = list(column_variants)
    available = set(new_columns)
    missing_cols = [std_col for std_col in schema_cols if std_col not in available]
    for std_col in missing_cols:
        df = df.withColumn(std_col, lit(None))

    schema_set = set(schema_cols)
    extra_cols = [col for col in new_columns if col not in schema_set]

    new_df = df.select(schema_cols)
    df_extra = df.select(schema_cols + extra_cols)

    return new_df, df_extra, extra_cols, missing_cols, mapped_cols

In [165]:
# base_dir = os.getcwd()
# file_path_excel = os.path.join(base_dir, "./../faker/messy_inventory_data.xlsx")
# file_path_csv = os.path.join(base_dir, "./../faker/messy_inventory_data.csv")

In [166]:
# excel_df = pd.read_excel(file_path_excel, engine="openpyxl")
# excel_df.to_csv(file_path_csv, index=False)

In [167]:
# df = spark.read.csv(file_path_csv, header=True, inferSchema=True)

In [168]:
# df.show(5)

# Implementing RapidFuzz

In [169]:
from rapidfuzz import fuzz, process


def rapidfuzz_column_mapping(df, missing_cols, extra_cols, mapped_cols, threshold=85):
    """
    Pair still-missing schema columns with leftover columns via RapidFuzz.

    Each missing column is compared against the remaining extra columns with
    ``fuzz.ratio``; the best candidate scoring at least ``threshold`` is
    recorded in ``mapped_cols`` and removed from both lists (all three are
    modified in place). Every recorded mapping is then applied to ``df``,
    unmatched extras are dropped and unmatched schema columns are added as nulls.

    Returns the reshaped dataframe, remaining missing columns, remaining extra
    columns and the mapping dict.
    """
    for target in list(missing_cols):
        hit = process.extractOne(
            target,
            extra_cols,
            scorer=fuzz.ratio,
            score_cutoff=threshold,
        )
        if not hit:
            continue

        source, score = hit[0], hit[1]
        print(f"RapidFuzz Mapping: {source} -> {target}: {score:.2f}")
        mapped_cols[target] = source
        missing_cols.remove(target)
        extra_cols.remove(source)

    for standard_name, current_name in mapped_cols.items():
        df = df.withColumnRenamed(current_name, standard_name)

    df = df.drop(*extra_cols)
    for absent in missing_cols:
        df = df.withColumn(absent, lit(None))

    return df, missing_cols, extra_cols, mapped_cols

# Implementing NLTK

In [170]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.metrics.distance import edit_distance
from difflib import SequenceMatcher
import re

# Fetch the NLTK data packages (already-present packages are reported as up to date).
# NOTE(review): only the wordnet corpus is used by the active cells visible here;
# confirm whether stopwords / averaged_perceptron_tagger are still needed.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/khalid_ah_1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/khalid_ah_1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/khalid_ah_1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/khalid_ah_1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/khalid_ah_1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [171]:
def preprocess_column_name(column):
    """Split a column name into lowercase word tokens.

    Non-alphanumeric characters act as separators and camelCase boundaries
    start a new token. A run of capitals is kept together as one token, so
    ``"CUSTOMER_ID"`` gives ``["customer", "id"]`` and ``"customerID"`` gives
    ``["customer", "id"]`` instead of being broken into single letters.

    Parameters
    ----------
    column : str
        Raw column name.

    Returns
    -------
    list of str
        Lowercase tokens in order of appearance; empty for a name without
        any alphanumeric character.
    """
    words = "".join(ch if ch.isalnum() else " " for ch in column).split()
    tokens = []
    for word in words:
        # First alternative: capitals not followed by a lowercase letter
        # (acronyms / ALL-CAPS words). Second: a Capitalised word.
        tokens.extend(
            filter(None, re.split(r"([A-Z]+(?![a-z])|[A-Z][a-z]*)", word))
        )
    return [token.lower() for token in tokens]

#### Implementing Jaccard Similarity

In [172]:
def jaccard_similarity(source_column, target_column):
    """Jaccard index of the two names' token sets (0 when both are empty)."""
    source_tokens = set(preprocess_column_name(source_column))
    target_tokens = set(preprocess_column_name(target_column))

    combined = source_tokens | target_tokens
    if len(combined) > 0:
        return len(source_tokens & target_tokens) / len(combined)
    return 0

#### Implementing Sequence Matcher

In [173]:
def sequence_matching(source_column, target_column):
    """difflib similarity ratio between the two names' token sequences."""
    matcher = SequenceMatcher(
        None,
        a=preprocess_column_name(source_column),
        b=preprocess_column_name(target_column),
    )
    return matcher.ratio()

#### Implementing Edit Distance

In [174]:
def editing_distance(source_column, target_column):
    """Token-level edit-distance similarity in ``[0, 1]``.

    Computed as ``1 - distance / longest`` where ``distance`` is the edit
    distance between the two token sequences and ``longest`` is the length of
    the longer sequence.

    Returns ``0.0`` when neither name yields any token (for example both are
    made only of punctuation); previously that case raised ZeroDivisionError.
    """
    source_tokens = preprocess_column_name(source_column)
    target_tokens = preprocess_column_name(target_column)

    longest = max(len(source_tokens), len(target_tokens))
    if longest == 0:
        # Nothing to compare; report "no similarity" like jaccard_similarity does.
        return 0.0
    return 1 - (edit_distance(source_tokens, target_tokens) / longest)

#### Combining them all

In [175]:
def mapping_with_combination(df, missing_cols, extra_cols, mapped_cols, threshold=0.87):
    """Map columns with a weighted blend of three token similarities.

    The score is ``0.4 * jaccard + 0.3 * sequence + 0.3 * edit``. For each
    missing schema column the extra column with the highest score strictly
    above ``threshold`` is recorded in ``mapped_cols`` and removed from both
    lists (all modified in place). Afterwards every recorded mapping is applied
    to ``df``, unmatched extras are dropped and unmatched schema columns are
    added as nulls.

    Returns ``(df, missing_cols, extra_cols, mapped_cols)``.
    """
    for wanted in list(missing_cols):
        winner, top_score = None, threshold

        for candidate in list(extra_cols):
            jaccard_part = jaccard_similarity(wanted, candidate)
            sequence_part = sequence_matching(wanted, candidate)
            edit_part = editing_distance(wanted, candidate)
            blended = 0.4 * jaccard_part + 0.3 * sequence_part + 0.3 * edit_part
            if blended > top_score:
                winner, top_score = candidate, blended

        if winner and top_score > threshold:
            print(f"NLTK Mapping: {winner} -> {wanted}: {top_score:.2f}")
            mapped_cols[wanted] = winner
            missing_cols.remove(wanted)
            extra_cols.remove(winner)

    for standard_name, current_name in mapped_cols.items():
        df = df.withColumnRenamed(current_name, standard_name)

    df = df.drop(*extra_cols)

    for absent in missing_cols:
        df = df.withColumn(absent, lit(None))

    return df, missing_cols, extra_cols, mapped_cols

#### Wordnet Mapping

In [176]:
# def preprocess_column_name(column):
#     name = re.sub("([A-Z][a-z]+)", r" \1", column)
#     name = re.sub("_", " ", name)
#     tokens = word_tokenize(name.lower())
#     return [token for token in tokens if token.isalpha()]

In [177]:
def get_wordnet_synsets(word):
    """Return whatever ``wordnet.synsets`` yields for ``word``."""
    synsets = wordnet.synsets(word)
    return synsets

In [178]:
def calculate_semantic_similarity(missing_col, extra_col):
    """WordNet path similarity between two column names.

    Both names are tokenized. For every token of ``missing_col`` that has
    synsets, the best ``path_similarity`` against any synset of any token of
    ``extra_col`` is taken, and those per-token maxima are averaged.

    Returns ``0.0`` when either name has no tokens or no pair of synsets is
    comparable.
    """
    missing_tokens = preprocess_column_name(missing_col)
    extra_tokens = preprocess_column_name(extra_col)
    if not missing_tokens or not extra_tokens:
        return 0.0

    # Look the candidate's synsets up once rather than once per schema token.
    extra_synsets = [
        synsets
        for synsets in (get_wordnet_synsets(token) for token in extra_tokens)
        if synsets
    ]

    best_per_token = []
    for token in missing_tokens:
        token_synsets = get_wordnet_synsets(token)
        if not token_synsets:
            continue

        token_best = None
        for candidate_synsets in extra_synsets:
            for s1 in token_synsets:
                for s2 in candidate_synsets:
                    # Evaluate each pair a single time; None means "not comparable".
                    score = s1.path_similarity(s2)
                    if score is not None and (token_best is None or score > token_best):
                        token_best = score

        if token_best is not None:
            best_per_token.append(token_best)

    return sum(best_per_token) / len(best_per_token) if best_per_token else 0.0

In [179]:
def semantic_column_mapping(df, missing_cols, extra_cols, mapped_cols, threshold=0.6):
    """Map columns by WordNet semantic similarity.

    For each missing schema column the extra column whose
    ``calculate_semantic_similarity`` score is highest and strictly above
    ``threshold`` is recorded in ``mapped_cols`` and removed from both lists
    (all modified in place). All recorded mappings are then applied to ``df``,
    unmatched extras are dropped and unmatched schema columns are added as nulls.

    Returns ``(df, missing_cols, extra_cols, mapped_cols)``.
    """
    for wanted in list(missing_cols):
        winner, top_score = None, threshold

        for candidate in list(extra_cols):
            score = calculate_semantic_similarity(wanted, candidate)
            if score > top_score:
                winner, top_score = candidate, score

        if not winner:
            continue

        print(f"Wordnet Mapping: {winner} -> {wanted}: {top_score:.2f}")
        mapped_cols[wanted] = winner
        missing_cols.remove(wanted)
        extra_cols.remove(winner)

    for standard_name, current_name in mapped_cols.items():
        df = df.withColumnRenamed(current_name, standard_name)

    df = df.drop(*extra_cols)

    for absent in missing_cols:
        df = df.withColumn(absent, lit(None))

    return df, missing_cols, extra_cols, mapped_cols

# Implementing spaCy

In [180]:
import spacy

# Loaded spaCy pipelines, keyed by model name, so repeated mapping calls do not
# reload the model from disk each time.
_SPACY_MODELS = {}


def _load_spacy_model(name):
    """Return the spaCy pipeline ``name``, loading it only on first use."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def spacy_column_mapping(
    df,
    missing_cols,
    extra_cols,
    mapped_cols,
    threshold=0.87
):
    """Map columns by spaCy document similarity (``en_core_web_md``).

    Column names are tokenized, joined with spaces and parsed; for each missing
    schema column the extra column with the highest ``Doc.similarity`` strictly
    above ``threshold`` is recorded in ``mapped_cols`` and removed from both
    lists (all modified in place). All recorded mappings are then applied to
    ``df``, unmatched extras are dropped and unmatched schema columns are added
    as nulls.

    Returns ``(df, missing_cols, extra_cols, mapped_cols)``.
    """
    nlp = _load_spacy_model("en_core_web_md")

    # Parse every candidate once instead of once per missing column.
    extra_docs = {
        extra_col: nlp(" ".join(preprocess_column_name(extra_col)))
        for extra_col in extra_cols
    }

    for missing_col in missing_cols[:]:
        best_match = None
        best_score = threshold
        missing_doc = nlp(" ".join(preprocess_column_name(missing_col)))

        for extra_col in extra_cols:
            similarity = missing_doc.similarity(extra_docs[extra_col])

            if similarity > best_score:
                best_score = similarity
                best_match = extra_col

        if best_match:
            print(f"spaCy Mapping: {best_match} -> {missing_col}: {best_score:.2f}")
            mapped_cols[missing_col] = best_match
            missing_cols.remove(missing_col)
            extra_cols.remove(best_match)

    for new_col, old_col in mapped_cols.items():
        df = df.withColumnRenamed(old_col, new_col)
    df = df.drop(*extra_cols)
    for col in missing_cols:
        df = df.withColumn(col, lit(None))
    return df, missing_cols, extra_cols, mapped_cols

# Implementing Word2Vec

In [181]:
import numpy as np
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import re


def load_word2vec_model(df, extra_df):
    """Return word vectors for column-name similarity.

    Tries to load the pretrained ``GoogleNews-vectors-negative300.bin`` file
    from the working directory. If that fails for any ordinary reason (file
    missing, unreadable, ...), a small Word2Vec model is trained on the tokens
    of the column names of ``df`` and ``extra_df`` instead.

    Returns an object indexable by word (``model[word]``).
    """
    all_columns = list(set(df.columns + extra_df.columns))

    try:
        model = KeyedVectors.load_word2vec_format(
            "GoogleNews-vectors-negative300.bin", binary=True
        )
    except Exception as exc:
        # ``Exception`` rather than a bare except so Ctrl-C / SystemExit still
        # propagate; the fallback is announced because the two models differ.
        print(f"Pretrained Word2Vec vectors unavailable ({exc}); training on column names instead.")
        sentences = [preprocess_column_name(col) for col in all_columns]
        model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
        model = model.wv
    return model


def calculate_word2vec_similarity(col1, col2, model):
    """Cosine similarity between the averaged word vectors of two column names.

    Parameters
    ----------
    col1, col2 : str
        Column names; they are tokenized before lookup.
    model : mapping-like
        Supports ``model[word]`` and raises ``KeyError`` for unknown words.

    Returns
    -------
    float
        Cosine similarity, or ``0.0`` when either name has no token known to
        the model or when an averaged vector has zero length (which would
        otherwise produce NaN).
    """
    words1 = preprocess_column_name(col1)
    words2 = preprocess_column_name(col2)

    if not words1 or not words2:
        return 0.0

    def known_vectors(words):
        # Collect vectors for the words the model knows; skip the rest.
        vectors = []
        for word in words:
            try:
                vectors.append(model[word])
            except KeyError:
                continue
        return vectors

    vec1 = known_vectors(words1)
    vec2 = known_vectors(words2)

    if not vec1 or not vec2:
        return 0.0

    vec1_avg = np.mean(vec1, axis=0)
    vec2_avg = np.mean(vec2, axis=0)

    denominator = np.linalg.norm(vec1_avg) * np.linalg.norm(vec2_avg)
    if denominator == 0:
        return 0.0

    return float(np.dot(vec1_avg, vec2_avg) / denominator)


def word2vec_column_mapping(df, extra_df, missing_cols, extra_cols, mapped_cols, threshold=0.87):
    """Map columns by Word2Vec cosine similarity.

    A model is obtained from ``load_word2vec_model(df, extra_df)``. For each
    missing schema column the extra column with the highest similarity strictly
    above ``threshold`` is recorded in ``mapped_cols`` and removed from both
    lists (all modified in place). All recorded mappings are then applied to
    ``df``, unmatched extras are dropped and unmatched schema columns are added
    as nulls.

    Returns ``(df, missing_cols, extra_cols, mapped_cols)``.
    """
    vectors = load_word2vec_model(df, extra_df)

    for wanted in list(missing_cols):
        winner, top_score = None, threshold

        for candidate in list(extra_cols):
            score = calculate_word2vec_similarity(wanted, candidate, vectors)
            if score > top_score:
                winner, top_score = candidate, score

        if not winner:
            continue

        print(f"Word2Vec Mapping: {winner} -> {wanted}: {top_score:.2f}")
        mapped_cols[wanted] = winner
        missing_cols.remove(wanted)
        extra_cols.remove(winner)

    for standard_name, current_name in mapped_cols.items():
        df = df.withColumnRenamed(current_name, standard_name)

    df = df.drop(*extra_cols)

    for absent in missing_cols:
        df = df.withColumn(absent, lit(None))

    return df, missing_cols, extra_cols, mapped_cols

# Implementing Hugging Face Sentence-Transformers (MiniLM)

In [182]:
from sentence_transformers import SentenceTransformer, util
import torch

# Sentence-embedding model used by roberta_similarity.
# NOTE(review): the checkpoint name says MiniLM, not RoBERTa; the variable name
# is kept because roberta_similarity references it.
roberta = SentenceTransformer("all-MiniLM-L12-v2")

In [183]:
def roberta_similarity(df, missing_cols, extra_cols, mapped_cols, threshold=0.87):
    """Map columns by sentence-embedding cosine similarity.

    Missing and extra column names are embedded with the module-level
    ``roberta`` model. For each missing schema column the not-yet-used extra
    column with the highest cosine similarity is accepted when its score is at
    least ``threshold``; accepted pairs are recorded in ``mapped_cols`` and
    removed from ``missing_cols`` / ``extra_cols`` (all modified in place).
    All recorded mappings are then applied to ``df``, unmatched extras are
    dropped and unmatched schema columns are added as nulls.

    Returns ``(df, missing_cols, extra_cols, mapped_cols)``.
    """
    # Nothing to embed or compare when either side is empty.
    if missing_cols and extra_cols:
        # Work on snapshots: the embedding rows are indexed by position, so the
        # name lists they were built from must not change while we iterate.
        targets = list(missing_cols)
        candidates = list(extra_cols)

        target_embeddings = roberta.encode(targets, convert_to_tensor=True)
        candidate_embeddings = roberta.encode(candidates, convert_to_tensor=True)
        scores = util.cos_sim(target_embeddings, candidate_embeddings).tolist()

        used = set()  # candidate positions already assigned to a schema column
        for row, missing_col in enumerate(targets):
            free = [idx for idx in range(len(candidates)) if idx not in used]
            if not free:
                break

            best_idx = max(free, key=scores[row].__getitem__)
            best_score = scores[row][best_idx]
            best_match = candidates[best_idx]

            if best_score >= threshold:
                print(
                    f"BERT Mapping: {best_match} -> {missing_col} (score={best_score:.2f})"
                )
                mapped_cols[missing_col] = best_match
                used.add(best_idx)
                missing_cols.remove(missing_col)
                extra_cols.remove(best_match)

    for schema_col, df_col in mapped_cols.items():
        df = df.withColumnRenamed(df_col, schema_col)
    df = df.drop(*extra_cols)
    for col in missing_cols:
        df = df.withColumn(col, lit(None))
    return df, missing_cols, extra_cols, mapped_cols

# Implementing GPT

In [184]:
# import json
# from openai import OpenAI

# load_dotenv()

# client = OpenAI(
#     base_url="https://router.huggingface.co/v1",
#     api_key=os.getenv("HF_TOKEN"),
# )

In [185]:
def make_json_safe(data):
    """Recursively convert all values in a dict or list to JSON-safe types.

    Dicts and lists are rebuilt element by element; every other value is
    passed through ``safe_serialize``.
    """
    if isinstance(data, dict):
        return {key: make_json_safe(value) for key, value in data.items()}
    if isinstance(data, list):
        return list(map(make_json_safe, data))
    return safe_serialize(data)

In [194]:
import json
import requests
from pyspark.sql.functions import lit

FREEGPT4_API_URL = (
    "http://localhost:5500/v1/chat/completions"  # adjust if hosted elsewhere
)


def gpt_schema_mapping(df, missing_cols, extra_cols, mapped_cols):
    """Ask the chat-completions endpoint to map leftover columns to the schema.

    A prompt containing the missing columns, extra columns, mappings found so
    far and five sample rows is posted to ``FREEGPT4_API_URL``. The reply must
    be JSON with a ``mapped_cols`` object of ``{schema_col: df_col}``.

    Only proposals that pair a currently-missing schema column with a
    currently-extra dataframe column (each extra used at most once) are
    accepted; anything else is reported and ignored, and the remaining lists
    are recomputed locally instead of trusting the reply. If the reply cannot
    be parsed, no new mapping is added.

    In every case the accumulated ``mapped_cols`` are applied to ``df``,
    unmatched extras are dropped and unmatched schema columns are added as
    nulls, matching what the other mapping stages return.

    Parameters
    ----------
    df : DataFrame
        Source Spark DataFrame.
    missing_cols, extra_cols : list of str
        Not modified; updated copies are returned.
    mapped_cols : dict
        Updated in place with the accepted mappings.

    Returns
    -------
    tuple
        ``(df, missing_cols, extra_cols, mapped_cols)``.
    """
    pdf = df.limit(5).toPandas()

    def to_json_safe(obj):
        # Keep JSON-native values, turn NaN into null, stringify anything else.
        if isinstance(obj, dict):
            return {str(k): to_json_safe(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [to_json_safe(v) for v in obj]
        if isinstance(obj, float) and obj != obj:
            return None
        if isinstance(obj, (int, float, str)) or obj is None:
            return obj
        return str(obj)

    schema_info = to_json_safe(
        {
            "missing_cols": missing_cols,
            "extra_cols": extra_cols,
            "already_mapped": mapped_cols,
            "sample_data": pdf.to_dict(orient="list"),
        }
    )

    prompt = f"""
        You are a data engineer helping with schema alignment.

        I have a dataset with extra columns and some missing schema columns.
        Please map extra columns to missing schema columns based on semantics and sample data.

        Input (JSON):
        {json.dumps(schema_info, indent=2)}

        Return ONLY valid JSON with this exact structure:
        {{
        "mapped_cols": {{"schema_col": "df_col", ...}},
        "remaining_missing_cols": [],
        "remaining_extra_cols": []
        }}
    """

    # Call FreeGPT4-WEB-API instead of OpenAI
    # NOTE(review): no timeout is set, so a stalled server blocks this cell
    # indefinitely - consider re-enabling one.
    response = requests.post(
        FREEGPT4_API_URL,
        json={
            "model": "gpt-4.0",  # adjust model name based on what FreeGPT4 API supports
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0,
        },
        # timeout=60,
    )

    accepted = {}
    try:
        payload = response.json()
        print("Full response:", payload)

        result_text = payload["choices"][0]["message"]["content"].strip()
        if result_text.startswith("```"):
            # Tolerate a reply wrapped in a Markdown code fence.
            result_text = result_text.strip("`").strip()
            if result_text[:4].lower() == "json":
                result_text = result_text[4:]
        model_mapping = json.loads(result_text)

        claimed = set()
        for schema_col, df_col in model_mapping["mapped_cols"].items():
            if (
                schema_col in missing_cols
                and df_col in extra_cols
                and df_col not in claimed
            ):
                accepted[schema_col] = df_col
                claimed.add(df_col)
            else:
                print(f"Ignoring invalid model mapping: {df_col} -> {schema_col}")
    except Exception as e:
        print("Error parsing model output:", e)
        accepted = {}

    # Commit only after the whole reply was validated, so a failure part-way
    # through never leaves mapped_cols half-updated.
    mapped_cols.update(accepted)
    used = set(accepted.values())
    missing_cols = [col for col in missing_cols if col not in accepted]
    extra_cols = [col for col in extra_cols if col not in used]

    # rename + drop + add cols in df
    for schema_col, df_col in mapped_cols.items():
        df = df.withColumnRenamed(df_col, schema_col)
    df = df.drop(*extra_cols)
    for col in missing_cols:
        df = df.withColumn(col, lit(None))

    return df, missing_cols, extra_cols, mapped_cols

In [195]:
# def gptoss_schema_mapping(df, missing_cols, extra_cols, mapped_cols):
#     pdf = df.limit(5).toPandas()

#     schema_info = {
#         "missing_cols": missing_cols,
#         "extra_cols": extra_cols,
#         "already_mapped": mapped_cols,
#         "sample_data": pdf.to_dict(orient="list"),
#     }
    
#     schema_info = make_json_safe(schema_info)

#     prompt = f"""
#         You are a data engineer helping with schema alignment.

#         I have a dataset with extra columns and some missing schema columns.
#         Please map extra columns to missing schema columns based on semantics and sample data.

#         Input (JSON):
#         {json.dumps(schema_info, indent=2)}

#         Return ONLY valid JSON with this exact structure:
#         {{
#         "mapped_cols": {{"schema_col": "df_col", ...}},
#         "remaining_missing_cols": [],
#         "remaining_extra_cols": []
#         }}
#     """

#     response = client.responses.create(
#         model="openai/gpt-oss-20b",
#         input=prompt,
#     )
#     print("Full response:", response)

#     result = response.output_text.strip()
#     try:
#         model_mapping = json.loads(result)

#         mapped_cols.update(model_mapping["mapped_cols"])
#         missing_cols = model_mapping["remaining_missing_cols"]
#         extra_cols = model_mapping["remaining_extra_cols"]

#         for schema_col, df_col in model_mapping["mapped_cols"].items():
#             df = df.withColumnRenamed(df_col, schema_col)
#         df = df.drop(*extra_cols)
#         for col in missing_cols:
#             df = df.withColumn(col, lit(None))

#     except Exception as e:
#         print("Error parsing model output:", e)
#         model_mapping = {}

#     return df, missing_cols, extra_cols, mapped_cols

# Mapping Function

In [196]:
def mapping(df, column_variants, mapped):
    """Run the column-mapping cascade until no schema column is missing.

    ``normalize_dataframe`` is applied first. While schema columns are still
    missing, the stages run in this order, each receiving the original ``df``
    together with the current missing / extra / mapped state: RapidFuzz,
    NLTK combination, WordNet, spaCy, Word2Vec, sentence embeddings, GPT.

    Returns ``(new_df, extra_df, extra_cols, missing_cols, mapped_cols)`` where
    ``extra_df`` is the one produced by the initial normalization.
    """
    new_df, extra_df, extra_cols, missing_cols, mapped_cols = normalize_dataframe(
        df, column_variants, mapped
    )

    # Each entry: banner for the step that just finished, whether to print the
    # current columns, announcement for the next step, and a lazily-resolved
    # runner for that step.
    stages = (
        (
            "\nAfter Initial Normalization:",
            True,
            "Implementing RapidFuzz Mapping...",
            lambda m, e, c: rapidfuzz_column_mapping(df, m, e, c, threshold=70),
        ),
        (
            "\nAfter RapidFuzz Mapping:",
            True,
            "Implementing NLTK Combination Mapping...",
            lambda m, e, c: mapping_with_combination(df, m, e, c, threshold=0.7),
        ),
        (
            "\nAfter NLTK Combination Mapping:",
            True,
            "Implementing WordNet Semantic Mapping...",
            lambda m, e, c: semantic_column_mapping(df, m, e, c, threshold=0.7),
        ),
        (
            "\nAfter WordNet Semantic Mapping:",
            True,
            "Implementing spaCy Mapping...",
            lambda m, e, c: spacy_column_mapping(df, m, e, c, threshold=0.7),
        ),
        (
            "\nAfter spaCy Mapping:",
            False,
            "Implementing Word2Vec Mapping...",
            lambda m, e, c: word2vec_column_mapping(df, extra_df, m, e, c),
        ),
        (
            "\nAfter Word2Vec Mapping:",
            True,
            "Implementing BERT Mapping...",
            lambda m, e, c: roberta_similarity(df, m, e, c, threshold=0.7),
        ),
        (
            "\nAfter roBERTa Mapping:",
            True,
            "Implementing GPT Mapping...",
            lambda m, e, c: gpt_schema_mapping(df, m, e, c),
        ),
    )

    for banner, show_columns, announcement, run_stage in stages:
        if not missing_cols:
            continue

        print(banner)
        print(f"Missing columns: {missing_cols}")
        if show_columns:
            print(new_df.columns)
        print(announcement)
        new_df, missing_cols, extra_cols, mapped_cols = run_stage(
            missing_cols, extra_cols, mapped_cols
        )

    return new_df, extra_df, extra_cols, missing_cols, mapped_cols

# Processing & Mapping

In [197]:
import datetime

def safe_serialize(obj):
    """Convert a single value to something ``json`` can encode.

    Dates and datetimes become ISO-8601 strings, sets become lists, and every
    other value is converted with ``str``.
    """
    temporal_types = (datetime.date, datetime.datetime)
    if isinstance(obj, temporal_types):
        return obj.isoformat()
    return list(obj) if isinstance(obj, set) else str(obj)

In [198]:
def detect_table(df, columns_info, threshold=0.5):
    """Guess which schema table a dataframe belongs to from its column names.

    Parameters
    ----------
    df : object with a ``columns`` attribute
        Dataframe whose columns are compared against the schema.
    columns_info : iterable of (table, column, data_type)
        Schema description.
    threshold : float
        Minimum share of the dataframe's columns that must belong to the
        best-scoring table.

    Returns
    -------
    str or None
        The table sharing the most column names with ``df`` when that overlap,
        divided by the number of dataframe columns, reaches ``threshold``;
        otherwise ``None``. Also ``None`` when ``columns_info`` is empty.
    """
    df_cols = set(df.columns)
    table_scores = {}

    for table, col, _ in columns_info:
        hits = table_scores.get(table, 0)
        table_scores[table] = hits + 1 if col in df_cols else hits

    if not table_scores:
        # No schema to compare against; max() on an empty dict would raise.
        return None

    best_table = max(table_scores, key=table_scores.get)
    overlap_ratio = table_scores[best_table] / max(1, len(df_cols))

    if overlap_ratio >= threshold:
        return best_table
    return None


In [199]:
def split_unified_dataframe(df, columns_info):
    """
    Splits a unified dataframe into per-table sub-dataframes
    based on the strict rule that columns are named `table__column`.

    Parameters
    ----------
    df : DataFrame
        Unified dataframe; every column must be named ``table__column``.
    columns_info : iterable of (table, column, data_type)
        Schema used to validate table and column names.

    Returns
    -------
    dict
        ``{table: sub_dataframe}`` with the ``table__`` prefix removed from the
        column names.

    Raises
    ------
    ValueError
        If a column lacks the ``__`` separator, names an unknown table, or
        names a column that table does not have.
    """

    # Build canonical column set per table for validation (single pass).
    canonical_schema = {}
    for table, col, _ in columns_info:
        canonical_schema.setdefault(table, set()).add(col)

    table_to_cols = {}

    for col in df.columns:
        if "__" not in col:
            raise ValueError(
                f"❌ Invalid column '{col}'. All columns must follow 'table__column' format."
            )

        # Split on the first separator only, so column names may contain "__".
        table, column = col.split("__", 1)

        if table not in canonical_schema:
            raise ValueError(f"❌ Unknown table '{table}' in column '{col}'.")

        if column not in canonical_schema[table]:
            raise ValueError(
                f"❌ Unknown column '{column}' for table '{table}' (from '{col}')."
            )

        table_to_cols.setdefault(table, []).append((col, column))

    # Build sub-dataframes
    split_dfs = {}
    for table, col_pairs in table_to_cols.items():
        renamed = [df[c].alias(new_c) for c, new_c in col_pairs]
        split_dfs[table] = df.select(*renamed)

    return split_dfs

In [None]:
def process_all_dataframes(all_dataframes, columns_info, mapping_list, mode="batch"):
    """
    Unified processor for schema mapping.

    Supports:
    1. Single unified file (all tables in one DataFrame).
    2. Multiple files (one or more tables per file).
    3. Streaming (micro-batches, API, DB source).

    Parameters
    ----------
    all_dataframes : dict
        Dict of {df_name: dataframe} from files, unified table, or stream.
    columns_info : list
        List of (table_name, column_name, data_type) from PostgreSQL schema.
    mapping_list : object
        Object containing mapping_dict_<table> for each table. Tables with
        no such attribute (or an empty one) are skipped with a warning.
    mode : str
        "batch" for files (single or multiple), "stream" for streaming.

    Returns
    -------
    results : dict
        Dict keyed by canonical table name. Each value holds ``table_name``,
        ``final_df``, ``extra_df``, ``extra_cols``, ``missing_cols`` and
        ``mapped_cols``. If several incoming dataframes resolve to the same
        table, the last one processed overwrites the earlier entry.
    """
    results = {}

    for df_name, df in all_dataframes.items():
        print(f"\n{'='*50}")
        print(f"Incoming dataframe: {df_name}")

        # Step 1: decide which canonical table(s) this dataframe feeds.
        if mode == "stream":
            # Streaming input carries no reliable name; infer the table from columns.
            detected_table = detect_table(df, columns_info)
            if not detected_table:
                print(f"⚠️ Could not detect schema for streaming dataframe {df_name}")
                continue
            split_dfs = {detected_table: df}
        elif df_name in df_to_table:
            # Known per-table dataframe name -> direct lookup.
            split_dfs = {df_to_table[df_name]: df}
        else:
            # Otherwise treat it as a unified frame with `table__column` names.
            split_dfs = split_unified_dataframe(df, columns_info)

        # Step 2: map each sub-dataframe onto its canonical schema.
        for table_name, sub_df in split_dfs.items():
            # Resolved lazily so a missing mapping only skips that table
            # instead of failing the whole run.
            mapping_dict = getattr(mapping_list, f"mapping_dict_{table_name}", None)
            if not mapping_dict:
                print(f"⚠️ No mapping dict found for {table_name}. Skipping.")
                continue

            # Canonical column -> "" placeholder, passed to mapping() as its third argument.
            mapped = {col: "" for t, col, _ in columns_info if t == table_name}
            print(f"Processing → {table_name} with {len(mapped)} canonical columns")

            # Errors from mapping() propagate to the caller (no swallowing).
            final_df, extra_df, extra_cols, missing_cols, mapped_cols = mapping(
                sub_df, mapping_dict, mapped
            )

            # Dataframes are stored as returned; column lists are sanitized
            # through safe_serialize before being stored.
            results[table_name] = {
                "table_name": table_name,
                "final_df": final_df,
                "extra_df": extra_df,
                "extra_cols": [safe_serialize(c) for c in extra_cols],
                "missing_cols": [safe_serialize(c) for c in missing_cols],
                "mapped_cols": [safe_serialize(c) for c in mapped_cols],
            }

            print(f"✅ Completed {df_name} → {table_name}")
            print(f"   Missing cols: {missing_cols}")
            print(f"   Extra cols: {extra_cols}")
            print("   Preview:")
            final_df.show(3)

    return results


In [201]:
# Run the schema-mapping pipeline over every loaded dataframe (mode defaults to "batch").
results = process_all_dataframes(all_dataframes, columns_info, mapping_list)


Incoming dataframe: addresses_df
Processing → addresses with 9 canonical columns

After Initial Normalization:
Missing columns: ['address_type', 'state_province', 'postal_code']
['address_id', 'customer_id', 'address_type', 'city', 'state_province', 'postal_code', 'country', 'is_default', 'created_at']
Implementing RapidFuzz Mapping...
RapidFuzz Mapping: address_category -> address_type: 71.43

After RapidFuzz Mapping:
Missing columns: ['state_province', 'postal_code']
['address_id', 'customer_id', 'address_type', 'city', 'country', 'is_default', 'created_at', 'state_province', 'postal_code']
Implementing NLTK Combination Mapping...

After NLTK Combination Mapping:
Missing columns: ['state_province', 'postal_code']
['address_id', 'customer_id', 'address_type', 'city', 'country', 'is_default', 'created_at', 'state_province', 'postal_code']
Implementing WordNet Semantic Mapping...
Wordnet Mapping: state_region -> state_province: 1.00
Wordnet Mapping: zip_postal -> postal_code: 0.75
✅ Co

In [None]:
def save_dataframes_to_minio(results, client, bucket_name):
    """Upload the mapped dataframe of every entry in ``results`` to MinIO as CSV.

    Parameters
    ----------
    results : dict
        Mapping whose values are dicts with at least ``'table_name'`` (used as
        the object name) and ``'final_df'`` (an object with ``toPandas()``).
    client : MinIO client
        Client used for both the bucket check and the uploads, so everything
        goes to the same server.
    bucket_name : str
        Target bucket; created if it does not exist. Falls back to
        ``'mapped'`` when empty or ``None``.

    Raises
    ------
    ValueError
        If ``client`` is ``None``.
    """
    if client is None:
        raise ValueError("A MinIO client is required to save dataframes.")
    bucket_name = bucket_name or 'mapped'

    if not client.bucket_exists(bucket_name):
        client.make_bucket(bucket_name)
        print(f"Created bucket: {bucket_name}")
    else:
        print(f"Bucket already exists: {bucket_name}")

    for result_data in results.values():
        table_name = result_data['table_name']
        final_df = result_data['final_df']

        print(f"Saving {table_name} to MinIO...")

        # Convert Spark DataFrame to Pandas (collects all rows to the driver).
        pdf = final_df.toPandas()

        # Object name is the bare table name, as before (no extension added).
        file_name = table_name

        # The context manager releases the buffer even if the upload fails.
        with BytesIO() as csv_buffer:
            pdf.to_csv(csv_buffer, index=False)
            csv_buffer.seek(0)

            client.put_object(
                bucket_name,
                file_name,
                csv_buffer,
                length=len(csv_buffer.getvalue()),
                content_type='text/csv'
            )

        print(f"✅ Saved {file_name} ({len(pdf)} rows)")


# save_dataframes_to_minio(results, client, bucket_name)

In [203]:
# Inspect the full results dict.
# NOTE(review): the captured output shows keys such as 'addresses_df__addresses',
# while process_all_dataframes keys results by table name only — presumably this
# output was saved from an earlier version; re-run to refresh.
results

{'addresses_df__addresses': {'table_name': 'addresses',
  'final_df': DataFrame[address_id: string, customer_id: string, address_type: string, city: string, state_province: string, postal_code: string, country: string, is_default: string, created_at: string],
  'extra_df': DataFrame[address_id: string, customer_id: string, address_type: void, city: string, state_province: void, postal_code: void, country: string, is_default: string, created_at: string, address_category: string, street_line1: string, state_region: string, zip_postal: string, latitude: string, longitude: string, street_line2: string, address_verified: string, last_modified: string],
  'extra_cols': ['street_line1',
   'latitude',
   'longitude',
   'street_line2',
   'address_verified',
   'last_modified'],
  'missing_cols': [],
  'mapped_cols': ['is_default',
   'address_created_at',
   'address_id',
   'customer_id',
   'address_type',
   'city',
   'state_province',
   'postal_code',
   'country',
   'created_at']},
 

In [None]:
# Canonical `customers` columns that were left unmapped (empty list = all mapped).
results["customers"]["missing_cols"]

[]

In [None]:
# Preview the first 5 rows of the mapped `customers` dataframe.
results["customers"]["final_df"].show(5)


+-----------+------------------+-------------+------+-------------------+--------------------+---------------+-------------------+----------------+--------------------+
|customer_id|     customer_name|customer_type|gender|      date_of_birth|   registration_date|customer_status|acquisition_channel|customer_segment|          created_at|
+-----------+------------------+-------------+------+-------------------+--------------------+---------------+-------------------+----------------+--------------------+
|      10369|   Cynthia Gregory|          B2C|  Male|1971-10-04 00:00:00|2022-11-11 11:16:...|       Inactive|           LinkedIn|      High Value|2024-12-21 08:06:...|
|      10078|   Ms Karen Turner|       Guest*|Female|1994-04-30 00:00:00|2025-05-16 17:36:...|       Inactive|            Twitter|Occasional Buyer|2025-07-26 11:31:...|
|      10299|     Maureen Lewis|          B2C| Other|1996-07-11 00:00:00|2022-12-17 23:00:...|        Blocked|           Referral|         Churned|2025-05-