In [24]:
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
import torch
from sentence_transformers import SentenceTransformer
from typing import List, Optional
from dataclasses import dataclass

import teradatasql
from sqlalchemy import create_engine
from dotenv import load_dotenv

# Make sure the output directory used for saved results exists (no error if it is already there).
os.makedirs('../results', exist_ok=True)


import sys
# Put the parent directory on the module search path before the project-local imports that follow.
# NOTE(review): presumably `models`, `utils` and `constants` live in the parent directory — confirm.
sys.path.append('..')
from models import SentenceEmbeddingModel, SentenceEmbeddingConfig
from utils import load_embedding_model
from constants import (
    CLEANED_TEST_DATA_PATH,
    ENCODED_TEST_DATA_PATH,
    CLEANED_TRAIN_DATA_PATH
)




In [12]:
# NOTE: everything in this cell is commented out, so running it defines nothing.
# The names SentenceEmbeddingConfig, SentenceEmbeddingModel and load_embedding_model
# used by the rest of the notebook are imported from `models` and `utils` in the imports cell.
# NOTE(review): presumably kept as an inline reference copy — confirm it still matches the
# imported implementations, or delete the cell.

# @dataclass
# class SentenceEmbeddingConfig:
#     device: str
#     dtype: str
#     model_id: str
#     truncate_dim: Optional[int]
#     convert_to_numpy: bool
#     convert_to_tensor: bool

# class SentenceEmbeddingModel:
#     def __init__(self, config: SentenceEmbeddingConfig):
#         super().__init__()
#         self.config = config
#         self.model_id = config.model_id
#         self.device = config.device
#         self.dtype = config.dtype
#         self.truncate_dim = config.truncate_dim

#         self.model = SentenceTransformer(
#             self.model_id,
#             device=self.device,
#             truncate_dim=self.truncate_dim,
#             model_kwargs={"torch_dtype": getattr(torch, self.dtype) if isinstance(self.dtype, str) else self.dtype}
#         )

#     def get_embeddings(self, texts: List[str], prompt_name: Optional[str] = None):
#         embeddings = self.model.encode(
#             texts, 
#             prompt_name=prompt_name, 
#             convert_to_numpy=self.config.convert_to_numpy,
#             convert_to_tensor=self.config.convert_to_tensor
#         )
#         return embeddings

# def load_embedding_model(config_path: str):
#     with open(config_path, "r") as f:
#         config_dict = json.load(f)
    
#     try:
#         config = SentenceEmbeddingConfig(**config_dict)
#     except TypeError as e:
#         raise ValueError(f"Invalid configuration keys: {e}.")

#     model = SentenceEmbeddingModel(config)
#     return model

In [13]:
# Path to the JSON config file handed to load_embedding_model.
config_path = "../config/e5_large_instruct_config.json"

print("Loading E5 Large model...")
# load_embedding_model is imported from the project's `utils` module; the object it
# returns exposes a `device` attribute, which is reported in the message below.
embedding_model = load_embedding_model(config_path)
print(f"E5 Large model loaded successfully on {embedding_model.device}!")

Loading E5 Large model...
E5 Large model loaded successfully on cuda!


In [20]:
# Load the cleaned test set and collect every distinct class label once.
df = pd.read_csv('../data/cleaned_test.csv')
unique_classes = df['class'].unique().tolist()

# Item texts are encoded with the "query" prompt; class labels are encoded without a prompt.
item_embeddings = embedding_model.get_embeddings(df['cleaned_text'].tolist(), "query")
class_embeddings = embedding_model.get_embeddings(unique_classes)

# Turn both results into nested Python lists so they can be stored in the CSV.
# assumes get_embeddings returned torch tensors here (.float()/.cpu() are called) — TODO confirm against config
item_emb_list = item_embeddings.float().cpu().numpy().tolist()
class_emb_list = class_embeddings.float().cpu().numpy().tolist()

# Lookup table: class label -> embedding of that label.
class_emb_dict = {label: vector for label, vector in zip(unique_classes, class_emb_list)}

# Per-row columns: the item's own embedding and the embedding of the row's class.
df['item_embedding'] = item_emb_list
df['class_embedding'] = [class_emb_dict[label] for label in df['class']]

# The two "unique class" columns start out empty on every row ...
df['unique_classes'] = None
df['unique_classes_embeddings'] = None

# ... and only the first N rows (N = number of unique classes) receive a label plus
# its embedding, the embedding being stored as a string for CSV storage.
for i, (cls, emb) in enumerate(zip(unique_classes, class_emb_list)):
    if i >= len(df):
        continue
    df.at[i, 'unique_classes'] = cls
    df.at[i, 'unique_classes_embeddings'] = str(emb)

df.to_csv('../results/embeddings.csv', index=False)
print(f"Embeddings saved with {len(df)} rows and {len(unique_classes)} unique classes")

Embeddings saved with 4546 rows and 32 unique classes


In [25]:
# Load ../.env with python-dotenv, then read the four Teradata settings in one pass.
# os.getenv yields None for any variable that is not set.
load_dotenv('../.env')

TD_HOST, TD_USER, TD_PASS, TD_DB = map(
    os.getenv, ('TD_HOST', 'TD_USER', 'TD_PASS', 'TD_DB')
)

In [29]:
# Open a Teradata connection from the TD_* settings read from the environment.
# NOTE(review): no matching conn.close() is visible in this part of the notebook — confirm it is closed later.
conn = teradatasql.connect(
    host=TD_HOST,
    user=TD_USER,
    password=TD_PASS,
    database=TD_DB
)

print("Successfully connected to Teradata!")

Successfully connected to Teradata!


In [35]:
# List the tables (TableKind = 'T') of the configured database, ordered by name.
# The database name is bound through a `?` parameter marker rather than being
# interpolated into the SQL text, so a value containing a quote character cannot
# change the statement.
query = "SELECT TableName FROM DBC.TablesV WHERE DatabaseName = ? AND TableKind = 'T' ORDER BY TableName"

# The cursor is deliberately left open: the following cell reuses it.
cursor = conn.cursor()
cursor.execute(query, [TD_DB])

print(f"\nTables in {TD_DB}:")
print("-" * 30)
# Each fetched row is a one-element sequence holding the table name; number them from 1.
for i, (table_name,) in enumerate(cursor.fetchall(), 1):
    print(f"{i}. {table_name}")


Tables in DEMO_USER:
------------------------------
1. gpc_labels_fc
2. train_data
3. train_embeddings_fc


In [36]:
# Select a single row so cursor.description is populated, then keep the first
# field (the column name) of every description entry.
cursor.execute(f"SELECT TOP 1 * FROM {TD_DB}.train_embeddings_fc")
table_columns = [column_name for column_name, *_ in cursor.description]
print("Table columns:", table_columns)

Table columns: ['row_id', 'v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9', 'v10', 'v11', 'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19', 'v20', 'v21', 'v22', 'v23', 'v24', 'v25', 'v26', 'v27', 'v28', 'v29', 'v30', 'v31', 'v32', 'v33', 'v34', 'v35', 'v36', 'v37', 'v38', 'v39', 'v40', 'v41', 'v42', 'v43', 'v44', 'v45', 'v46', 'v47', 'v48', 'v49', 'v50', 'v51', 'v52', 'v53', 'v54', 'v55', 'v56', 'v57', 'v58', 'v59', 'v60', 'v61', 'v62', 'v63', 'v64', 'v65', 'v66', 'v67', 'v68', 'v69', 'v70', 'v71', 'v72', 'v73', 'v74', 'v75', 'v76', 'v77', 'v78', 'v79', 'v80', 'v81', 'v82', 'v83', 'v84', 'v85', 'v86', 'v87', 'v88', 'v89', 'v90', 'v91', 'v92', 'v93', 'v94', 'v95', 'v96', 'v97', 'v98', 'v99', 'v100', 'v101', 'v102', 'v103', 'v104', 'v105', 'v106', 'v107', 'v108', 'v109', 'v110', 'v111', 'v112', 'v113', 'v114', 'v115', 'v116', 'v117', 'v118', 'v119', 'v120', 'v121', 'v122', 'v123', 'v124', 'v125', 'v126', 'v127', 'v128', 'v129', 'v130', 'v131', 'v132', 'v133', 'v134', 'v135', 'v