# Import packages

In [2]:
import pandas as pd
from teradataml import *
import ast
import json
import torch
from teradataml.dataframe.copy_to import copy_to_sql
from transformers.generation import GenerationMixin

from modules.db import TeradataDatabase
from modules.models import( 
    OpusTranslationModelConfig, 
                        OpusTranslationModel, 
                        SentenceEmbeddingConfig, 
                        SentenceEmbeddingModel, 
)
from utils import clean_text, load_embedding_model, unicode_clean, load_translation_model
from constants import (
    CLEANED_TEST_DATA_PATH, TRAIN_VAL_DATA_PATH, CLASS_EMBEDDINGS_PATH, PRODUCT_TEST_EMBEDDINGS_PATH, 
    CLEANED_GPC_PATH, CLEANED_TEST_DATA_PATH, TEST_DATA_PATH, E5_LARGE_INSTRUCT_CONFIG_PATH, 
    OPUS_TRANSLATION_CONFIG_PATH, DATA_PATH, PRODUCT_TRAIN_EMBEDDINGS_PATH, QWEN3_8B_CONFIG_PATH, VALIDATION_DATA_PATH,
    FULL_DATA_SET_DATA_PATH, PRODUCT_FULL_DATASET_EMBEDDINGS_PATH,  CLEANED_FULL_DATASET_DATA_PATH,
    CLASS_EMBEDDINGS_PATH_QWEN, PRODUCT_FULL_DATASET_EMBEDDINGS__QWEN_PATH
)


## Connect to database

In [3]:
# Create the project's Teradata wrapper (imported from modules.db) and open
# the session that every later cell uses via `td_db`.
# NOTE(review): connection settings are not visible in this notebook —
# presumably they are resolved inside TeradataDatabase; confirm.
td_db = TeradataDatabase()
td_db.connect()

In [None]:
# Fetch every row and column of the GPC table, then wrap the result in a
# pandas DataFrame for local inspection.
# NOTE(review): whether column names survive depends on what execute_query
# returns (not shown here) — confirm.
tdf = td_db.execute_query("Select * from demo_user.gpc_orig")
df = pd.DataFrame(tdf)


In [5]:
# Preview the first rows of the table as loaded, before any column is dropped or cleaned.
df.head()

Unnamed: 0,id,SegmentCode,SegmentTitle,SegmentDefinition,FamilyCode,FamilyTitle,FamilyDefinition,ClassCode,ClassTitle,ClassDefinition,BrickCode,BrickTitle,BrickDefinition_Includes,BrickDefinition_Excludes,AttributeCode,AttributeTitle,AttributeDefinition,AttributeValueCode,AttributeValueTitle,AttributeValueDefinition
0,13392,94000000,Crops,"Crops of cultivated plants, being seedbearing ...",94020000,Crops for Food Production,Crops primarily cultivated for the production ...,94021300,Crops for Production of Edible Fruit from Sapo...,Cultivated crops for the production of edible ...,10007139,Mamey Sapotetrees (Pouteria sapota),Includes any product that can be described/obs...,Specifically excludes Mamey Sapotetrees cultiv...,20003029.0,Crop Production Purpose,This particular cultivated crop will be grown ...,30017725.0,NURTURING,
1,4607,83000000,Building Products,,83010000,Building Products,,83011900,Fixings/Fasteners Hardware,,10003186,Brackets/Braces,Includes any products that may be described/ob...,Excludes products such as Mountings as well as...,20002469.0,If Heavy Duty,"Indicate, with reference to the product brandi...",30002960.0,NO,
2,7390,47000000,Cleaning/Hygiene Products,All products that are used to clean or improve...,47200000,Cleaning/Hygiene Supplies,,47200200,Cleaning Aids,,10000397,Cleaning Aids Accessories,Includes any products that can be described/ob...,Excludes products such as all homecare electri...,20001709.0,Target Use/Application,"Indicates, with reference to the product brand...",30005002.0,METAL - SILVER,
3,0,70000000,Arts/Crafts/Needlework,All products that are used to any creative act...,70010000,Arts/Crafts/Needlework Supplies,All products that are used to any creative act...,70010100,Artists Painting/Drawing Supplies,Includes any products that may be described/ob...,10001674,Artists Brushes/Applicators,Includes any products that can be described/ob...,Specifically excludes Household Paint Brushes ...,20001337.0,Type of Artists Brush/Applicator,"Indicates, with reference to the product brand...",30008491.0,ARTISTS BLENDER/FINISHING BRUSH,
4,13393,94000000,Crops,"Crops of cultivated plants, being seedbearing ...",94020000,Crops for Food Production,Crops primarily cultivated for the production ...,94021300,Crops for Production of Edible Fruit from Sapo...,Cultivated crops for the production of edible ...,10007139,Mamey Sapotetrees (Pouteria sapota),Includes any product that can be described/obs...,Specifically excludes Mamey Sapotetrees cultiv...,20003029.0,Crop Production Purpose,This particular cultivated crop will be grown ...,30017724.0,PHARMACEUTICAL,


### Removing unnecessary columns

In [None]:
# DDL that drops the numeric code columns and all Attribute* columns from
# demo_user.gpc_orig, leaving the title/definition text columns.
# NOTE(review): this alters the table in place and is likely not re-runnable —
# a second execution presumably fails because the columns no longer exist; confirm.
query = """
ALTER TABLE demo_user.gpc_orig 
DROP SegmentCode,
DROP FamilyCode,
DROP ClassCode,
Drop BrickCode,
DROP AttributeCode,
DROP AttributeTitle,
DROP AttributeDefinition,
DROP AttributeValueCode,
DROP AttributeValueTitle,
DROP AttributeValueDefinition;
"""

In [11]:
# Run the ALTER TABLE statement held in `query`; the return value is kept in
# `tdf` but is not read again in the cells shown here.
tdf = td_db.execute_query(query)

### Cleaning the GPC Columns

In [12]:
# Free-text GPC columns that all receive exactly the same normalisation.
GPC_TEXT_COLUMNS = [
    "SegmentTitle",
    "SegmentDefinition",
    "FamilyTitle",
    "FamilyDefinition",
    "ClassTitle",
    "ClassDefinition",
    "BrickTitle",
    "BrickDefinition_Includes",
    "BrickDefinition_Excludes",
]


def sql_clean_expr(column):
    """Return the SQL expression that normalises one text column.

    The expression applies, innermost first:
      1. remove every run of digits,
      2. replace the separator characters in ``[-_/\\|]`` with a space,
      3. replace any remaining punctuation with a space,
      4. TRIM the ends and lower-case the result.

    Args:
        column: Name of the column to clean. It is interpolated verbatim, so
            it must be a trusted identifier (here: the constants above).

    Returns:
        The SQL expression as a string, without a trailing comma.
    """
    # "\\" is a single backslash once Python has parsed the literal, so the
    # SQL text sent to the database contains the pattern [-_/\|].
    no_digits = f"REGEXP_REPLACE({column}, '[[:digit:]]+', '')"
    no_separators = f"REGEXP_REPLACE({no_digits}, '[-_/\\|]', ' ')"
    no_punct = f"REGEXP_REPLACE({no_separators}, '[[:punct:]]', ' ')"
    return f"LOWER(TRIM({no_punct}))"


# One "<col> = <cleaned col>" assignment per text column, generated from the
# list above so the rule is defined in exactly one place.
set_clause = ",\n    ".join(
    f"{col} = {sql_clean_expr(col)}" for col in GPC_TEXT_COLUMNS
)

# In-place UPDATE of demo_user.gpc_orig; token-for-token the same statement as
# the hand-written version, only the insignificant whitespace differs.
cleaning_query = f"""
UPDATE demo_user.gpc_orig
SET {set_clause};
"""

In [13]:
# Run the UPDATE held in `cleaning_query`, which rewrites the text columns of
# demo_user.gpc_orig in place; the return value stored in `tdf` is not read
# again in the cells shown here.
tdf = td_db.execute_query(cleaning_query)

## Disconnect

In [None]:
# Close the database session opened by td_db.connect() at the top of the notebook.
td_db.disconnect()