In [2]:


## Running the Hugging Face open-source models
# Requires the Visual C++ Build Tools to be installed manually from: https://visualstudio.microsoft.com/visual-cpp-build-tools/

import warnings
from rich import print
import re
from nltk.stem import WordNetLemmatizer
from transformers import MarianMTModel, MarianTokenizer
import yake
from textblob import TextBlob
from nltk.corpus import wordnet as wn
from nltk.corpus import words, stopwords
import spacy

# Make sure the NLTK corpora read by the cells below are installed.
# Each corpus is downloaded only when it is missing, so this cell is safe to
# re-run and a fresh environment no longer fails with LookupError the first
# time the corpora are read. ('punkt' from the earlier manual list is left
# out: no NLTK tokenizer is called by the code visible in this notebook.)
import nltk

NLTK_RESOURCES = ("wordnet", "omw-1.4", "stopwords", "words")
for _resource in NLTK_RESOURCES:
    try:
        nltk.data.find(f"corpora/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)

# Suppress all warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# --- Pipeline configuration -------------------------------------------------
# Everything in this cell is module-level state shared by the functions below.

# Default YAKE keyword extractor (library defaults for every option).
# NOTE(review): neither extractor is referenced by the code visible in this
# notebook section — confirm whether they are still needed.
yake_extractor = yake.KeywordExtractor()

# Customised YAKE extractor: English text, n-grams of up to three words,
# a deduplication limit of 0.1, and the top 20 keywords.
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.1
numOfKeywords = 20
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

# Load the translation model (use Spanish-to-English model).
# from_pretrained fetches the weights by name, so this cell can be slow the
# first time it runs.
model_name = 'Helsinki-NLP/opus-mt-es-en'  # Spanish to English model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()
# Get the English words corpus and English stopwords.
# Both require the corresponding NLTK corpora to be installed already.
english_words = set(words.words())
english_stopwords = set(stopwords.words('english'))
# Domain-specific stopwords: shipping / customs boilerplate, units, contact
# labels and other tokens that carry no information about the goods.
# Repeated entries are harmless — the set literal keeps a single copy.
# NOTE(review): entries containing a space ('hs  code', 'hs code') can never
# equal a single whitespace-delimited token — confirm how they should match.
additional_stopwords = {
    # Custom stopword list
    'hs', 'code', 'hscode', 'hs-code', 'hs  code', 'pallets', 'plts', 'shipper', 'declares', 'hs code',
    'containing', 'contains', 'meter', 'cubic', 'packages', 'load', 'loaded', 'weight', 
    'netweight', 'kg', 'kgs', 'cb', 'cbm', 'goods', 'parts', 'pieces', 'accessories', 'packing', 
    'declared', 'dangerous', 'impression', 'items', 'sheets', 'codes', 
    'sin', 'impresion', 'containers', 'pc', 'abv', 'net', 'gross', 'cif', 'aduana', 'customs', 
    'value', 'tax', 'duty', 'freight', 'port', 'terminal', 'consignee', 'consignor', 'invoice', 
    'manifest', 'quantity', 'description', 'volume', 'packaging', 'shipment', 'delivery', 'origin', 
    'destination', 'transport', 'carrier', 'export', 'import', 'tariff', 'item', 'declaration', 
    'clearance', 'documentation', 'commercial', 'charge', 'fees', 'logistics', 'shipping', 
    'container', 'unit', 'measurement', 'certification', 'palletized', 'metric', 'commodity', 
    'classification', 'entry', 'exportation', 'importation', 'bonded', 'zone', 'trade', 'license', 'bottle', 'bottles', 'cl',
    'ancho', 'largo', 'mm', 'pcs', 'xhc', 'stc', 'uks','x','k', 'pty', 'id', 'cp', 'ncm', 'ne', 'itpa', 'zz', 'xg', 'topmag',
    'rtmx', 'fcl', 'cf','f', 'xdc', 'pkgs', 'voice', 'n', 'per', 'email', 'phone', 'fax', 'tax', 'id', 'sms', 
    'tel', 'mobile', 'cell', 'cellular', 'mobile', 'mobiles', 'telephone', 'fax', 'email', 'mail', 'mails', 'email', 
    'emails', 'faxes', 'emails', 'fax', 'faxes', 'phones', 'phone', 'tel', 'tels', 'telephone', 'telephones', 'cell', 
    'cellular', 'cellulars', 'mobile', 'mobiles', 'prepaid'
}

# Combined stopword set: the NLTK English stopwords plus the custom terms above.
stop_words = english_stopwords.union(additional_stopwords)

# Load the English language model (spaCy pipeline used for part-of-speech tags)
nlp = spacy.load("en_core_web_sm")

def lemmatize_translate_clean(text):
    """Drop stopwords from ``text`` and lemmatize the remaining words.

    Each whitespace-separated token is lower-cased and stripped of leading and
    trailing punctuation before it is compared with ``stop_words`` (the English
    stopwords plus the custom shipping terms), so capitalised or punctuated
    tokens such as ``"Code"`` or ``"prepaid."`` are filtered too. The check is
    repeated on the lemma so that a plural whose singular form is a stopword
    is removed as well.

    Args:
        text: Text to clean; in the pipeline this is the English translation.

    Returns:
        The surviving lemmas joined by single spaces, or ``""`` when ``text``
        is empty or nothing survives.
    """
    if not text:
        return ""

    cleaned_lemmas = []
    for raw_token in text.split():
        # The translator may capitalise words or attach punctuation; normalise
        # the token first so the stopword lookup is not defeated by either.
        token = re.sub(r'^\W+|\W+$', '', raw_token.lower())
        if not token or token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Lemmatizing can turn a non-stopword into one (e.g. a plural form).
        if lemma in stop_words:
            continue
        cleaned_lemmas.append(lemma)
    return " ".join(cleaned_lemmas)


# Function to preprocess text
def preprocess_text(text):
    """Normalise raw text before translation.

    Lower-cases the text and treats every run of digits and every punctuation
    character as a word separator. They are replaced with a space rather than
    deleted, so words that were only separated by digits or punctuation (for
    example ``"RAGO-GROUP.COM"``) are not fused into one token. Whitespace is
    then collapsed to single spaces.

    Args:
        text: Raw input string; ``None`` or ``""`` yields ``""``.

    Returns:
        The lower-cased text containing only word characters separated by
        single spaces, with no leading or trailing whitespace.
    """
    if not text:
        return ""
    text = text.lower()  # Convert to lowercase
    # Replace numbers and punctuation with a space so neighbours stay separate
    text = re.sub(r'[\d]+|[^\w\s]', ' ', text)
    # Collapse the runs of whitespace the substitution leaves behind
    return re.sub(r'\s+', ' ', text).strip()

# Function to translate text to English
def translate_to_english(text, tokenizer, model):
    """Translate ``text`` with the given tokenizer / model pair.

    Args:
        text: Source-language text to translate.
        tokenizer: Callable tokenizer that builds the model inputs and offers
            ``decode``.
        model: Model exposing ``generate`` for the tokenized inputs.

    Returns:
        The decoded first generated sequence, or ``""`` when ``text`` is empty
        or whitespace only (the model is not called in that case).
    """
    # Nothing to translate: skip the model rather than decode whatever it
    # would generate for an empty prompt.
    if not text or not text.strip():
        return ""
    # Tokenize the input text; truncation keeps overly long inputs within the
    # maximum length the tokenizer is configured for instead of failing.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Generate translation
    translated = model.generate(**inputs)
    # Decode and return the translated text
    return tokenizer.decode(translated[0], skip_special_tokens=True)

def get_synonyms(term):
    """Collect the WordNet lemma names of every synset of ``term``.

    Underscores in lemma names are replaced with spaces for readability.

    Args:
        term: Word to look up.

    Returns:
        A set of lemma names; empty when ``term`` has no synsets.
    """
    return {
        lemma.name().replace('_', ' ')
        for synset in wn.synsets(term)
        for lemma in synset.lemmas()
    }

def extract_nouns(phrase):
    """Return the tokens of ``phrase`` that spaCy tags as ``NOUN``.

    The list is also printed, prefixed with ``Nouns are:``.

    Args:
        phrase: Text to analyse.

    Returns:
        The noun token texts in their original order.
    """
    nouns = []
    for token in nlp(phrase):
        if token.pos_ == "NOUN":
            nouns.append(token.text)
    print("Nouns are:", nouns)
    return nouns

def extract_adjectives(phrase):
    """Return the tokens of ``phrase`` that spaCy tags as ``ADJ``.

    The list is also printed, prefixed with ``Adjectives are:``.

    Args:
        phrase: Text to analyse.

    Returns:
        The adjective token texts in their original order.
    """
    adjectives = []
    for token in nlp(phrase):
        if token.pos_ == 'ADJ':
            adjectives.append(token.text)
    print("Adjectives are:", adjectives)
    return adjectives

# Full pipeline function
def process_text(text):
    """Run the full pipeline: preprocess, translate to English, then clean.

    Uses the module-level ``tokenizer`` and ``model`` for the translation step.
    Any exception raised along the way is printed and turned into an empty
    result, so callers always get a string back.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, lemmatized English text, or ``""`` if any step failed.
    """
    try:
        # Steps 1 and 2: normalise the raw text, then translate it
        english_text = translate_to_english(preprocess_text(text), tokenizer, model)
        # Step 3: lemmatize and strip stopwords
        cleaned = lemmatize_translate_clean(english_text)
    except Exception as e:
        print(f"An error occurred: {e}")
        return ""
    return cleaned
    
# Example usage: run the full pipeline on one sample cargo description and
# print the cleaned result. The commented-out assignments below are
# alternative samples; uncomment exactly one to try it.
#input_text = "Este es un texto en español que queremos traducir y limpiar."
#input_text = "said contain plastic drum h msku dry shipper seal pw plastic drum h stowed methyl butyric acid corrosive liqu"
#input_text = "TUBOS 7306.3011 TUBOS Y ACCESORIOS .1990-7307.9980 RAMPAS .9000 TAPON  TUERCA DE LATON .3300 ACCESORIOS DE TUBERIA DE ALUMINIO .0000"
#input_text = "xhr said contain package sudu reef shipper seal kn package total versatis mg parches lot r temperature must main taine"
#input_text = "containerstotal boxesporcelain tile glazed size exp dt bill dt wt kgsfreight collectthis master bill"
#input_text = "CARTULINA NEGRA BOB 170 GRS HS CODE 480258 CARTULINA BLANCA BOB 210 GRS HS CODE 480258"
#input_text = "GYM EQUIPMENT AS PER COMMERCIAL INVOICE N VXUSD210182HS CODE  950691FREIGHT PREPAIDGYM EQUIPMENT AS PER COMMERCIAL INVOICE N VXUSD210182HS CODE  95069"
#input_text = "HERRAJES PARA MUEBLES HS CODE 83024200 EMAIL G.GIACOMO RAGO-GROUP.COM"
# NOTE(review): the active sample (and some of the alternatives) appear to
# contain contact details and tax identifiers — confirm they are safe to
# commit and share, or replace them with synthetic data.
input_text = "DISPOSABLE MEDICAL FACE MASKN AHS CODE  630790 FAX EMAIL TAX ID 9144010178376732X5  TAX ID  SME980702916 T   52  55 4334 7000EXT 7043PHONE  FAX EMAIL  JONATHAN LIEDOT BOLLORE COMFREIGHT PREPAID"
# process_text returns "" when any pipeline step raises, so an empty line
# printed here means the run failed (the error is printed by process_text).
cleaned_output = process_text(input_text)
print(cleaned_output)    

### Extract nouns and search synonyms

In [None]:
import nltk
from nltk.corpus import wordnet as wn
import spacy

# Fetch the WordNet data used by the synonym lookup (only needed once)
for _corpus in ('wordnet', 'omw-1.4'):
    nltk.download(_corpus)

# Load the spaCy English pipeline used for part-of-speech tagging
nlp = spacy.load("en_core_web_sm")

# Function to fetch synonyms from WordNet
def get_synonyms(term):
    """Gather WordNet synonyms for ``term``.

    Every lemma name of every synset of ``term`` is included, with underscores
    replaced by spaces for readability.

    Args:
        term: Word to look up.

    Returns:
        A set of synonym strings; empty when WordNet has no synsets for
        ``term``.
    """
    found = set()
    for synset in wn.synsets(term):
        found.update(lemma.name().replace('_', ' ') for lemma in synset.lemmas())
    return found

def extract_nouns(phrase):
    """Return the tokens of ``phrase`` that spaCy tags as ``NOUN``.

    This version produces no printed output; it only returns the list.

    Args:
        phrase: Text to analyse.

    Returns:
        The noun token texts in their original order.
    """
    return [token.text for token in nlp(phrase) if token.pos_ == "NOUN"]

# Demo: clean one sample description with the full pipeline, then list the
# nouns spaCy finds in the cleaned text.
input_text = "CARTULINA NEGRA BOB 170 GRS HS CODE 480258 CARTULINA BLANCA BOB 210 GRS HS CODE 480258"
# process_text is defined in an earlier cell, which must have been run first.
cleaned_output = process_text(input_text)
nouns = extract_nouns(cleaned_output)

print(f"Nouns extracted from '{cleaned_output}': {nouns}")