In [1]:
import pandas as pd
import numpy as pp
from sklearn.utils import shuffle
import multiprocessing
from functools import partial
import string
from joblib import Parallel, delayed
import stanza
import copy
from spacy.lang.en import stop_words as stop_words
import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
import os
from collections import Counter
from textblob import TextBlob


### Context Similarity & Embedding Extraction (Unsupervised Approach)

In this step, we apply an **unsupervised approach** to evaluate and assign the most relevant labels to each company, without relying on external texts in order to help with the classification.

We extract the **context similarity matrix** using `SentenceTransformer` for each company column (except sector). Additionally, we compute the **vector embeddings** of the most important tokens from each company — selected through various methods, which we will explore further.

We also generate **label embeddings** for all taxonomy labels, filtering out less relevant terms (e.g., "services") and keeping only the most representative ones. For each label, we also extract and embed the **strongest word**, i.e. the word that has the strongest correlation to any company.

These embeddings allow us to compare companies with multiple candidate labels and use the strongest words as a **tie-breaking criterion**, especially when label scores are close or ambiguous.

All of this logic is part of a broader workflow and is used within another program called **`classifier2_continuation`**, where we continue the classification process using these extracted signals.

Here we remove the output files left over from a previous run, so that we do not have to delete them manually.

In [2]:
# Delete artefacts written by a previous run so that this run starts from a clean slate.
# Paths are relative to the notebook's working directory; missing files are simply skipped.
stale_outputs = (
    "context_matrix.txt",
    "context_sentence_embedding.txt",
    "label_embeddings.txt",
    "vec_embeddings.txt",
    "tokens_vector.csv",
    # NOTE(review): this entry has no file extension, unlike the others - confirm the real file name.
    "new_label_with_categories",
    "most_specific_term.csv",
    "match_words_with_tf_idf.txt",
    "counted_elements.csv",
    "sectors.csv",
)
for stale_path in stale_outputs:
    if os.path.exists(stale_path):
        os.remove(stale_path)

**Company Data**

This is the dataset containing the **company information** that we will use throughout the project for classification.

In [3]:
# Load the company dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../inputs/ml_insurance_challenge.csv")
# Number of CPUs reported by the operating system.
num_cores = multiprocessing.cpu_count()

### Curated ConceptNet Embeddings with Numberbatch

We use **Numberbatch** from **ConceptNet** to help generate word embeddings. The original Numberbatch file was approximately **20GB**, which made it extremely slow to load. Additionally, it was difficult to determine if a term came from a specific language.

To improve both **performance** and **accuracy**, we created a **curated version** of Numberbatch. In this version, we extracted only the **foreign terms relevant to company titles**, allowing us to better match multilingual input.

This helped us achieve **higher precision** when assigning labels to companies based on their descriptions.

In [4]:
from common_functions_f import generate_noun_for_adj, open_filtered_assertions_file, generate_embedings_index

In [5]:
import numpy as np
# Build the embedding lookup with the project helper from common_functions_f.
# Later cells query it with `.get(word, ...)` and `.keys()`, i.e. it is used as a word -> vector mapping.
embeddings_index =  generate_embedings_index()

**New Columns for Tokenization**

These are the **new columns** we generated and prepared specifically for the **tokenization process**.  
They combine or transform existing fields (e.g., `description`, `business_tags`, `category` etc.) to provide a cleaner and more consistent input for the NLP tasks.

In [6]:

# Replace missing values in the free-text columns with empty strings so that
# string operations further down never encounter NaN in these columns.
for text_column in ("description", "sector", "niche", "category"):
    df[text_column] = df[text_column].fillna("")

# Two helper columns, both initialised with the (already filled) sector values.
for helper_column in ("new_col", "new_col2"):
    df[helper_column] = df["sector"]

# Independent deep copy of the frame in its current state.
df2 = copy.deepcopy(df)

In [7]:
import re

### Handling CamelCase Words

Many entries in the dataset—especially **company names**—contain **CamelCase formatting** (e.g., `SeniorService`).  
To improve clarity and ensure better **tokenization and semantic analysis**, we decided to **split CamelCase words into separate tokens** (e.g., `Senior Service`).

This helps processes like embedding, similarity comparison, and label matching to perform more accurately.

In [8]:
def camel_case_split(s):
    """Split a CamelCase word into lower-cased, space-separated parts.

    Strings that are entirely upper-case (e.g. acronyms) are returned unchanged.
    Otherwise every character that is neither a word character nor whitespace is
    replaced by a space, a break is inserted before each upper-case character, and
    existing underscores act as breaks as well.

    Parameters
    ----------
    s : str
        The word to split.

    Returns
    -------
    str
        The lower-cased parts joined by single spaces (spaces that were already
        inside a part are kept as they are).
    """
    if s.isupper():
        return s

    without_punctuation = re.sub(r'[^\w\s]', ' ', s)

    # Mark each upper-case character with a leading underscore, then cut on underscores.
    marked = ''.join('_' + ch if ch.isupper() else ch for ch in without_punctuation)
    parts = [part for part in marked.lower().split('_') if part]
    return " ".join(parts)
def camel_case_split2(s):
    """Split a CamelCase word, leaving hyphenated/underscored words intact.

    * Entirely upper-case strings are returned unchanged.
    * Strings containing ``-`` or ``_`` are only lower-cased.
    * Everything else gets a break before each upper-case character and is
      returned lower-cased with the parts joined by single spaces. Punctuation
      is not removed here.

    Parameters
    ----------
    s : str
        The word to split.

    Returns
    -------
    str
        The processed word.
    """
    if s.isupper():
        return s
    if "-" in s or "_" in s:
        return s.lower()

    # Mark each upper-case character with a leading underscore, then cut on underscores.
    marked = ''.join('_' + ch if ch.isupper() else ch for ch in s)
    parts = [part for part in marked.lower().split('_') if part]
    return " ".join(parts)

**Acronym Handling**

Some **acronyms** in our dataset were found to be **semantically ambiguous** when compared using Numberbatch embeddings.  
To address this, we tested the **cosine similarity** between each acronym and its corresponding full form. When the similarity score was low — indicating a mismatch — we replaced the acronym with its **expanded version**, especially for terms that appeared frequently.  
This helped improve embedding accuracy and reduced the risk of misclassification during label assignment.

In [9]:
# Acronym / short form -> expanded replacement token.
# Look-ups against this dict are exact (case-sensitive) key matches, which is why
# several spellings of "covid" are listed explicitly.
acronym_dict = {
    "IT": "information_technology",
    "ID": "identity",
    "PP": "polypropylene",
    "CBD": "cannabidiol",
    "covid" : "coronavirus",
    "COVID": "coronavirus",
    "Covid": "coronavirus",
    "Covid-19": "coronavirus",
    "COVID-19": "coronavirus",
    "covid-19": "coronavirus"



}

def replace_upper_word(words, acronym_dict):
    """Expand known acronyms in a sequence of words.

    Each word is looked up in ``acronym_dict`` after removing apostrophes from it.
    A hit is replaced by the dictionary value; a miss keeps the original word
    (apostrophes included).

    Parameters
    ----------
    words : iterable of str
        Words to process.
    acronym_dict : dict
        Mapping from acronym to its replacement.

    Returns
    -------
    list of str
        The processed words, in input order.
    """
    expanded = []
    for word in words:
        lookup_key = word.replace("'", "")
        if lookup_key in acronym_dict:
            expanded.append(acronym_dict[lookup_key])
        else:
            expanded.append(word)
    return expanded

**Text Normalization**

As part of our preprocessing, we cleaned individual words by removing unwanted leading or trailing characters.  
For example, a word like `"'paper"` was normalized to `"paper"` to ensure consistency and improve token matching.

In [10]:
def replace_word(text):
    """Strip stray hyphens/apostrophes from the edges of a word.

    Words of length 0 or 1 are returned untouched. Otherwise one leading and one
    trailing ``-`` or ``'`` is removed, and any remaining apostrophes are dropped
    unless the word contains ``'s``.

    Parameters
    ----------
    text : str
        The word to normalise.

    Returns
    -------
    str
        The cleaned word, e.g. ``"'paper"`` -> ``"paper"``.
    """
    if len(text) <= 1:
        return text

    edge_chars = ("-", "'")
    if text.startswith(edge_chars):
        text = text[1:]
    if text.endswith(edge_chars):
        text = text[:-1]

    # Keep apostrophes only when the word contains "'s"; otherwise remove them all.
    keeps_apostrophes = "'s" in text
    if "'" in text and not keeps_apostrophes:
        text = text.replace("'", "")
    return text

**Acronym Replacement**

To avoid confusion during tokenization and embedding, we replaced acronyms using a predefined dictionary of long forms.  
This ensures that terms are represented consistently and accurately, especially when the acronyms could be misunderstood or poorly represented in the embedding space.

In [11]:

def replace_upper_word_tags(words, acronym_dict):
    """Normalise a list of tag strings into one space-separated string.

    Every tag is split on single spaces. Each piece is cleaned with
    ``replace_word``; a cleaned piece found in ``acronym_dict`` is replaced by its
    expansion, any other piece is lower-cased. A standalone ``","`` token is
    inserted between consecutive tags.

    Parameters
    ----------
    words : list of str
        The tag strings.
    acronym_dict : dict
        Mapping from acronym to its replacement.

    Returns
    -------
    str
        All tokens (including the ``","`` separators) joined by single spaces.
    """
    tokens = []
    last_position = len(words) - 1
    for position, tag in enumerate(words):
        for raw_piece in tag.split(" "):
            cleaned = replace_word(raw_piece)
            if cleaned in acronym_dict:
                tokens.append(acronym_dict[cleaned])
            else:
                tokens.append(cleaned.lower())
        if position != last_position:
            tokens.append(",")
    return " ".join(tokens)

In [12]:
def replace_sector_word(word):
    """Turn a sector name into a single lower-case token.

    Parameters
    ----------
    word : str
        Sector name, possibly containing spaces.

    Returns
    -------
    str
        The lower-cased name with every space replaced by an underscore.
    """
    lowered = word.lower()
    return "_".join(lowered.split(" "))

In [13]:
def check_punctuation(word):
    """Tell whether a string consists only of punctuation characters and spaces.

    Parameters
    ----------
    word : str
        The string to inspect.

    Returns
    -------
    bool
        ``True`` when every character is a space or is in ``string.punctuation``
        (which also makes the empty string ``True``), ``False`` otherwise.
    """
    for char in word:
        if char != " " and char not in string.punctuation:
            return False
    return True

### Data Preprocessing Strategy

To avoid issues during the NLP processing phase, we made a few important preprocessing decisions.

- **Business tags** were left as they are, without splitting the terms, so as not to interfere with the NLP model.
- For the other columns, we split the terms and **merged the entire text content** from all relevant fields into one combined string per company. This approach is more efficient than running NLP on each column separately, which proved to be slower.

We defined **five different processing cases**, one of which is the **company title**. The title is usually found before keywords like `"is"`, `"in"`, or `"was"` in the description. If no such keyword exists, we simply extract the **first five words** as the company name. The company title is given a **high weight**, as it often strongly indicates the company’s category — and we found this to be especially helpful.

This is also why we included **foreign words** in the embedding vocabulary — for instance, words like `"Pintura"` (Spanish for "painting") often appear in company names and carry important meaning for classification.

We treated the rest of the fields — **company description, business tags, category, and sector** — as separate components in our embedding and NLP pipeline.

Additionally:
- We split **description** and **business tags** into lists to ensure no string exceeds the character limits imposed by our NLP model (e.g., < 1,000,000 characters).
- We chose **`;`** as a separator between different text segments. This worked significantly better than other options like `"."` or `","`, especially when using string index functions and during **lemmatization**, where `";"` proved to be reliable for segmenting content clearly.

In [14]:
# Punctuation to strip from the tags: everything except comma, underscore, hyphen and apostrophe.
relevant_pct = "".join(ch for ch in string.punctuation if ch not in ",_-'")
df2 = df.copy()

# Translation table that deletes every character listed in relevant_pct.
strip_relevant_pct = str.maketrans('', '', relevant_pct)

for idx in df2.index:
    raw_tags = df2.at[idx, 'business_tags']
    # "/" becomes a space, unwanted punctuation is removed, the text is lower-cased
    # and finally cut at every apostrophe.
    pieces = raw_tags.replace("/", " ").translate(strip_relevant_pct).lower().split("'")
    # Keep only pieces with real content: drop empty strings and pieces made up
    # solely of punctuation and/or spaces.
    df2.at[idx, 'business_tags'] = [
        piece
        for piece in pieces
        if piece not in string.punctuation and not check_punctuation(piece)
    ]

In [15]:
# Snapshot of df taken before the next cell strips punctuation and tokenises its text columns.
df_old_copy = df.copy()


In [16]:
import math
# Accumulators: every company contributes one segment to each of these strings.
str_company_titles = ""
str_company_description_elements = ""
index_until_company_description_ends = []

# Finished description chunks; a new chunk is opened before the running string
# would grow beyond 1,000,000 characters (see the loop below).
str_list_company_description_elements = []


# Sector frequencies: dict_sectors_elements maps sector -> number of rows,
# dict_sectors_elements2 is the set of sectors covering at most 20% of all rows.
list_sectors_elements = df['sector'].to_list()
set_sectors_elemenets = set(list_sectors_elements)
dict_sectors_elements = {i:list_sectors_elements.count(i) for i in set(list_sectors_elements)}
dict_sectors_elements2 = set(filter(lambda x: dict_sectors_elements[x]/len(df['description']) <= 0.2, dict_sectors_elements))

# Same running-string / finished-chunks pair as for descriptions, for business tags.
str_company_business_tags_elements = ""
str_list_company_business_tags_elements = []
index_until_business_tags_elements = []

str_company_category_elements = ""
index_until_category_elements = []
rest_comany_description_complete = ""

str_company_niche_elements = ""

title_company_complete = ""
index_until_niche_elements = []
# string.punctuation minus - ' , . /  (the "’" replace is a no-op: string.punctuation is ASCII only).
# Note that "_" stays in this set, so underscores ARE removed by the regex below.
all_punctuation_except_hyphen = string.punctuation.replace("-","").replace("'","").replace("’","").replace(",", "").replace(".","").replace("/", "")
colums = [('description',0), ('business_tags',1), ('category',2), ('niche',3)]
for col, idx in colums:
    # Remove the characters above by using them as a regex character class.
    df[col] = df[col].str.replace('[{}]'.format(all_punctuation_except_hyphen), '', regex=True)

    # Every column except business_tags is turned into a list of whitespace-separated words.
    if(col!='business_tags'):
        df[col] = df[col].str.split()

df['title_description'] = df['description']

title_company_complete_full = ""
z=0

dict_description = {}
dict_tags = {}
for idx, rows in df.iterrows():
    z+=1

    # Sectors outside dict_sectors_elements2 (i.e. covering more than 20% of rows) are blanked;
    # the others are normalised with replace_sector_word.
    df_old_copy.loc[idx, 'sector'] = '' if df.loc[idx, 'sector'] not in dict_sectors_elements2 else replace_sector_word(df.loc[idx,'sector'])



    # NOTE(review): presumably business_tags holds a stringified list whose items are
    # separated by "' '" - confirm against the raw data.
    business_tags = rows['business_tags'].split("' '")

    business_tags_text = replace_upper_word_tags(business_tags, acronym_dict)
    category = rows['category']

    category_text = replace_upper_word(category, acronym_dict)

    niche = rows['niche']
    niche_text = replace_upper_word(niche, acronym_dict)

    words_splitted = rows['description']
    
    # Title boundary: position of the first "is"; failing that the first "in";
    # failing that the first "was". Matching is on exact, case-sensitive tokens.
    max_index = -1
    if "is" in  words_splitted:
        max_index = words_splitted.index("is")
    if "in" in words_splitted and max_index == -1:
        max_index = words_splitted.index("in")
    if "was" in words_splitted and max_index == -1:
        max_index = words_splitted.index("was")

    # No keyword found: treat the first five words as the title.
    if max_index==-1:
        max_index = 5

    # A title never spans more than ten words.
    if max_index > 10:
        max_index = 10   
    

    # Title words go through camel_case_split, the remaining words through camel_case_split2.
    words_splitted_List = list(map(lambda x: camel_case_split(x), words_splitted[:max_index]))
    words_splitted_rest_list = list(map(lambda x: camel_case_split2(x), words_splitted[max_index:]))

    company_title =  " ".join(words_splitted[:max_index])

    title_company_complete = " ".join(words_splitted_List)

    # "; " terminates each company's segment in the concatenated strings.
    title_company_complete_full += title_company_complete+'; '
    

    str_company_titles+=company_title.replace("/", " ")+'. '

    business_tags_text = business_tags_text.replace("-", "_").replace(" _ ", "").replace("/", " ") + '; '
    

    # Start a new chunk instead of letting the current one exceed 1,000,000 characters.
    if len(str_company_business_tags_elements)+len(business_tags_text.replace("/", " ")) > 1000000:
        str_list_company_business_tags_elements.append(str_company_business_tags_elements)
        str_company_business_tags_elements = business_tags_text.replace("/", " ")
    else:
        str_company_business_tags_elements += business_tags_text.replace("/", " ")


    niche_text = (" ".join(niche_text)).replace("-", "_").replace("'","").replace('"',"").replace(" _ ", "").lower() + '; '
    str_company_niche_elements += niche_text.replace("/", " ")

    category_text = (" ".join(category_text)).replace("-", "_").replace("'","").replace('"',"").replace(" _ ", "").lower() + '; '
    str_company_category_elements += category_text.replace("/", " ")

    

    word_rest_list_revised = replace_upper_word(words_splitted_rest_list, acronym_dict)

    rest_company_description = " ".join(word_rest_list_revised).replace("-", "_").replace("'","").replace('"',"").replace(" _ ", "").lower()+'; '
    
    
    # Same chunking rule as for the business tags.
    if len(str_company_description_elements)+len(rest_company_description.replace("/", " ")) > 1000000:
        str_list_company_description_elements.append(str_company_description_elements)
        str_company_description_elements = rest_company_description.replace("/", " ")
    else:
        str_company_description_elements += rest_company_description.replace("/", " ")


# Flush the last (still open) chunk of each kind.
str_list_company_description_elements.append(str_company_description_elements)
str_list_company_business_tags_elements.append(str_company_business_tags_elements)


### Global Token Relevance Filtering with Entity Detection and Label Matching

In this step, we expand the relevance check to include **all tokens**, not just those associated with a specific label.

#### Entity Detection with English Model:
We use spaCy’s **English language model** (`en_core_web_lg`) to process the text and identify named entities and parts of speech across the entire input.

#### Relevance Filtering:
To ensure that only meaningful tokens are kept:
- We compare **each token from the title** against the label embeddings using **cosine similarity**.
- If a token has a **similarity score higher than 0.35** to any label, we **consider it relevant** and keep it.
- Tokens that don't meet this threshold are discarded to reduce noise.

This filtering helps us isolate only the **most relevant terms**, which are likely to carry strong semantic meaning for classification.

#### Weighting Important Terms:
Tokens that pass the relevance threshold are considered **crucial features**, and we apply an increased weighting of **×4** to them during scoring and embedding aggregation. This gives them greater influence in the final label selection process.

In [17]:
import numpy as np
import spacy
# English spaCy pipeline; the 'en_core_web_lg' model package must be installed separately.
nlp = spacy.load('en_core_web_lg') 

In [18]:
# Parse the concatenated niche and category strings, each in a single nlp() call.
nlp_niche = nlp(str_company_niche_elements)
nlp_category = nlp(str_company_category_elements)

# Initialised empty here; the business-tag and description texts are processed chunk by chunk.
nlp_business_tags = []
nlp_description = []

In [19]:
# One result slot per text chunk, pre-filled with None.
nlp_description = [None] * len(str_list_company_description_elements)
nlp_business_tags = [None] * len(str_list_company_business_tags_elements)


**Global Variables for Parallelization**

We use **global variables** in our implementation to support **parallelization**, allowing multiple threads to access shared data efficiently.  
This design choice helps us significantly **speed up computation**.

In [20]:
import threading

def get_nlp(no, number):
    """Run the spaCy pipeline on one text chunk and store the result in its slot.

    Parameters
    ----------
    no : int
        ``0`` selects the business-tag chunks; any other value selects the
        description chunks.
    number : int
        Index of the chunk to process; the result is written to the same index
        of the matching module-level result list.

    Returns
    -------
    None
        The parsed document is stored in ``nlp_business_tags`` or
        ``nlp_description`` as a side effect.
    """
    if no == 0:
        texts, results = str_list_company_business_tags_elements, nlp_business_tags
    else:
        texts, results = str_list_company_description_elements, nlp_description
    # Item assignment mutates the shared list in place, so no `global` statement is required.
    results[number] = nlp(texts[number])
        

In [21]:
# One worker thread per text chunk: all description chunks first (mode 1),
# then all business-tag chunks (mode 0).
pending_jobs = [(1, chunk_no) for chunk_no in range(len(str_list_company_description_elements))]
pending_jobs += [(0, chunk_no) for chunk_no in range(len(str_list_company_business_tags_elements))]

threads = []
for job_args in pending_jobs:
    thread = threading.Thread(target=get_nlp, args=job_args)
    threads.append(thread)
    thread.start()

# Block until every chunk has been processed.
for thread in threads:
    thread.join()



### Index Tracking for Business Tags Tokenization

We track the **indexes marking the end of each company's `business_tags`** in order to properly manage how we tokenize them later. Alongside that, we maintain the index of the `text_nlp_business_tags` list, which contains the preprocessed NLP tokens for all business tags.

This indexing system helps us during the **tokenization step for each column**, especially when we need to segment the input correctly per company.

Whenever we reach a new company index in our list, we:
- Move to the corresponding entry in the `text_nlp_business_tags` list
- **Reset the start and end values**
- Repeat the process until we have iterated through all tags

This logic ensures that each company’s business tags are correctly aligned with their corresponding tokenized representation. This logic is applied also to the nlp_description, and it is simpler for category, niche and title.

In [22]:
# Record, for every company, where its business-tag segment ends.
#   index_end_business_tags[i]   -> token position of the ";" that closes segment i,
#                                   counted inside the chunk the segment lives in
#   index_group_business_tags[i] -> index of that chunk in nlp_business_tags
#
# Fix: when moving on to a new chunk, the first boundary used to be stored as
# index(";") + 1, i.e. one token past the separator, whereas every other boundary
# (and the `start = end_index + 1` step below) uses the position of the ";" itself.
# The first boundary of a new chunk now follows the same convention.
j=0


# NOTE(review): the first chunk is scanned through lemmas while later chunks use the raw
# token text. Presumably the ";" separator looks the same in both; `lemma_` is kept so the
# contents of text_nlp_business_tags do not change - confirm whether this is intended.
text_nlp_business_tags = [el.lemma_ for el in nlp_business_tags[j]]

start=0
end_index = text_nlp_business_tags[start:].index(";")
text_list=[]


index_end_business_tags = []
index_group_business_tags = []
list_nlp_business_tags = []
z=0
s=-1
while j<len(nlp_business_tags):
    text_list=[]
    index_end_business_tags.append(end_index)
    index_group_business_tags.append(j)
    
    if end_index+1 >= len(nlp_business_tags[j]):
        # The separator just recorded is the last token of this chunk: go to the next one.
        j+=1
        start=0
        if j < len(nlp_business_tags):
            text_nlp_business_tags = [el.text for el in nlp_business_tags[j]]
            # Position of the first ";" in the new chunk (no +1, see header comment).
            end_index=text_nlp_business_tags[start:].index(";")
    else:
        # Continue after the separator; the slice offset plus one lands on the next ";".
        start=end_index+1
        end_index+= text_nlp_business_tags[start:].index(";")+1
    z+=1


In [23]:
# Token texts of the parsed category string; every company's segment ends with ";".
text_nlp_category = [token.text for token in nlp_category]

# Scratch variables initialised by this cell.
text_list = []
list_nlp_cateogry = []
s = -1

# index_end_category[i] is the position just AFTER the ";" that closes company i's
# category segment (an exclusive end index). One entry is produced per row of df.
start = 0
end_index = text_nlp_category.index(";", start) + 1
index_end_category = [end_index]

z = 1
while z < len(df['category']):
    # Search for the next separator, starting right after the previous one.
    start = end_index
    end_index = text_nlp_category.index(";", start) + 1
    index_end_category.append(end_index)
    z += 1


In [24]:
j = 0

# Token texts of the parsed niche string; every company's segment ends with ";".
text_nlp_niche = [token.text for token in nlp_niche]

# Scratch variables initialised by this cell.
text_list = []
list_nlp_niche = []
s = -1

# index_end_niche[i] is the position just AFTER the ";" that closes company i's
# niche segment (an exclusive end index). One entry is produced per row of df.
start = 0
end_index = text_nlp_niche.index(";", start) + 1
index_end_niche = [end_index]

z = 1
while z < len(df['niche']):
    # Search for the next separator, starting right after the previous one.
    start = end_index
    end_index = text_nlp_niche.index(";", start) + 1
    index_end_niche.append(end_index)
    z += 1



In [25]:
# Record, for every company, where its description segment ends.
#   index_end_description[i]   -> token position of the ";" that closes segment i,
#                                 counted inside the chunk the segment lives in
#   index_group_description[i] -> index of that chunk in nlp_description
#
# Fix: when moving on to a new chunk, the first boundary used to be stored as
# index(";") + 1, i.e. one token past the separator, whereas every other boundary
# (and the `start = end_index + 1` step below) uses the position of the ";" itself.
# The first boundary of a new chunk now follows the same convention.
j=0


text_nlp_description = [el.text for el in nlp_description[j]]

start=0
end_index = text_nlp_description[start:].index(";")
text_list=[]


index_end_description = []
index_group_description = []
list_nlp_description = []
z=0
s=-1
while z<len(df['description']) and j<len(nlp_description):
    text_list=[]
    index_end_description.append(end_index)
    index_group_description.append(j)
    
    if end_index+1 >= len(nlp_description[j]):
        # The separator just recorded is the last token of this chunk: go to the next one.
        j+=1
        start=0
        if j < len(nlp_description):
            text_nlp_description = [el.text for el in nlp_description[j]]
            # Position of the first ";" in the new chunk (no +1, see header comment).
            end_index= text_nlp_description[start:].index(";")
    else:
        # Continue after the separator; the slice offset plus one lands on the next ";".
        start=end_index+1
        end_index+= text_nlp_description[start:].index(";")+1
    z+=1

This is our function for generating n-grams; an n-gram is only kept if it is present in our `embeddings_index`.

In [26]:
def treat_composite_words(element):
    """Break unknown underscore-joined terms into their known parts.

    Parameters
    ----------
    element : iterable of (str, Any)
        Pairs whose first item is the term; the second item is carried along
        unchanged.

    Returns
    -------
    list
        Pairs without an underscore in the term, or whose term is in
        ``embeddings_index``, are kept as they are. Every other pair is replaced
        by one ``(part, second_item)`` pair per underscore-separated part that is
        longer than one character, present in ``embeddings_index`` and not a
        stop word.
    """
    result = []
    for pair in element:
        term = pair[0]
        if "_" not in term or term in embeddings_index:
            result.append(pair)
            continue
        for part in term.split("_"):
            if len(part) <= 1:
                continue
            if part in embeddings_index and part not in stop_words.STOP_WORDS:
                result.append((part, pair[1]))
    return result

In [27]:
from nltk.util import ngrams
 
def new_ngrams(element, n, dict_excl=None, dict_incl=None, row_no2=None):
    """Yield the n-grams of a token sequence that exist in ``embeddings_index``.

    Tokens of an n-gram are joined with ``_``. An n-gram is skipped when

    * all of its tokens are stop words, or
    * both ``dict_excl`` and ``dict_incl`` are given and one of its tokens is in
      ``dict_excl`` while being neither a stop word nor in ``dict_incl``, or
    * the joined n-gram is not in ``embeddings_index``, or
    * ``dict_excl`` is given and contains the joined n-gram.

    Parameters
    ----------
    element : sequence of str
        The tokens to build n-grams from.
    n : int
        Size of the n-grams.
    dict_excl : container of str, optional
        Terms to exclude.
    dict_incl : container of str, optional
        Terms that override ``dict_excl`` at token level.
    row_no2 : Any, optional
        Not used by this function.

    Yields
    ------
    str
        Each accepted n-gram, in sequence order.
    """
    has_exclusions = dict_excl is not None
    filter_tokens = has_exclusions and dict_incl is not None

    for gram in ngrams(element, n):
        if all(token in stop_words.STOP_WORDS for token in gram):
            continue

        if filter_tokens and any(
            token in dict_excl
            and token not in stop_words.STOP_WORDS
            and token not in dict_incl
            for token in gram
        ):
            continue

        candidate = "_".join(gram)
        if candidate not in embeddings_index.keys():
            continue
        if has_exclusions and candidate in dict_excl:
            continue

        yield candidate


       

### Label Tokenization and Cleaning Process

In this step, we perform **tokenization and preprocessing** on the label text to prepare it for further analysis and embedding.

#### Cleaning Steps:
- We remove **punctuation**, except for **hyphens (`-`, which are converted to underscores `_`)** and **apostrophes (`'`)** when they appear **within** words.
- Apostrophes are **removed** if they appear at the **start or end** of a word (e.g., `'data` → `data`).
- We also apply **lemmatization** to reduce words to their base forms (e.g., `"services"` → `"service"`).
- After that, we apply **n-gram generation** to capture important multi-word expressions.

#### TF-IDF Filtering:
Once tokenization is complete, we compute a **TF-IDF matrix** over the processed label terms.

To improve the quality of the label embeddings and reduce noise, we filter out **generic or low-signal terms**. Specifically, we eliminate terms that have a **TF-IDF score lower than 0.3**, as they are considered too common or uninformative for distinguishing between labels.


In [28]:
import nltk
from nltk.corpus import wordnet as wn
from pandarallel import pandarallel

# Load the taxonomy; its 'label' column holds the candidate labels.
our_classes = pd.read_csv("../inputs/insurance_taxonomy - insurance_taxonomy.csv")
# Reference to the 'label' column, taken before the column is reassigned further down.
our_classes_vector = our_classes['label']
# Start pandarallel with its default settings.
pandarallel.initialize()

def our_classes_lemmitize(tokens):
    """Turn the words of one label into embedding-ready terms plus n-grams.

    The words are joined, lower-cased, hyphens become underscores, and the text
    is tokenised with ``nlp``. Each token's text (not its lemma) is cleaned with
    ``replace_word`` and then:

    * dropped if it is a stop word or the single letter ``"s"``;
    * kept as is if it is in ``embeddings_index``;
    * otherwise kept with its underscores removed if that form is in
      ``embeddings_index``;
    * otherwise replaced by those of its underscore-separated parts that are in
      ``embeddings_index`` and are neither stop words nor ``"s"``.

    NOTE: the stop-word set is ``stop_words.STOP_WORDS`` itself, not a copy.
    ``"new"``, ``"single"`` and ``"multi"`` are added to it and ``"well"`` is
    removed from it in place, so every other user of that set sees the change.

    Parameters
    ----------
    tokens : list of str
        The words of one label.

    Returns
    -------
    list of str
        The kept terms followed by their 2-grams and 3-grams from ``new_ngrams``.
    """
    merged_token = " ".join(tokens).lower().replace("-", "_").replace(" _ ", " ")
    parsed = nlp(merged_token)
    cleaned_words = [replace_word(token.text.lower()) for token in parsed]

    # Alias of the shared set - the updates below are global (see docstring).
    stop_set = stop_words.STOP_WORDS
    stop_set.update(("new", "single", "multi"))
    stop_set.discard("well")

    def is_noise(term):
        return term in stop_set or term == "s"

    kept = []
    for word in cleaned_words:
        if is_noise(word):
            continue
        if word in embeddings_index:
            kept.append(word)
            continue
        fused = word.replace("_", "")
        if fused in embeddings_index:
            kept.append(fused)
            continue
        kept.extend(
            part
            for part in word.split("_")
            if not is_noise(part) and part in embeddings_index
        )

    return kept + list(new_ngrams(kept, 2)) + list(new_ngrams(kept, 3))

INFO: Pandarallel will run on 11 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [29]:
# string.punctuation minus - ' , . /  (the "’" replace is a no-op: string.punctuation is ASCII only).
# The same expression is already evaluated in the company preprocessing cell above.
all_punctuation_except_hyphen = string.punctuation.replace("-","").replace("'","").replace("’","").replace(",", "").replace(".","").replace("/", "")


In [30]:
# Positional row number of every label.
our_classes['rows_number'] = range(len(our_classes['label']))

# Strip the unwanted punctuation from the labels, then split them into words.
label_punct_pattern = '[{}]'.format(all_punctuation_except_hyphen)
our_classes['label'] = (
    our_classes['label']
    .replace(label_punct_pattern, '', regex=True)
    .str.split()
)


def _keep_label_token(token):
    """Keep non-numeric tokens whose apostrophe use (if any) is acceptable.

    An apostrophe is acceptable at the first or last position of the token or
    as part of ``'s``; tokens without an apostrophe always pass this check.
    """
    apostrophe_ok = (
        "'" not in token
        or token[0] == "'"
        or token[-1] == "'"
        or "'s" in token
    )
    return bool(token) and apostrophe_ok and not token.isnumeric()


our_classes['label'] = [
    [token for token in label_tokens if _keep_label_token(token)]
    for label_tokens in our_classes['label']
]

# Convert every label's words into embedding-ready terms (plus n-grams).
our_labels_final = our_classes.apply(lambda row: our_classes_lemmitize(row['label']), axis=1)
our_classes['new_label'] = our_labels_final




In [31]:
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def _drop_leading_double_underscore(term):
    """Remove a leading "__" from terms that are longer than two characters."""
    return term[2:] if len(term) > 2 and term.startswith("__") else term


our_classes['new_label'] = [
    [_drop_leading_double_underscore(term) for term in label_word]
    for label_word in our_classes['new_label']
]
# Term -> occurrence count for every label.
our_classes['new_label2'] = list(map(Counter, our_classes['new_label']))

# Sparse label x term count matrix and the term belonging to each column.
v_label = DictVectorizer(sparse=True)
X_label = v_label.fit_transform(our_classes['new_label2'])
feature_names_label = v_label.get_feature_names_out()
X_csr_label = csr_matrix(X_label)

# TF-IDF weights computed from the counts (smoothed IDF).
tfidf_transformer_label = TfidfTransformer(use_idf=True, smooth_idf=True)
tf_idf_label = tfidf_transformer_label.fit_transform(X_label)


In [32]:
# Lookup tables built from the label TF-IDF matrix:
#   match_label_with_tf_idf_valuess[(term, k)] -> TF-IDF of `term` in label row k (rounded to 5 decimals)
#   match_sum_with_tf_idf_valuess[term]        -> sum of those values over all labels containing `term`
#   match_no_with_tf_idf_valuess[term]         -> number of labels containing `term`
#
# Fix: the membership test used to check the tuple (term, k) against
# match_sum_with_tf_idf_valuess, which is keyed by the term alone. The test was
# therefore always true, so the sum was overwritten and the count reset to 1 for
# every occurrence. Sum and count are now accumulated per term.
match_label_with_tf_idf_valuess = {}
match_sum_with_tf_idf_valuess = {}
match_no_with_tf_idf_valuess = {}

for k in range(0, len(our_classes['new_label2'])):
    # Columns with a non-zero count are exactly the terms that occur in label k.
    row_tfidf = X_label[k].toarray()[0]
    nonzero_indices = np.nonzero(row_tfidf)[0]
    g=0
    for j in nonzero_indices:
        g+=1
        term = feature_names_label[j]
        a = round(tf_idf_label[k,j],5)
        match_label_with_tf_idf_valuess[(term, k)] = a

        match_sum_with_tf_idf_valuess[term] = match_sum_with_tf_idf_valuess.get(term, 0) + a
        match_no_with_tf_idf_valuess[term] = match_no_with_tf_idf_valuess.get(term, 0) + 1



In [33]:
def get_plural_tf_idf_value(word, idx):
    """Return the TF-IDF score used to judge ``word`` in label row ``idx``.

    If a plural form of the word (``word + "s"``, otherwise ``word + "es"``) is a
    known term, the mean TF-IDF of that plural over all labels is returned.
    Otherwise the word's own TF-IDF in row ``idx`` is returned.

    Fixes compared with the previous version:

    * the ``"es"`` branch tested the tuple-keyed ``match_label_with_tf_idf_valuess``
      with a plain string, so it could never be taken; it now tests
      ``match_sum_with_tf_idf_valuess`` like the ``"s"`` branch;
    * the ``"es"`` branch used floor division (``//``), which turns any score
      below 1 into 0; it now uses true division.

    Parameters
    ----------
    word : str
        The term to score.
    idx : int
        Positional index of the label row the term belongs to.

    Returns
    -------
    float
        The score as described above.

    Raises
    ------
    KeyError
        If no plural form is known and ``(word, idx)`` has no recorded score.
    """
    for plural in (word + "s", word + "es"):
        if plural in match_sum_with_tf_idf_valuess:
            return match_sum_with_tf_idf_valuess[plural] / match_no_with_tf_idf_valuess[plural]
    return match_label_with_tf_idf_valuess[(word, idx)]

In [34]:
# Terms that are always kept in a label, regardless of their TF-IDF score
# (see the label filter in the next cell).
generic_core_terms = {
    "operations",
    'manufacturing',
    'application',
    'construction',
    'consulting',
    'planning',
    'processing',
    'development',
    'management',
    'installation',
    'maintenance'

}

In [35]:
def _keep_label_term(term, label_terms, label_position):
    """Decide whether ``term`` stays in the label at ``label_position``.

    A term stays when it is one of ``generic_core_terms``, when its score from
    ``get_plural_tf_idf_value`` is at least 0.3, or when it is the label's only
    term. Empty strings are never kept.
    """
    informative = (
        term in generic_core_terms
        or get_plural_tf_idf_value(term, label_position) >= 0.3
        or len(label_terms) == 1
    )
    return bool(informative and term)


filtered_labels = []
for label_position, (_row_index, row) in enumerate(our_classes.iterrows()):
    label_terms = row['new_label']
    filtered_labels.append(
        [term for term in label_terms if _keep_label_term(term, label_terms, label_position)]
    )
our_classes['new_label'] = filtered_labels

In [36]:
# Placeholder; reassigned with the real embedding matrix at the end of this cell.
label_embeddings = []

# `global` at module level has no effect; rows_ is not assigned anywhere in this cell.
global rows_
def get_sentence_embeddings_labels(word_list):
    
    embeddings_ = [
    embeddings_index.get(word,np.zeros(300))
        for word in word_list
    ]

    embeddings_ = np.mean(embeddings_, axis=0)
    
    if (len(embeddings_)) < 400:
        embeddings_= np.pad(embeddings_, (0, (400-len(embeddings_))), mode = 'constant')

    return embeddings_

# One pooled label vector per row of `our_classes`, stacked into a 2-D array.
_label_vector_series = our_classes.apply(
    lambda label_row: get_sentence_embeddings_labels(label_row['new_label']),
    axis=1,
)
label_embeddings = np.array(_label_vector_series.to_list())



### Filtering Irrelevant Terms, Locations, and Person Names

To improve the quality of our label and company text representations, we also remove **irrelevant terms**, such as **location names** and **person names**, which are not useful for our classification task.

#### Named Entity Filtering:
We use **spaCy's multilingual model** (`xx_ent_wiki_sm`) to detect and remove:
- **Named entities** identified as locations or persons, across multiple languages

#### Additional Heuristics:
- If a term is identified as a **person or location** and **contains a hyphen** (e.g., `"Cafe-Cafe"`), we double-check its relevance. These might be **misclassified**, so we verify them against our embeddings.
- We check whether each term is present in the **`embeddings_index`**:
  - If it is **not found**, it's assumed to be irrelevant or out-of-vocabulary.
  - If it is found but has a **low cosine similarity score** to any label embedding, we also filter it out.

#### Storage:
All terms that meet the criteria for removal are added to a set called:
```python
is_person_or_location_or_irrevalent_terms
```

In [37]:
# Load spaCy's multilingual NER pipeline and run it over the lower-cased title text.
# NOTE(review): assumes `spacy` is imported and `title_company_complete_full` is a
# single string built in an earlier cell - confirm.
nlp2 = spacy.load('xx_ent_wiki_sm') 
nlp_title_ours2 = nlp2(title_company_complete_full.lower())

### Filtering Irrelevant Named Entities

We filter out terms that may be wrongly assigned as named entities (`ORG`, `PRODUCT`) but are irrelevant for our labels.  
Words are checked using their embeddings: if a word has low similarity to label embeddings or is unknown, it is added to a set of irrelevant terms.


In [38]:
def _title_term_is_irrelevant(term):
    """Tell whether one word taken from a title entity should be discarded.

    A word is discarded when it is a single character, when it has no entry in
    `embeddings_index`, or when its best cosine similarity against every row
    of `label_embeddings` is below 0.2.
    """
    if len(term) == 1:
        return True
    if term not in embeddings_index.keys():
        return True

    vector = embeddings_index[term.lower()]
    if len(vector) < 400:
        # Right-pad with zeros so the vector matches the label embedding width.
        vector = np.pad(vector, (0, 400 - len(vector)), mode='constant')

    scores = cosine_similarity(np.array(vector).reshape(1, -1), label_embeddings)
    return max(scores[0]) < 0.2


is_person_or_location_or_irrevalent_terms = set()

for entity in nlp_title_ours2.ents:
    if entity.label_ in ("ORG", "PRODUCT"):
        # Organisation / product names are examined word by word.
        candidate_terms = entity.text.split(" ")
    elif len(entity.text.split("-")) > 1:
        # Hyphenated entities of any other type are split on hyphens and
        # whitespace and each part is examined on its own.
        candidate_terms = re.split(r'[-\s]+', entity.text)
    else:
        # Any other entity is discarded as a whole.
        is_person_or_location_or_irrevalent_terms.add(entity.text.lower())
        continue

    is_person_or_location_or_irrevalent_terms.update(
        term.lower() for term in candidate_terms if _title_term_is_irrelevant(term)
    )





### Enhanced Filtering of Irrelevant Named Entities

We refine entity filtering by analyzing both individual words and compound phrases from business-related tags.  
The goal is to discard terms that are irrelevant or weakly associated with our label embeddings.


In [39]:
# Terms (single words and "_"-joined compounds) found in business tags that
# should be ignored downstream.
is_person_or_location_or_irrevalent_terms2 = set()
# Border-line terms (best label similarity between 0.25 and 0.3, or DATE
# entities that still resemble a label).
# NOTE(review): how this set is consumed is outside this cell - confirm.
ignore_in_compond_word = set()

for nlp_bt in nlp_business_tags:
    for i in nlp_bt.ents:

        if i.label_ == "ORG" or i.label_ == "PRODUCT":
            # Organisation / product names: drop single characters and every
            # word that has no embedding.
            for sp in i.text.split(" "):
                if len(sp) == 1:
                    is_person_or_location_or_irrevalent_terms2.add(sp.lower())
                    continue
                if sp not in embeddings_index.keys():
                    is_person_or_location_or_irrevalent_terms2.add(sp.lower())

        else:
            # Multi-word / hyphenated entities are split into words, except
            # numeric and temporal ones. The pipeline behind these docs is
            # loaded outside this view, but the other labels tested here (GPE,
            # DATE, ORDINAL) belong to the scheme in which numbers are labelled
            # "CARDINAL". The previous check only listed "CARD", so cardinals
            # were never excluded; "CARD" is kept for backward compatibility.
            if (len(i.text.split("-")) > 1 or len(i.text.split(" ")) > 1) and i.label_ not in ("CARD", "CARDINAL", "ORDINAL", "TIME", "DATE"):
                split_words = re.split(r'[-\s]+', i.text)
                for sp in split_words:

                    if sp in embeddings_index.keys():
                        if len(sp) == 1:
                            is_person_or_location_or_irrevalent_terms2.add(sp.lower())
                            continue
                        word_e = embeddings_index[sp.lower()]

                        if (len(word_e)) < 400:
                            word_e = np.pad(word_e, (0, (400 - len(word_e))), mode='constant')

                        word_e = np.array(word_e).reshape(1, -1)
                        sim = cosine_similarity(word_e, label_embeddings)

                        # Weakly related word: drop it only if, taken alone, it
                        # is still tagged as an entity other than ORG / PRODUCT.
                        if max(sim[0]) < 0.35:
                            word_ctx = nlp(sp)
                            if len(word_ctx.ents) > 0 and not (word_ctx.ents[0].label_ == "ORG" or word_ctx.ents[0].label_ == "PRODUCT"):
                                is_person_or_location_or_irrevalent_terms2.add(sp.lower())

                    else:
                        # No embedding at all: treat as irrelevant.
                        is_person_or_location_or_irrevalent_terms2.add(sp.lower())
            else:
                # Single-token entities and numeric / temporal entities are
                # scored as a whole (unknown text scores 0).
                word_e = embeddings_index.get(i.text.lower(), np.zeros(300))

                if (len(word_e)) < 400:
                    word_e = np.pad(word_e, (0, (400 - len(word_e))), mode='constant')

                word_e = np.array(word_e).reshape(1, -1)
                sim = cosine_similarity(word_e, label_embeddings)
                pattern_float = r"^-?\d+\.\d+$"

                # Non-numeric, non-DATE entity that resembles a label: keep it.
                if max(sim[0]) >= 0.30 and (not i.text.isnumeric() and not re.match(pattern_float, i.text)) and i.label_ != "DATE":
                    continue

                # DATE entity that resembles a label: recorded, but still dropped below.
                if max(sim[0]) >= 0.30 and (not i.text.isnumeric() and not re.match(pattern_float, i.text)) and i.label_ == "DATE":
                    ignore_in_compond_word.add(i.text)

                is_person_or_location_or_irrevalent_terms2.add(i.text.lower().replace(" ", "_"))

                # NOTE(review): `new_ngrams` is defined outside this view;
                # presumably it yields "_"-joined 2-grams - confirm.
                list_text = list(new_ngrams(i.text.lower().replace("_", " ").replace("-", " ").split(" "), 2))
                for j in list_text:
                    if j not in embeddings_index.keys():
                        is_person_or_location_or_irrevalent_terms2.add(j.lower())
                        continue
                    word_e = embeddings_index[j.lower()]

                    if (len(word_e)) < 400:
                        word_e = np.pad(word_e, (0, (400 - len(word_e))), mode='constant')

                    word_e = np.array(word_e).reshape(1, -1)
                    sim = cosine_similarity(word_e, label_embeddings)

                    if max(sim[0]) < 0.3:
                        if (max(sim[0])) > 0.25:
                            ignore_in_compond_word.add(j.lower())

                        is_person_or_location_or_irrevalent_terms2.add(j.lower())
                    else:
                        if max(sim[0]) < 0.35:
                            word_ctx = nlp(j)

                            if len(word_ctx.ents) > 0 and not (word_ctx.ents[0].label_ == "PERSON" or word_ctx.ents[0].label_ == "ORG" or word_ctx.ents[0].label_ == "PRODUCT"):
                                is_person_or_location_or_irrevalent_terms2.add(j.lower())

                # Locations additionally get every individual word checked.
                if i.label_ == "LOC" or i.label_ == "GPE":
                    i_final = i.text.lower().replace(" ", "_")
                    for el in i_final.split("_"):
                        if el not in embeddings_index.keys():
                            is_person_or_location_or_irrevalent_terms2.add(el.lower())
                            continue
                        word_e = embeddings_index[el.lower()]

                        if (len(word_e)) < 400:
                            word_e = np.pad(word_e, (0, (400 - len(word_e))), mode='constant')

                        word_e = np.array(word_e).reshape(1, -1)
                        sim = cosine_similarity(word_e, label_embeddings)
                        if max(sim[0]) < 0.3:
                            if (max(sim[0])) > 0.25:
                                ignore_in_compond_word.add(el.lower())

                            is_person_or_location_or_irrevalent_terms2.add(el.lower())
                        else:
                            if max(sim[0]) < 0.35:
                                word_ctx = nlp(el)

                                if len(word_ctx.ents) > 0 and not (word_ctx.ents[0].label_ == "PERSON" or word_ctx.ents[0].label_ == "ORG" or word_ctx.ents[0].label_ == "PRODUCT"):
                                    is_person_or_location_or_irrevalent_terms2.add(el.lower())





### Filtering Irrelevant Terms from Descriptions

We apply the same logic to entity extraction from **description fields**, ensuring only meaningful terms are kept for label prediction.

In [40]:
# Same entity filtering as for the business tags, applied to the parsed
# descriptions; results go into the same two sets.
for nlp_desc in nlp_description:
    for i in nlp_desc.ents:

        if i.label_ == "ORG" or i.label_ == "PRODUCT":
            # Organisation / product names: drop single characters and every
            # word that has no embedding.
            for sp in i.text.split(" "):

                if len(sp) == 1:
                    is_person_or_location_or_irrevalent_terms2.add(sp.lower())
                    continue
                if sp not in embeddings_index.keys():
                    is_person_or_location_or_irrevalent_terms2.add(sp.lower())

        else:
            # Hyphenated entities are split into words, except numeric and
            # temporal ones. The pipeline behind these docs is loaded outside
            # this view, but the other labels tested here (GPE, DATE, ORDINAL)
            # belong to the scheme in which numbers are labelled "CARDINAL".
            # The previous check only listed "CARD", so cardinals were never
            # excluded; "CARD" is kept for backward compatibility.
            if len(i.text.split("-")) > 1 and i.label_ not in ("CARD", "CARDINAL", "ORDINAL", "TIME", "DATE"):
                split_words = re.split(r'[-\s]+', i.text)
                for sp in split_words:
                    if sp in embeddings_index.keys():
                        if len(sp) == 1:
                            is_person_or_location_or_irrevalent_terms2.add(sp.lower())
                            continue
                        word_e = embeddings_index[sp.lower()]

                        if (len(word_e)) < 400:
                            word_e = np.pad(word_e, (0, (400 - len(word_e))), mode='constant')

                        word_e = np.array(word_e).reshape(1, -1)

                        sim = cosine_similarity(word_e, label_embeddings)
                        if max(sim[0]) < 0.3:
                            if (max(sim[0])) > 0.25:
                                ignore_in_compond_word.add(sp.lower())
                            is_person_or_location_or_irrevalent_terms2.add(sp.lower())
                        else:
                            if max(sim[0]) < 0.35:
                                word_ctx = nlp(sp)
                                if len(word_ctx.ents) > 0 and not (word_ctx.ents[0].label_ == "PERSON" or word_ctx.ents[0].label_ == "ORG" or word_ctx.ents[0].label_ == "PRODUCT"):
                                    # NOTE(review): `sim1` is not read anywhere in this cell.
                                    sim1 = cosine_similarity(word_e, label_embeddings)
                                    is_person_or_location_or_irrevalent_terms2.add(sp.lower())

                    else:
                        # No embedding at all: treat as irrelevant.
                        is_person_or_location_or_irrevalent_terms2.add(sp.lower())
            else:
                # Non-hyphenated entities and numeric / temporal entities are
                # scored as a whole (unknown text scores 0).
                word_e = embeddings_index.get(i.text.lower(), np.zeros(300))

                if (len(word_e)) < 400:
                    word_e = np.pad(word_e, (0, (400 - len(word_e))), mode='constant')

                word_e = np.array(word_e).reshape(1, -1)
                sim = cosine_similarity(word_e, label_embeddings)
                pattern_float = r"^-?\d+\.\d+$"
                # NOTE(review): `val` is not read anywhere in this cell.
                val = max(sim[0])

                # Non-numeric, non-DATE entity that resembles a label: keep it.
                if max(sim[0]) >= 0.30 and (not i.text.isnumeric() and not re.match(pattern_float, i.text)) and i.label_ != "DATE":
                    continue

                # DATE entity that resembles a label: recorded, but still dropped below.
                if max(sim[0]) >= 0.30 and (not i.text.isnumeric() and not re.match(pattern_float, i.text)) and i.label_ == "DATE":
                    ignore_in_compond_word.add(i.text)

                is_person_or_location_or_irrevalent_terms2.add(i.text.lower().replace(" ", "_"))

                # NOTE(review): `new_ngrams` is defined outside this view;
                # presumably it yields "_"-joined 2-grams - confirm.
                list_text = list(new_ngrams(i.text.lower().replace("_", " ").replace("-", " ").split(" "), 2))
                for j in list_text:
                    if j not in embeddings_index.keys():
                        is_person_or_location_or_irrevalent_terms2.add(j.lower())
                        continue
                    word_e = embeddings_index[j.lower()]
                    word_e_list = [np.pad(embeddings_index.get(wd, np.zeros(300)), (0, (400 - len(embeddings_index.get(wd, np.zeros(300))))), mode='constant') for wd in j.split("_")]

                    if (len(word_e)) < 400:
                        word_e = np.pad(word_e, (0, (400 - len(word_e))), mode='constant')

                    word_e = np.array(word_e).reshape(1, -1)
                    sim = cosine_similarity(word_e, label_embeddings)

                    # Find the component word of the n-gram that is closest to
                    # any label.
                    # NOTE(review): the result (`lit_g`) is not read anywhere in
                    # this cell - looks like leftover exploration; confirm
                    # before removing.
                    sim_full = cosine_similarity(word_e_list, label_embeddings)
                    sim_full_to_list = []
                    max_v = -1.0
                    index_v = -1
                    for idx, zzz in enumerate(sim_full):
                        cur_element_max = np.max(zzz)
                        if cur_element_max > max_v:
                            index_v = idx
                            max_v = cur_element_max

                    if max_v > 0.4:
                        lit_g = j.split("_")[index_v]

                    if max(sim[0]) < 0.3:
                        if (max(sim[0])) > 0.25:
                            ignore_in_compond_word.add(j.lower())

                        is_person_or_location_or_irrevalent_terms2.add(j.lower())
                    else:
                        if max(sim[0]) < 0.35:
                            word_ctx = nlp(j)
                            if len(word_ctx.ents) > 0 and not (word_ctx.ents[0].label_ == "PERSON" or word_ctx.ents[0].label_ == "ORG" or word_ctx.ents[0].label_ == "PRODUCT"):
                                is_person_or_location_or_irrevalent_terms2.add(j.lower())

                # Locations additionally get every individual word checked.
                if i.label_ == "LOC" or i.label_ == "GPE":

                    i_final = i.text.lower().replace(" ", "_")
                    for el in i_final.split("_"):
                        if el not in embeddings_index.keys():
                            is_person_or_location_or_irrevalent_terms2.add(el.lower())
                            continue
                        word_e = embeddings_index[el.lower()]

                        if (len(word_e)) < 400:
                            word_e = np.pad(word_e, (0, (400 - len(word_e))), mode='constant')

                        word_e = np.array(word_e).reshape(1, -1)
                        sim = cosine_similarity(word_e, label_embeddings)
                        if max(sim[0]) < 0.3:
                            if (max(sim[0])) > 0.25:
                                ignore_in_compond_word.add(el.lower())

                            is_person_or_location_or_irrevalent_terms2.add(el.lower())
                        else:
                            if max(sim[0]) < 0.35:

                                word_ctx = nlp(el)
                                if len(word_ctx.ents) > 0 and not (word_ctx.ents[0].label_ == "PERSON" or word_ctx.ents[0].label_ == "ORG" or word_ctx.ents[0].label_ == "PRODUCT"):

                                    is_person_or_location_or_irrevalent_terms2.add(el.lower())

In [41]:
# Parse the lower-cased title text with the main `nlp` pipeline (loaded outside this view).
nlp_title_ours = nlp(title_company_complete_full.lower())

### Description Text Cleaning with Named Entity Filtering

When processing the **company descriptions**, we apply a more refined filtering step to remove **unnecessary or irrelevant terms** that do not contribute meaningful information for classification.

#### Named Entity Filtering:
Using spaCy's entity recognition, we filter out tokens that are identified as any of the following entity types:
- **Person**
- **Location**
- **Geopolitical Entity (GPE)**
- **Cardinal** (e.g., numbers like "three", "100")
- **Ordinal** (e.g., "first", "second")

These entities are often not useful for identifying the company’s industry or services, and are removed to **clean the text** and reduce semantic noise.

This helps ensure that the remaining tokens focus on **core business-related terms**, improving the quality of embeddings and downstream label matching.

In [42]:
# Lower-cased words of every description entity that is neither ORG nor
# PRODUCT; the value is a constant 1, so the dict acts as a membership lookup.
dict_gpe_company_elements = {}

for description_doc in nlp_description:
    for entity in description_doc.ents:
        if entity.label_ in ("ORG", "PRODUCT"):
            continue
        for token_text in entity.text.split(" "):
            dict_gpe_company_elements[token_text.lower()] = 1

In [43]:
def replace_word_2(text, text_original, next_word, next_word_original, previous_word, previous_word_original, row_no2=None):
    """Choose between the lemma and the surface form of a token.

    Args:
        text: lemma of the token.
        text_original: surface text of the token.
        next_word / next_word_original: lemma and surface text of the
            following token, or None.
        previous_word / previous_word_original: lemma and surface text of the
            preceding token, or None.
        row_no2: unused.

    Returns:
        `text` unchanged when it has at most one character. Otherwise both
        forms are stripped of one leading / trailing hyphen and of apostrophes
        (unless the lemma contains "'s"), and the surface form is returned
        when it is in `embeddings_index` and either
          * it forms a known "surface_surface" bigram with a neighbour while
            the "lemma_lemma" bigram is unknown, or
          * the lemma ends in "um" and the surface form ends in "a".
        Failing that, the lemma is returned if it is in `embeddings_index`,
        else the surface form.
    """
    if len(text) <= 1:
        return text

    if text[0] == '-':
        text, text_original = text[1:], text_original[1:]
    if text[-1] == "-":
        text, text_original = text[:-1], text_original[:-1]

    if "'" in text and "'s" not in text:
        text_original = text_original.replace("'", "")
        text = text.replace("'", "")

    vocab = embeddings_index.keys()

    if text_original in vocab:
        surface_bigram_with_next = (
            next_word is not None
            and next_word_original in vocab
            and f"{text}_{next_word}" not in vocab
            and f"{text_original}_{next_word_original}" in vocab
        )
        if surface_bigram_with_next:
            return text_original

        surface_bigram_with_previous = (
            previous_word is not None
            and previous_word_original in vocab
            and f"{previous_word}_{text}" not in vocab
            and f"{previous_word_original}_{text_original}" in vocab
        )
        if surface_bigram_with_previous:
            return text_original

        # Lemma ends in "um" while the surface form ends in "a": keep the surface form.
        if len(text) > 2 and text.endswith("um") and text_original[-1] == "a":
            return text_original

    return text if text in vocab else text_original

**Data Cleaning**

To ensure consistency and avoid issues during processing, we removed all rows containing **missing values**.  
Additionally, we excluded the `sector` column from our analysis, as it provided minimal value for the classification task and did not contribute meaningful signals for label assignment.

In [44]:
# (column name, position) pairs for the four text columns.
colums = list(zip(("description", "business_tags", "category", "niche"), range(4)))

**Token Filtering**

During tokenization, we retained only **adjectives**, **verbs**, and **nouns** to improve clarity and reduce noise.  
This focused filtering ensures that we capture the most meaningful and descriptive terms, while eliminating less relevant tokens such as stopwords, conjunctions, or determiners.

In [45]:
def replace_tag(element, tokens, original_text, row_no):
    """Map a fine-grained POS tag to a one-letter word class.

    Args:
        element: the word the tag belongs to.
        tokens: fine-grained POS tag string (e.g. 'NN', 'VBG').
        original_text: unused.
        row_no: unused.

    Returns:
        'a' for 'JJ' and for 'CD' words containing a letter, 'n' for nouns and
        for purely non-alphabetic 'CD' words, 'v' for the listed verb tags.
        For any other tag: 'n' if the word is missing from `embeddings_index`,
        otherwise 'z'.
    """
    if tokens == 'CD':
        return 'a' if any(ch.isalpha() for ch in element) else 'n'
    if tokens == 'JJ':
        return 'a'
    if tokens in ('NN', 'NNS', 'NNP', 'NNPS'):
        return 'n'
    if tokens in ('VB', 'VBG', 'VBD', 'VBZ', 'VBN', 'VBP'):
        return 'v'
    # Any other tag: words without an embedding are treated as nouns.
    return 'z' if element in embeddings_index.keys() else 'n'

In [46]:
import faiss
def remove_irrelevalent_ngrams_with_stop_words(wd):
    """Decide whether an "_"-joined n-gram should be kept.

    Returns False when the first component is a stop word, True when no
    component is a stop word. Otherwise the n-gram is kept only if its
    embedding has a cosine similarity of at least 0.30 with some row of
    `label_embeddings`.

    Raises:
        KeyError: in the last case, if `wd.lower()` has no embedding.
    """
    parts = wd.split("_")
    if parts[0] in stop_words.STOP_WORDS:
        return False
    if all(part not in stop_words.STOP_WORDS for part in parts):
        return True

    query = embeddings_index[wd.lower()]
    if len(query) < 400:
        query = np.pad(query, (0, 400 - len(query)), mode='constant')
    query = query.reshape(1, -1).astype(np.float32, order='C')
    faiss.normalize_L2(query)

    label_matrix = np.array(label_embeddings, dtype=np.float32, order='C')
    faiss.normalize_L2(label_matrix)

    # Both sides were normalised just above, so the dot product is the cosine similarity.
    best_score = max(np.dot(query, label_matrix.T)[0])
    return bool(best_score >= 0.30)

### Company Text Cleaning and Filtering

Just like we did for the labels, we also applied a similar cleaning process to the **company descriptions**. To make this more efficient, we used **index and end lists** to manage the token positions for each company, which helps us process only the relevant text segments efficiently.

#### Cleaning Steps:
- We removed **irrelevant terms** such as:
  - **Punctuation**
  - Previously identified terms stored in `dict_gpe_company_elements`
  - **Stopwords** (common words with little semantic value)
- We kept only tokens that are:
  - **Adjectives**
  - **Nouns**
  - **Verbs**

This ensures we focus only on the most **informative words** in each company’s description, improving both the semantic quality and classification performance.

In [47]:
def get_relevant_words(current_list, threshold=0.3):
    """Keep only the words that are close enough to at least one label.

    Every word is looked up (lower-cased) in `embeddings_index`; unknown words
    get a zero vector. Vectors are zero-padded to the width of
    `label_embeddings`, and a word is kept when its best cosine similarity
    against the label embeddings is strictly greater than `threshold`.

    Args:
        current_list: list of words; it is not modified.
        threshold: minimum (exclusive) similarity for a word to be kept.
            Defaults to 0.3, the value that used to be hard-coded.

    Returns:
        A new list with the surviving words in their original order. An empty
        input returns an empty list (it used to raise inside the
        normalisation step).
    """
    if len(current_list) == 0:
        return []

    label_dim = label_embeddings[0].shape[0]
    padded_vectors = []
    for wd in current_list:
        vector = embeddings_index.get(wd.lower(), np.zeros(300))
        padded_vectors.append(np.pad(vector, (0, label_dim - len(vector)), mode='constant'))

    we1 = np.array(padded_vectors, dtype=np.float32, order='C')
    faiss.normalize_L2(we1)

    le1 = np.array(label_embeddings, dtype=np.float32, order='C')
    faiss.normalize_L2(le1)

    # Rows are normalised just above, so the dot product is the cosine similarity.
    best_scores = np.dot(we1, le1.T).max(axis=1)

    # Single pass instead of repeated list.remove(); the result is the same
    # because equal words share the same embedding and therefore the same score.
    return [wd for wd, score in zip(current_list, best_scores) if not score <= threshold]

### Handling Acronyms and Fallback Term Mapping

We use external assertions to improve term recognition:

- **Acronyms**: We detect acronyms by matching short forms to their expanded versions based on initials and verifying similarity using embeddings. Special cases like "abbreviation for", "short name for", and "acronym for" are also handled.
- **Plural and Fallback Mapping**: The `convertPluralToSingular` mapping (name slightly misleading) is used not only for converting plurals to singulars but also as a fallback to find alternate forms of words missing from the embeddings.
- **Fallback for Missing Terms**: If a word is not found in the embeddings, we first attempt to map it using `convertPluralToSingular` before further processing.

This enhances entity extraction by recovering better word matches when embeddings alone are insufficient.

In [48]:
import csv
import time
antonyms_words = set()
# FormOf mapping: first term of each "/r/FormOf" assertion -> second term.
convertPluralToSingular = {}


def _embedding_dot(term_a, term_b):
    """Dot product (as a 1x1 array) of the embeddings of two known terms."""
    return np.dot(embeddings_index[term_a].reshape(1, -1), embeddings_index[term_b].reshape(1, -1).T)


with open("../inputs/filtered_assertions.txt") as f:
    rows = f.read().split("\n")
    for row in rows:
        if row == "":
            continue
        # Each row is space separated: relation, first term, second term.
        words = row.split(" ")
        relation = words[0]
        vocab = embeddings_index.keys()

        if relation == "/r/IsA":
            word1, word2 = words[1], words[2]
            parts = word2.split("_")

            # Acronym candidates: a 2-3 character, non-numeric first term
            # paired with a multi-word second term.
            if len(word1) in (2, 3) and not word1.isnumeric() and len(parts) > 1:
                third = parts[2] if len(word1) == 3 and len(parts) > 2 else None

                if (
                    word1 in vocab
                    and word2 in vocab
                    and parts[0][0] == word1[0]
                    and parts[1][0] == word1[1]
                    and (third is None or third[0] == word1[2])
                ):
                    # Initials match: accept when the embeddings also agree.
                    if _embedding_dot(word1, word2) > 0.5:
                        acronym_dict[word1] = word2
                elif word1 in vocab:
                    # Explicit "... for <expansion>" phrasings, tried in order;
                    # only the first applicable one is considered.
                    for marker in ("abbreviation_for_", "short_name_for_", "acronym_for_"):
                        pieces = word2.split(marker)
                        if len(pieces) > 1 and pieces[1] in vocab:
                            expansion = pieces[1]
                            if _embedding_dot(word1, expansion) > 0.5:
                                acronym_dict[word1] = expansion
                            break

        if relation == "/r/FormOf":
            convertPluralToSingular[words[1]] = words[2]
        

### Token Processing and Lemmatization

The `lemmatize_f` function processes text from different columns (description, business tags, category, niche) to create clean, relevant token sets for further analysis.

### Main Steps

- **Lemmatization**: Extract base forms of words using spaCy, filtering by POS tags (`noun`, `adjective`, `verb`).
- **Sentence Splitting** (for descriptions): Focuses only on the first sentence for key term extraction.
- **Bigram and Trigram Generation**: Creates 2-word and 3-word combinations while filtering out irrelevant terms.
- **Acronym and Composite Handling**: Replaces detected acronyms and treats compound words consistently.
- **Stopword and Irrelevant Term Removal**: Filters out known irrelevant, short, numeric, and stopword tokens.
- **First Sentence Extraction**: For descriptions, focuses on words from the first sentence after the title, assuming they represent the most important content. These important words are specifically filtered and saved (in `most_important_words` / `elements_add_important_words`).

- **Fallback Corrections**:
  - Corrects British vs American spelling differences (`er` ↔ `re`, `ou` ↔ `o`).
  - Uses `convertPluralToSingular` for additional fallback mapping if needed.
- **Final Output**: Returns a frequency counter of the cleaned and corrected tokens.

### Goal

Extract meaningful, context-aware tokens that are robust across different text fields for downstream tasks like classification or matching.

In [49]:
# Create the lemmatized business-tags column as a copy of the raw column.
# NOTE(review): presumably overwritten later with the lemmatized tokens - confirm.
df2['new_business_tags_lemmatized'] = df2['business_tags'].copy()

In [50]:

# Leading-sentence word lists collected by lemmatize_f, one entry per processed
# description, in call order.
elements_add_important_words = []

# Position (within the description's lemma list) at which the leading sentence was cut.
# Only descriptions that contain a "." lemma add an entry.
end_of_first_sentence = []


def _pair_tokens_with_neighbours(tok, count, row_no):
    """Return ``(replaced_word, replaced_tag)`` pairs for the first ``count`` tokens of ``tok``.

    Each token is passed to ``replace_word_2`` together with the lemma/text of its
    following and preceding token; ``None`` is passed where no such neighbour exists
    (first token, last token, or a single-token sequence).
    """
    last = len(tok) - 1
    pairs = []
    for x in range(count):
        current = tok[x]
        next_lemma = tok[x + 1].lemma_ if x < last else None
        next_text = tok[x + 1].text if x < last else None
        prev_lemma = tok[x - 1].lemma_ if x > 0 else None
        prev_text = tok[x - 1].text if x > 0 else None
        pairs.append((
            replace_word_2(current.lemma_, current.text, next_lemma, next_text, prev_lemma, prev_text),
            replace_tag(current.lemma_, current.tag_, current.text, row_no),
        ))
    return pairs


def _clean_tagged_tokens(pairs, pattern_float):
    """Filter ``(word, tag)`` pairs down to content words and normalise them.

    A pair is kept when the lower-cased word is not a stop word, has an embedding,
    is tagged 'n', 'a' or 'v', is not a number (integer or float pattern) and is not
    listed in ``is_person_or_location_or_irrevalent_terms2``. Kept words are passed
    through ``replace_upper_word`` (upper-case words as they are, others lower-cased).
    """
    cleaned = []
    for pair in pairs:
        word, tag = pair[0], pair[1]
        lowered = word.lower()
        if lowered in stop_words.STOP_WORDS or lowered not in embeddings_index.keys():
            continue
        if not (tag == 'n' or tag == 'a' or tag == 'v'):
            continue
        if word.isnumeric() or re.match(pattern_float, word):
            continue
        if lowered in is_person_or_location_or_irrevalent_terms2:
            continue
        if word.isupper():
            cleaned.append(replace_upper_word([word], acronym_dict)[0])
        else:
            cleaned.append(replace_upper_word([lowered], acronym_dict)[0])
    return cleaned


def lemmatize_f(tokens, col, row_no, row_no2, description, business_tags, category):
    """Build the frequency counter of cleaned tokens and n-grams for one cell of ``df``.

    Parameters
    ----------
    tokens : the raw cell value. It is not read; the tokens are rebuilt from the
        pre-parsed documents (``nlp_description``, ``nlp_business_tags``,
        ``nlp_category``, ``nlp_niche``).
    col : "description", "business_tags" or "category"; any other value is handled
        as the niche column.
    row_no : positional row number used to slice the pre-parsed documents.
    row_no2 : forwarded unchanged to ``new_ngrams``.
    description, business_tags, category : values of the same row for the other
        columns. Their items extend the vocabulary used by the spelling / plural
        fallback (presumably already-lemmatized Counters when the columns are
        processed in that order -- confirm against ``colums``).

    Returns
    -------
    collections.Counter over tokens, bigrams, trigrams and, for descriptions, the
    relevant parts of out-of-vocabulary compound lemmas.

    Side effects
    ------------
    For ``col == "description"`` one entry is appended to
    ``elements_add_important_words`` and, when the description contains a "." lemma,
    one entry to ``end_of_first_sentence``.
    """
    pattern_float = r"^-?\d+\.\d+$"
    elements3 = None
    if col == "description":
        # A row starts at offset 0 of its parsed group when it is the first row or when
        # the group number changes; otherwise it starts right after the previous row's end.
        if row_no == 0 or (index_group_description[row_no]!=0 and index_group_description[row_no-1]!=index_group_description[row_no]):
            start_from_here = 0
        else:
            start_from_here = index_end_description[row_no-1]+1
        tok = nlp_description[index_group_description[row_no]][start_from_here:index_end_description[row_no]]

        elements = [i.lemma_ for i in tok]

        if "." in elements:
            first_period = elements.index('.')
            sentence_end = first_period
            # A "." directly followed by "," does not end the sentence: cut at the next
            # "." instead. The position is searched in the full list so that it is an
            # absolute index; without a second "." the whole description is kept.
            if first_period + 1 < len(elements) and elements[first_period + 1] == ',':
                try:
                    sentence_end = elements.index('.', first_period + 1)
                except ValueError:
                    sentence_end = len(elements)
            end_of_first_sentence.append(sentence_end)
            elements = elements[0:sentence_end]

        # Compound lemmas (joined with "_") that have no embedding of their own.
        elements3 = list(filter(lambda x: x not in embeddings_index.keys() and len(x.split("_"))>1, elements))

        elements2 = _pair_tokens_with_neighbours(tok, len(tok[0:len(elements)]), row_no)
        tokens_raw_elements = [pair[0] for pair in elements2]

        list_bigrams_elements = list(new_ngrams(tokens_raw_elements, 2, is_person_or_location_or_irrevalent_terms2,ignore_in_compond_word, row_no2))
        list_trigrams_elements = list(new_ngrams(tokens_raw_elements, 3, is_person_or_location_or_irrevalent_terms2,ignore_in_compond_word, row_no2))
        elements2 = treat_composite_words(elements2)

        # Break the out-of-vocabulary compounds into their parts and clean the parts.
        elements3 = [split_word for an_element in elements3 for split_word in an_element.split("_")]
        elements3 = list(map(lambda x: replace_upper_word([x], acronym_dict)[0] if x.isupper() else replace_upper_word([x.lower()], acronym_dict)[0], elements3))
        elements3= list(filter(lambda x: x.lower() not in is_person_or_location_or_irrevalent_terms2 and x.lower() not in stop_words.STOP_WORDS and not x.isnumeric(), elements3))

        elements2 = _clean_tagged_tokens(elements2, pattern_float)

        elements2.extend(list(list_bigrams_elements))
        elements2.extend(list(list_trigrams_elements))
        if elements3 != []:
            elements3 = get_relevant_words(elements3)
        elements2.extend(elements3)
        elements_add_important_words.append(elements2)

    elif col == "business_tags":
        if row_no == 0 or (index_group_business_tags[row_no]!=0 and index_group_business_tags[row_no-1]!=index_group_business_tags[row_no]):
            start_from_here = 0
        else:
            start_from_here = index_end_business_tags[row_no-1]+1
        tok = list(nlp_business_tags[index_group_business_tags[row_no]][start_from_here:index_end_business_tags[row_no]])
    elif col == "category":
        # Unlike the grouped documents above, the slice starts exactly at the previous end.
        start_from_here = 0 if row_no == 0 else index_end_category[row_no-1]
        tok = list(nlp_category[start_from_here:index_end_category[row_no]])
    else:
        start_from_here = 0 if row_no == 0 else index_end_niche[row_no-1]
        tok = list(nlp_niche[start_from_here:index_end_niche[row_no]])

    # From here on the whole cell is processed (for descriptions: not only the first sentence).
    tokens = _pair_tokens_with_neighbours(tok, len(tok), row_no)
    tokens_raw = [pair[0] for pair in tokens]

    list_bigrams = list(new_ngrams(tokens_raw, 2, is_person_or_location_or_irrevalent_terms2,ignore_in_compond_word, row_no2))
    list_trigrams = list(new_ngrams(tokens_raw, 3, is_person_or_location_or_irrevalent_terms2,ignore_in_compond_word, row_no2))

    list_bigrams = (list(filter(lambda x: x if remove_irrelevalent_ngrams_with_stop_words(x) else '', list_bigrams)))
    list_trigrams = (list(filter(lambda x: x if remove_irrelevalent_ngrams_with_stop_words(x) else '', list_trigrams)))
    tokens = treat_composite_words(tokens)
    tokens = _clean_tagged_tokens(tokens, pattern_float)

    if elements3 is not None:
        full_list = tokens + list_bigrams + list_trigrams + elements3
    else:
        full_list = tokens + list_bigrams + list_trigrams

    # Vocabulary for the fallback corrections: this cell's tokens plus the items of the
    # other column values handed in for the same row.
    tokens1 = tokens.copy()
    if col=="business_tags":
        tokens1.extend(description)
    if col=="category":
        tokens1.extend(description)
        tokens1.extend(business_tags)
    if col=="niche":
        tokens1.extend(description)
        tokens1.extend(business_tags)
        tokens1.extend(category)

    # Replace a word by its "er"->"re" / "ou"->"o" spelling variant, or by its mapped
    # singular form, when that form already occurs in the vocabulary and the two
    # embeddings are close enough.
    for idx, wd in enumerate(full_list):
        if wd.replace("er", "re") in tokens1 and wd.replace("er","re")!=wd and cosine_similarity(embeddings_index[wd.replace("er","re")].reshape(1,-1), embeddings_index[wd].reshape(1,-1))>0.85:
            full_list[idx] = wd.replace("er", "re")
        elif wd.replace("ou", "o") in tokens1 and wd.replace("ou","o")!=wd and cosine_similarity(embeddings_index[wd.replace("ou","o")].reshape(1,-1), embeddings_index[wd].reshape(1,-1))>0.85:
            full_list[idx] = wd.replace("ou", "o")
        elif wd in convertPluralToSingular and convertPluralToSingular[wd] in tokens1 and cosine_similarity(embeddings_index[convertPluralToSingular[wd]].reshape(1,-1), embeddings_index[wd].reshape(1,-1))>0.75:
            full_list[idx] =convertPluralToSingular[wd]

    counter_element = Counter(full_list)

    return counter_element


In [51]:
# string.punctuation with the hyphen and both apostrophe forms taken out.
all_punctuation_except_hyphen = "".join(
    char for char in string.punctuation if char not in ("-", "'", "’")
)
# Positional row number (0 .. n-1) of every row.
df['rows'] = range(len(df))


In [52]:
# Index label of every row, stored as a regular column.
df['rows_33'] = list(df.index)

In [53]:
# Create the column from the description column; its values are replaced row by row
# with the collected leading-sentence words in a later cell.
df['most_important_words'] = df['description']

In [54]:
# Scratch cell: pull the parsed business-tag tokens of the row at position 1,
# using the same offset rule as lemmatize_f.
row_no = 1
start_from_here = (
    0
    if row_no == 0 or (
        index_group_business_tags[row_no] != 0
        and index_group_business_tags[row_no - 1] != index_group_business_tags[row_no]
    )
    else index_end_business_tags[row_no - 1] + 1
)
el_a = list(nlp_business_tags[index_group_business_tags[1]][start_from_here:index_end_business_tags[1]])
copy_a = list(el_a)

In [55]:
import pandas as pd

# Replace every listed column with the Counter returned by lemmatize_f, one column at a time.
# The columns are processed in the order given by `colums`, and lemmatize_f receives the row's
# current 'description', 'business_tags' and 'category' values, so columns handled earlier in
# the loop are already converted when later ones are processed.
# NOTE(review): the second item of each `colums` pair (idx) is not used in this loop.
# NOTE(review): lemmatize_f appends to module-level lists, so re-running this cell keeps
# growing them -- re-run the cell that resets those lists first.
for col, idx in colums:
    df[col] = df.apply(lambda x: lemmatize_f(x[col], col, x['rows'], x['rows_33'], x['description'], x['business_tags'], x['category']), axis =1)

In [56]:
# Copy the n-th collected leading-sentence word list into the n-th row of df.
for z, idx in enumerate(df.index):
    df.at[idx, 'most_important_words'] = elements_add_important_words[z]
# Leave the counter at the number of rows, as the manual counter loop did.
z = len(df.index)

In [57]:
def check_if_word_is_okay(x, elements_counter):
    """Return the distinct items of ``elements_counter`` that are contained in ``x``.

    ``elements_counter`` is de-duplicated through ``set`` first, so the order of the
    returned list is not defined.
    """
    return [candidate for candidate in set(elements_counter) if candidate in x]
    

In [58]:
# Map the positional row number (0 .. n-1) to the index label of df.
dict_associate_index_with = dict(enumerate(df.index))
# Leave the counter at the number of rows, as the manual counter loop did.
z = len(dict_associate_index_with)

### Extracting Important Words from Titles

This part processes the titles to extract the most meaningful words, aiming to support downstream tasks like classification.

### Main Steps

- **Title Tokenization**: Titles are split based on the semicolon (`;`) separator.
- **Word Filtering**: Remove stopwords, irrelevant terms, punctuation, and very short words.
- **Word Validation**:
  - If a word is missing in embeddings, validate it using associated fields (description, niche, category, business tags).
  - Ensure words are semantically meaningful by checking their cosine similarity with label embeddings (threshold ≥ 0.35).
- **Important Word Collection**: 
  - Filtered and validated words are collected for each title.
  - Importance is emphasized by weighting (repeating the Counter multiple times).
- **Final Output**: 
  - The list `titles_important_word` stores a Counter for each title, containing its most important extracted words.

### Goal

Capture the most relevant words from titles, cleaned and validated, to enhance the quality of feature extraction.

In [59]:
from functools import reduce
# One slot per row of df; slot z receives the Counter built from the z-th title.
titles_important_word = [None] * len(df['description'])
# Lemmas of the parsed titles; consecutive titles are delimited by a ";" lemma.
nlp_title_ours_list = [title.lemma_ for title in nlp_title_ours]
s=-1  # index of the ";" that closed the previous title (-1 before the first title)
z=0  # number of titles handled so far, also the positional row number of the current title
word_no = -1  # index of the ";" that closes the current title

while len(nlp_title_ours_list)>word_no+1:
    # Move to the next ";" after position s. list.index raises ValueError when no ";"
    # is left, so the lemma list has to end with a ";".
    word_no += nlp_title_ours_list[s+1:].index(";") + 1
    

    # Lemmas of the current title, without stop words, excluded terms, one-character
    # lemmas and punctuation.
    word_companies_list = nlp_title_ours_list[s+1:word_no]
    word_companies_list = [x for x in word_companies_list if x not in stop_words.STOP_WORDS and x not in is_person_or_location_or_irrevalent_terms and len(x) > 1 and x not in string.punctuation]
    # Flatten one level in case an entry is itself a list.
    word_companies_list = [item for sublist in word_companies_list for item in (sublist if isinstance(sublist, list) else [sublist])]

 

    list_titles=[]
    wd_list = []
    for sp in word_companies_list:
        if sp not in embeddings_index.keys():
            # No embedding for the lemma itself: fall back to the words of the same row's
            # other columns that occur inside the lower-cased lemma.
            wd_list.extend(check_if_word_is_okay(sp.lower(), df['description'][dict_associate_index_with[z]]+df['niche'][dict_associate_index_with[z]]+df['category'][dict_associate_index_with[z]]+df['business_tags'][dict_associate_index_with[z]]))
        else:
            wd_list.append(sp.lower())
    word_companies_list = wd_list  
    for sp in word_companies_list:

        if sp in is_person_or_location_or_irrevalent_terms:
             continue

        # One-character words are skipped and added to the global exclusion set.
        if len(sp) == 1:
            is_person_or_location_or_irrevalent_terms.add(sp.lower())
            continue
             
        # NOTE(review): raises KeyError when the word has no embedding, which can happen
        # for words coming from the fallback branch above -- confirm this cannot occur.
        word_e = embeddings_index[sp.lower()]

        # Zero-pad the vector to 400 values (presumably the width of label_embeddings -- confirm).
        if (len(word_e)) < 400:
                word_e= np.pad(word_e, (0, (400-len(word_e))), mode = 'constant')

        word_e = np.array(word_e).reshape(1, -1)

        similarities = []
        sim = cosine_similarity(word_e, label_embeddings)
        pattern_float = r"^-?\d+\.\d+$"
        # Keep the word when its best label similarity reaches 0.35 and it is not a number.
        if max(sim[0]) >= 0.35 and (not sp.isnumeric() and not re.match(pattern_float, sp)):
            list_titles.append(sp)
    

    s=word_no
    title_counter = Counter(list_titles)
    # Adding the Counter to itself four times multiplies every title-word count by four.
    titles_important_word[z] = title_counter+title_counter+title_counter+title_counter
    z+=1


In [60]:
# Create the column first (here as a copy of the description column), presumably so that
# the .loc assignment below targets an existing column.
df['title_description'] = df['description']

# Store the title Counter of every row.
df.loc[:, 'title_description'] = titles_important_word

In [61]:
# Snapshot of df taken before the title words are merged into the description column.
df_temp = df.copy()

In [62]:
# Add the title column to the description column row by row.
# NOTE(review): not idempotent -- running this cell again adds the title values once more.
df['description'] +=df['title_description']

In [63]:
# Row-wise sum of the four text columns; this combined column is the input of the
# TF-IDF computation below.
df['new_col'] = df['description']+df['business_tags']+ df['category'] + df['niche']

In [64]:
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def get_tfidf_sparse_matrix(col_val):
    """Vectorize a sequence of term->count mappings and compute their TF-IDF weights.

    Returns a 4-tuple ``(X_csr, feature_names, tf_idf, X)``:
    the count matrix converted with ``csr_matrix``, the feature names of the
    vectorizer, the TF-IDF matrix (smoothed IDF), and the count matrix as returned
    by ``DictVectorizer``.
    """
    vectorizer = DictVectorizer(sparse=True)
    counts = vectorizer.fit_transform(col_val)
    names = vectorizer.get_feature_names_out()
    counts_csr = csr_matrix(counts)
    weights = TfidfTransformer(smooth_idf=True, use_idf=True).fit_transform(counts)
    return counts_csr, names, weights, counts


X_csr, feature_names, tf_idf, X = get_tfidf_sparse_matrix(df['new_col'])


In [65]:

def get_dictionary_for_tfidif(X_csr, feature_names, tf_idf, X):
    """Fill the module-level TF-IDF lookup dictionaries.

    Parameters
    ----------
    X_csr : count matrix; only its number of rows is used.
    feature_names : sequence mapping a column number to its term.
    tf_idf : TF-IDF matrix with the same layout as ``X``.
    X : count matrix; its non-zero entries decide which (term, row) pairs are recorded.

    Globals written
    ---------------
    match_words_with_tf_idf_valuess : ``{(term, row_number): weight}``, weight rounded
        to 5 decimals.
    match_average_words_with_tf_idf_valuess : ``{term: sum of its rounded weights}``
        (divide by the count below to obtain the average).
    match_number_words_with_tf_idf_valuess : ``{term: number of rows containing it}``.
    zzz : reset to 0.

    The rows are processed sequentially. The former version split them over eight
    threads that updated the shared dictionaries without a lock (check-then-set and
    ``+=`` are not atomic, so sums and counts could be lost) and relied on
    ``threading`` being imported by a later cell.
    """
    global match_words_with_tf_idf_valuess
    global match_average_words_with_tf_idf_valuess
    global match_number_words_with_tf_idf_valuess
    global zzz

    weight_per_row = {}
    weight_sum = {}
    row_count = {}

    for k in range(X_csr.shape[0]):
        counts_row = X[k].toarray()[0]
        weights_row = tf_idf[k].toarray()[0]
        for j in np.nonzero(counts_row)[0]:
            term = feature_names[j]
            weight = round(weights_row[j], 5)
            weight_per_row[(term, k)] = weight
            weight_sum[term] = weight_sum.get(term, 0) + weight
            row_count[term] = row_count.get(term, 0) + 1

    match_words_with_tf_idf_valuess = weight_per_row
    match_average_words_with_tf_idf_valuess = weight_sum
    match_number_words_with_tf_idf_valuess = row_count
    zzz = 0

# Fill the module-level TF-IDF lookup dictionaries from the matrices computed above.
get_dictionary_for_tfidif(X_csr, feature_names, tf_idf, X)


In [66]:
# For every company, split the parsed business tags at "," into groups and keep, per
# group, the informative lemmas together with their POS tags.
# NOTE(review): within the visible part of the notebook `set_sectors_elemenets` is only
# assigned in a later cell (In [74]); on a fresh kernel that cell has to run first.
df2['tags_for_bt'] = df2['business_tags'].copy()
for pos, idx in enumerate(df.index):
    list_elements_all = []
    determine_tag_all = []
    # The parsed-document arrays are addressed by row position (as in lemmatize_f),
    # while df2 is addressed by index label; the two differ when the index is not 0..n-1.
    if pos == 0 or (index_group_business_tags[pos]!=0 and index_group_business_tags[pos-1]!=index_group_business_tags[pos]):
        start_from_here = 0
    else:
        start_from_here = index_end_business_tags[pos-1]+1
    tok = list(nlp_business_tags[index_group_business_tags[pos]][start_from_here:index_end_business_tags[pos]])
    list_elements = []
    determine_tag = []
    for i in tok:
        if i.lemma_ == "," and list_elements!=[]:
            # A comma closes the current tag group.
            list_elements_all.append(list_elements)
            determine_tag_all.append(determine_tag)
            list_elements = []
            determine_tag = []
        elif i.lemma_ not in stop_words.STOP_WORDS and i.lemma_ in match_average_words_with_tf_idf_valuess.keys() and ((match_average_words_with_tf_idf_valuess[i.lemma_]/match_number_words_with_tf_idf_valuess[i.lemma_])>0.075 or i.lemma_ in set_sectors_elemenets):
            # Keep lemmas whose average TF-IDF weight exceeds 0.075 or that are sector terms.
            list_elements.append(i.lemma_)
            determine_tag.append(i.tag_)
    if list_elements != []:
        list_elements_all.append(list_elements)
        determine_tag_all.append(determine_tag)

    df2.at[idx, 'tags_for_bt'] = determine_tag_all
    df2.at[idx, 'new_business_tags_lemmatized'] = list_elements_all

### TF-IDF Filtering for Company Data

As we did with the labels, we also apply a **TF-IDF matrix transformation** to the company text. This helps us identify the most informative terms based on their frequency and uniqueness across the dataset.

In [67]:
set_sectors = {sector.replace(" ", "_") for sector in list_sectors_elements}

# Terms excluded from every column below (currently none).
most_used_sector_terms = set()

# df_temp: keep a term only when its TF-IDF weight in that row is at least 0.06.
df_temp['description'] = [
    Counter({
        k: v for k, v in counter.items()
        if match_words_with_tf_idf_valuess[(k, idx)] >= 0.06 and k not in most_used_sector_terms
    })
    for idx, counter in enumerate(df_temp['description'])
]
df_temp['title_description'] = [
    Counter({
        k: v for k, v in counter.items()
        if match_words_with_tf_idf_valuess[(k, idx)] >= 0.06 and k not in most_used_sector_terms
    })
    for idx, counter in enumerate(df_temp['title_description'])
]

# df: a row weight of at least 0.04 keeps the term unconditionally; otherwise the average
# weight must reach 0.08 and the term must not be excluded. The parentheses spell out how
# Python groups "a or b and c".
df['description'] = [
    Counter({
        k: v for k, v in counter.items()
        if match_words_with_tf_idf_valuess[(k, idx)] >= 0.04
        or (
            (match_average_words_with_tf_idf_valuess[k] / match_number_words_with_tf_idf_valuess[k]) >= 0.08
            and k not in most_used_sector_terms
        )
    })
    for idx, counter in enumerate(df['description'])
]
df['business_tags'] = [
    Counter({
        k: v for k, v in counter.items()
        if match_words_with_tf_idf_valuess[(k, idx)] >= 0.04
        or (
            (match_average_words_with_tf_idf_valuess[k] / match_number_words_with_tf_idf_valuess[k]) >= 0.08
            and k not in set_sectors_elemenets
        )
    })
    for idx, counter in enumerate(df['business_tags'])
]

# Category and niche: filter by the average weight only; a single-term cell is always kept.
df['category'] = [
    Counter({
        k: v for k, v in counter.items()
        if (
            (match_average_words_with_tf_idf_valuess[k] / match_number_words_with_tf_idf_valuess[k]) >= 0.04
            and k not in most_used_sector_terms
        )
        or len(counter) == 1
    })
    for counter in df['category']
]
df['niche'] = [
    Counter({
        k: v for k, v in counter.items()
        if (
            (match_average_words_with_tf_idf_valuess[k] / match_number_words_with_tf_idf_valuess[k]) >= 0.04
            and k not in most_used_sector_terms
        )
        or len(counter) == 1
    })
    for counter in df['niche']
]


In [68]:
# Per row: sort the (weight, term) pairs of the combined column and store the weight
# found at position len // 5 - 1 of that ascending list.
threshold_each_row = []
for z, idx in enumerate(df.index):
    elements = sorted(
        (match_words_with_tf_idf_valuess[(term, z)], term) for term in df['new_col'][idx]
    )
    max_elements = elements[len(elements)//5-1][0]
    threshold_each_row.append(max_elements)
# Leave the counter at the number of rows, as the manual counter loop did.
z = len(df.index)



In [69]:
# Keep only the leading-sentence words that have TF-IDF statistics, whose average weight
# is at least 0.05 and that are not excluded.
df['most_important_words'] = [
    [
        k for k in words
        if k in match_average_words_with_tf_idf_valuess
        and match_average_words_with_tf_idf_valuess[k] / match_number_words_with_tf_idf_valuess[k] >= 0.05
        and k not in most_used_sector_terms
    ]
    for words in df['most_important_words']
]

In [70]:
# De-duplicated copy of every row's important words.
df_temp['most_important_words'] = [
    set(df.at[label, 'most_important_words']) for label in df.index
]

In [71]:
# Shallow copies of the two TF-IDF statistic dictionaries.
match_average_words_with_tf_idf_valuess2 = dict(match_average_words_with_tf_idf_valuess)
match_number_words_with_tf_idf_valuess2 = dict(match_number_words_with_tf_idf_valuess)

In [72]:
def _leading_sentence(text):
    """Return ``text`` up to (not including) the period that ends its first sentence.

    A period directly followed by a comma (as in "Acme Inc., ...") is not treated as
    the end of the sentence; the text is then cut at the next period, or kept whole
    when there is none. Text without any period is returned unchanged.
    """
    if "." not in text:
        return text
    first_period = text.index('.')
    end = first_period
    if first_period + 1 < len(text) and text[first_period + 1] == ',':
        # Search from the character after the first period so that `end` is an
        # absolute position in `text`.
        end = text.find('.', first_period + 1)
        if end == -1:
            end = len(text)
    return text[:end]


# First sentence of every raw description, terminated by a single period.
most_important_words = [
    f"{_leading_sentence(description)}." for description in df2['description']
]

In [73]:
import numpy as np
import threading
import faiss
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


# Sentence-level embeddings of the labels and of three views of every company
# (description, category, niche) plus the first sentence of the description.
# faiss.normalize_L2 works in place, so each short alias below refers to the same,
# normalized array as the long name it was assigned from.
bert_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

our_classes_vector_embeddings = bert_model.encode(our_classes_vector)

texts_vector_description = []
company_texts_description = [
    f"{row['description']}."
    for idx, row in df2.iterrows()
]

company_texts_category = [
    f"This company operates in the category: {row['category']}"
    for index, row in df2.iterrows()
]

company_texts_niche= [
    f"This company operates in the niche: {row['niche']}"
    for index, row in df2.iterrows()
]

company_embeddings_1 = bert_model.encode(company_texts_description, batch_size=32, normalize_embeddings=True)
ce1 = company_embeddings_1
faiss.normalize_L2(ce1)

company_embeddings_first_sentence = bert_model.encode(most_important_words, batch_size=32, normalize_embeddings=True)
fist_sentence_emb = company_embeddings_first_sentence
faiss.normalize_L2(fist_sentence_emb)

# The label embeddings are encoded without normalization, so this call is what
# turns the dot products below into cosine similarities.
ocve = our_classes_vector_embeddings
faiss.normalize_L2(ocve)
# description x label
similarity_matrix = np.dot(ce1, ocve.T)

company_embeddings_2 = bert_model.encode(company_texts_category, batch_size=32, normalize_embeddings=True)
ce2 = company_embeddings_2
# Normalize the category embeddings (this call used to target ce1 a second time).
faiss.normalize_L2(ce2)
# description x category
similarity_matrix2 =  np.dot(ce1, ce2.T)

company_embeddings_3 = bert_model.encode(company_texts_niche, batch_size=32, normalize_embeddings=True)
ce31 = company_embeddings_3
faiss.normalize_L2(ce31)

# description x niche
similarity_matrix3 =  np.dot(ce1, ce31.T)





In [74]:
# Sector names with spaces turned into "_" and lower-cased.
set_sectors_elemenets = {
    sector.replace(" ", "_").lower() for sector in list_sectors_elements
}
# Same set without the empty string.
set_sector_elements = {sector for sector in set_sectors_elemenets if sector != ''}

In [75]:
# first sentence x label
similarity_matrix_first_sentence = fist_sentence_emb @ ocve.T


In [76]:
# category x label
similarity_matrix_label2 = ce2 @ ocve.T
# niche x label
similarity_matrix_label3 = ce31 @ ocve.T

### Hybrid Similarity: Token-Level and Contextual Embeddings

In addition to applying **embeddings to individual tokens** (which we later compare using **FAISS**), we also use **SentenceTransformer** to capture the **contextual meaning** of full text segments.

By combining both:
- **Token-level similarity** (fast, granular, useful for concept matching)
- **Sentence-level/contextual similarity** (semantic understanding of full text)

...we can improve the quality and flexibility of our label matching process.

We chose this **hybrid approach** because relying on multiple similarity strategies gives us a **more robust and accurate classification**, especially in ambiguous or fuzzy cases.

In [77]:
import faiss
# Largest number of distinct terms found in a single row of the combined column.
MAX_TOKENS = max(len(counter) for counter in df['new_col'])
global rows_

def get_sentence_embeddings_for_categories(word_list):
    """Return one 300-value vector per word of ``word_list``, zero-padded to 150 entries.

    Words without an entry in ``embeddings_index`` are represented by a zero vector.
    An empty Counter yields 150 zero vectors. Inputs with more than 150 words are
    returned unpadded and untruncated.
    """
    if word_list == Counter():
        return [np.zeros(300) for _ in range(150)]

    vectors = [embeddings_index.get(word, np.zeros(300)) for word in word_list]
    # A negative range is empty, so longer inputs are left as they are.
    vectors.extend(np.zeros(300) for _ in range(150 - len(vectors)))
    return vectors

def get_sentence_embeddings_for_categories2(word_list):
    """Return the mean embedding of every word group, zero-padded to 70 entries.

    ``word_list`` is an iterable of groups; each group is an iterable of words whose
    vectors (zero vector when missing from ``embeddings_index``) are averaged.
    Padding entries are 300-value zero vectors (previously a generator object was
    appended instead, which made the result unusable as an array).

    NOTE(review): a function with the same name is defined again in the next cell
    (In [78]) and nothing in between calls this one, so this definition appears to
    be superseded -- confirm and consider removing it.
    """
    embeddings_ = [
        np.mean([embeddings_index.get(wd, np.zeros(300)) for wd in word], axis=0)
        for word in word_list
    ]
    embeddings_.extend(np.zeros(300) for _ in range(70 - len(embeddings_)))
    return embeddings_
# Token-level embedding tensors, one padded block of vectors per company.
description_embeddings2 = np.array(
    [get_sentence_embeddings_for_categories(cell) for cell in df['description']]
)
niche_embeddings2 = np.array(
    [get_sentence_embeddings_for_categories(cell) for cell in df['niche']]
)
category_embeddings2 = np.array(
    [get_sentence_embeddings_for_categories(cell) for cell in df['category']]
)
business_tags_embeddings2 = np.array(
    [get_sentence_embeddings_for_categories(cell) for cell in df['business_tags']]
)
title_embeddings2 = np.array(
    [get_sentence_embeddings_for_categories(cell) for cell in df_temp['title_description']]
)

# Same for the more strictly filtered df_temp columns.
description_embeddings_original = np.array(
    [get_sentence_embeddings_for_categories(cell) for cell in df_temp['description']]
)
most_important_words_embeddings = np.array(
    [get_sentence_embeddings_for_categories(cell) for cell in df_temp['most_important_words']]
)


In [78]:
def get_sentence_embeddings_for_categories2(word_list):
    """Return the mean embedding of every word group, zero-padded to 100 entries.

    ``word_list`` is an iterable of groups (lists of words). For every word the
    hyphens are replaced by "_" before the lookup in ``embeddings_index``; missing
    words count as zero vectors. An empty group is represented by a zero vector
    (averaging it would give a NaN scalar and break the array shape). Inputs with
    more than 100 groups are returned unpadded and untruncated.
    """
    embeddings_ = []
    for group in word_list:
        if len(group) == 0:
            embeddings_.append(np.zeros(300))
            continue
        vectors = [embeddings_index.get(wd.replace("-", "_"), np.zeros(300)) for wd in group]
        embeddings_.append(np.mean(vectors, axis=0))

    embeddings_.extend(np.zeros(300) for _ in range(100 - len(embeddings_)))
    return embeddings_

# One mean vector per business-tag group, padded per company.
business_tags_embeddings3_lemmatized = np.array(
    [get_sentence_embeddings_for_categories2(groups) for groups in df2['new_business_tags_lemmatized']]
)

In [79]:
def get_sentence_embeddings_for_categories22(word_list):
    """Return the per-word embeddings of every word group as ``(10, 300)`` blocks.

    ``word_list`` is an iterable of groups (lists of words). Every group becomes an
    array with one row per word (hyphens replaced by "_" for the lookup, zero vector
    when missing from ``embeddings_index``), zero-padded to 10 rows. The list of
    groups is padded with all-zero blocks to 65 entries.

    ``[[]]`` is handled like any other input (one empty group), so every result
    stacks to the same ``(65, 10, 300)`` shape; it used to return 100 vectors of
    shape ``(300,)``. Groups with more than 10 words and inputs with more than 65
    groups are neither padded nor truncated.
    """
    embeddings_ = []
    for word in word_list:
        emb_curr = [embeddings_index.get(wd.replace("-", "_"), np.zeros(300)) for wd in word]
        emb_curr.extend(np.zeros(300) for _ in range(10 - len(emb_curr)))
        embeddings_.append(np.array(emb_curr))

    for _ in range(65 - len(embeddings_)):
        embeddings_.append(np.zeros((10, 300)))

    return embeddings_
# Per-word embeddings of every business-tag group, padded per company.
business_tags_embeddings_full_lemmatized = np.array(
    [get_sentence_embeddings_for_categories22(groups) for groups in df2['new_business_tags_lemmatized']]
)

In [80]:
# Copy df's current business-tag column (already TF-IDF filtered in an earlier cell)
# into df_temp.
df_temp['business_tags'] = df['business_tags'].copy()

### Finding Related Words and Handling Context

This part builds relationships between words to improve the matching and extraction of meaningful business tags.

### Main Steps

- **Load ConceptNet-based Relationships**:
  - Read relations like `IsA`, `HasContext`, and plural forms.
  - Identify very generic terms based on occurrence and WordNet specificity.
  
- **Expand Related Terms**:
  - Use `IsA` and reverse relationships to find similar or related words.
  - Calculate cosine similarity between embeddings to filter relevant related terms (threshold ≥ 0.4).

- **Contextual Matching**:
  - Functions like `generate_list_context` look for contextual relationships using the `HasContext` graph.
  - If direct relations are missing, it falls back to checking noun forms or plural-to-singular mappings.

- **Finding New Related Words**:
  - During matching, if a candidate word does not match strongly enough, nearby related words or contextually related terms are checked.
  - Adds correlated terms if sufficient similarity is found through embeddings and context relationships.

### Goal

To improve matching between business tags and descriptions by leveraging external relationship graphs, embeddings, and contextual knowledge — even when direct matches are weak.

In [81]:
import csv
import time
# Empty containers handed to the loader below, which returns the filled versions.
antonyms_words = set()
isA_relationship = {}
isA_reverse_relationship = {}
hasContext_relationship = {}
hasContext_reverse_relationship = {}
related_terms_for_antonyms = {}
convertPluralToSingular = {}
occuranceTermForParent = {}

(
    antonyms_words,
    isA_relationship,
    isA_reverse_relationship,
    occuranceTermForParent,
    convertPluralToSingular,
    hasContext_relationship,
    hasContext_reverse_relationship,
    embeddings_index,
) = open_filtered_assertions_file(
    antonyms_words,
    isA_relationship,
    isA_reverse_relationship,
    occuranceTermForParent,
    convertPluralToSingular,
    hasContext_relationship,
    hasContext_reverse_relationship,
    embeddings_index,
)

In [82]:
from collections import deque 

def bfs(dict_terms, start):
    """Breadth-first distance from ``start`` to the nearest node that is not a key of ``dict_terms``.

    Parameters
    ----------
    dict_terms : mapping from a node to an iterable of its neighbours.
    start : node at which the search begins.

    Returns
    -------
    The number of edges between ``start`` and the first node taken from the queue
    that has no entry in ``dict_terms`` (0 when ``start`` itself has none), or 0 when
    every reachable node has an entry.
    """
    # A set keeps the membership tests constant-time.
    visited = set()
    queue = deque([(start, 0)])

    while queue:
        node, depth = queue.popleft()
        if node in visited:
            continue
        visited.add(node)

        if node not in dict_terms:
            return depth
        for neighbor in dict_terms[node]:
            if neighbor not in visited:
                queue.append((neighbor, depth + 1))
    return 0

In [83]:
# Keep the unfiltered occurrence counts for later use.
original_terms_for_parent = occuranceTermForParent.copy()
# A parent term is kept as "very generic" when it occurs often enough and bfs reports
# a small distance for it in the IsA graph; entries are ordered by ascending count.
occuranceTermForParent = {
    term: count
    for term, count in sorted(occuranceTermForParent.items(), key=lambda item: item[1])
    if (count >= 70 and bfs(isA_relationship, term) <= 1)
    or (count >= 450 and bfs(isA_relationship, term) <= 2)
}
very_generic_terms = set(occuranceTermForParent)

In [84]:
def get_specificity(term):
    """Return the length of the longest hypernym path of the first synset of ``term``.

    Returns 0 when ``wn.synsets`` finds no synset for the term.
    """
    synsets = wn.synsets(term)
    if synsets:
        return max(len(path) for path in synsets[0].hypernym_paths())
    return 0

In [85]:
# First words that mark a compound as generic. A broader candidate list (adding "solid",
# "liquid", "long", "light", "dirty", "busy", "organized", "constructed") is not in use.
terms_for_look_for_beginning = {"prepared", "shaped", "processed"}
# Last words that mark a compound as generic.
terms_for_look_for_end = {"object", "thing", "fluid", "thing", "matter"}

for original_term in original_terms_for_parent.keys():
    term_parts = original_term.split("_")
    count_ok = (
        original_term not in occuranceTermForParent.keys()
        or occuranceTermForParent[original_term] > 4
    )
    # Only multi-word terms with a low specificity are candidates.
    if not (count_ok and len(term_parts) > 1 and get_specificity(original_term) < 4):
        continue

    is_generic_term = term_parts[0] in terms_for_look_for_beginning or (
        term_parts[-1] in terms_for_look_for_end
        and 1 < get_specificity("_".join(term_parts[:-1])) < 5
    )
    if is_generic_term:
        very_generic_terms.add(original_term)


In [86]:
# word -> set of related words reached through the IsA graph and close enough in
# embedding space.
related_words_to_a_word_similarity = {}

for word in isA_relationship:
    # Start from the direct IsA targets of the word (copied so the graph is not modified).
    set_element = isA_relationship[word]
    set_element_temp = set_element.copy()
    
    for word2 in set_element:
        set_is_A_relationship = {}
        set_is_A_reverse_relationship = {}

        # Add the IsA targets of every target, and -- unless the target is a very generic
        # term -- the words listed for it in the reverse IsA mapping.
        if word2 in isA_relationship.keys():
            set_is_A_relationship = isA_relationship[word2]
        if word2 in isA_reverse_relationship.keys()  and word2 not in very_generic_terms:
            set_is_A_reverse_relationship = isA_reverse_relationship[word2]
        set_element_temp.update(set_is_A_relationship)

        set_element_temp.update(set_is_A_reverse_relationship)

        # A word is never related to itself.
        if word in set_element_temp:
            set_element_temp.remove(word)

    if set_element_temp !=set():

        list_words = list(set_element_temp)
        # NOTE(review): reshape may return a view of the stored vector, in which case the
        # in-place normalize_L2 below also rescales embeddings_index[word] -- confirm intended.
        current_word_embedding = (embeddings_index[word]).reshape(1,-1)
        faiss.normalize_L2(current_word_embedding)
        # Candidates without an embedding get a zero vector (similarity 0).
        all_word_embeddings = np.array([embeddings_index[wd] if wd in embeddings_index else np.zeros(300) for wd in list_words]).astype(np.float32, order='C')
        faiss.normalize_L2(all_word_embeddings)
        our_values = np.dot(current_word_embedding, all_word_embeddings.T)[0]
        # Keep the candidates whose similarity to the word is at least 0.4.
        list_words_zip_for_antonyms = list(filter(lambda x: x[1]>=0.4, list(zip(list_words, our_values))))
        set_element_temp = set(map(lambda x: x[0], list_words_zip_for_antonyms))

    if word not in related_words_to_a_word_similarity.keys():
        related_words_to_a_word_similarity[word] = set_element_temp
    else:
        set_word_sim = related_words_to_a_word_similarity[word] 

        set_word_sim.update(set_element_temp)
        related_words_to_a_word_similarity[word] = set_word_sim

# Second pass: for every word listed for value_word in the reverse IsA mapping, drop the
# related words that contain it as a substring.
for value_word in related_words_to_a_word_similarity.keys():
    list_value_word = related_words_to_a_word_similarity[value_word]
    if value_word in isA_reverse_relationship.keys():
        for wd in isA_reverse_relationship[value_word]:
            list_value_word = set(filter(lambda x: x if wd not in x else '', related_words_to_a_word_similarity[value_word]))
            related_words_to_a_word_similarity[value_word] = list_value_word



In [87]:
def go_further(word_key_label, word):
    """Return the terms related to `word` that mention `word_key_label`.

    A related term matches when it equals the label, starts with ``label_``,
    or contains ``_label`` anywhere. Returns an empty list when `word` has no
    entry in `related_words_to_a_word_similarity`.
    """
    if word not in related_words_to_a_word_similarity:
        return []

    leading = f"{word_key_label}_"
    embedded = f"_{word_key_label}"
    return [
        term
        for term in related_words_to_a_word_similarity[word]
        if term == word_key_label or term.startswith(leading) or embedded in term
    ]

In [88]:
def words_related_to_key_words(word_key_label, wd):
    """Collect the terms related to `wd` that mention `word_key_label`.

    A related term is kept when it equals the label, starts with ``label_``,
    contains ``_label``, or when `go_further` finds a match one hop further.
    If nothing matches and `convertPluralToSingular` maps the label to a
    compound (``a_b``), the search is repeated with the first part of that
    compound. Single-character words and words without related terms yield
    an empty set.
    """
    if len(wd) == 1 or wd not in related_words_to_a_word_similarity:
        return set()

    related_terms = related_words_to_a_word_similarity[wd]

    def terms_mentioning(label):
        leading = f"{label}_"
        embedded = f"_{label}"
        return {
            term
            for term in related_terms
            if term == label
            or term.startswith(leading)
            or embedded in term
            or go_further(label, term)
        }

    matches = terms_mentioning(word_key_label)
    if not matches and word_key_label in convertPluralToSingular:
        singular_parts = convertPluralToSingular[word_key_label].split("_")
        if len(singular_parts) > 1:
            matches = terms_mentioning(singular_parts[0])
    return matches


In [89]:
def is_list_relevant(similar_items, word2, embeddings_index):
    """Keep the items whose embedding scores at least 0.35 against `word2`.

    Parameters
    ----------
    similar_items : list of str
        Candidate terms. Terms missing from `embeddings_index` are embedded
        as a zero vector and therefore never pass the threshold.
    word2 : str
        Anchor word the candidates are compared with.
    embeddings_index : mapping of str -> vector
        Word embeddings.

    Returns
    -------
    list of str
        The surviving candidates in their original order. An empty input is
        returned unchanged, and an anchor that is missing from
        `embeddings_index` yields an empty list instead of raising KeyError
        (callers pass compound parts that may be out of vocabulary).
    """
    if not similar_items:
        return similar_items

    anchor_embedding = embeddings_index.get(word2)
    if anchor_embedding is None:
        # Same outcome the zero-vector convention used for the candidates would give.
        return []

    similar_items_embeddings = np.array(
        [embeddings_index.get(wd, np.zeros(300)) for wd in similar_items],
        dtype=np.float32,
        order='C',
    )
    faiss.normalize_L2(similar_items_embeddings)
    # NOTE(review): only the candidates are normalised here; the score is a true
    # cosine only if the anchor vector is already unit length - confirm.
    values = np.dot(similar_items_embeddings, anchor_embedding.T)
    return [item for item, score in zip(similar_items, values) if score >= 0.35]

In [90]:
def generate_list_context(word1, word2, hasContext_reverse_relationship, embeddings_index, idx=None):
    """Find context entries of `word2` that mention `word1`.

    The entries stored for `word2` in `hasContext_reverse_relationship` are
    filtered to those that equal `word1`, start with ``word1_`` or contain
    ``_word1``, then passed through `is_list_relevant` against `word2`.
    When that yields nothing, the same lookup is retried with the value
    returned by `generate_noun_for_adj(word2, embeddings_index)`; relevance is
    still measured against `word2`. `idx` is accepted but not used.
    """
    leading = f"{word1}_"
    embedded = f"_{word1}"

    def mentions_word1(entry):
        return entry == word1 or entry.startswith(leading) or embedded in entry

    def relevant_entries(context_key):
        mentioning = [
            entry
            for entry in hasContext_reverse_relationship[context_key]
            if mentions_word1(entry)
        ]
        return is_list_relevant(mentioning, word2, embeddings_index)

    similar_items = []
    if word2 in hasContext_reverse_relationship.keys():
        similar_items = relevant_entries(word2)

    noun_for_adj = generate_noun_for_adj(word2, embeddings_index)
    if similar_items == [] and noun_for_adj in hasContext_reverse_relationship.keys():
        similar_items = relevant_entries(noun_for_adj)

    return similar_items

In [91]:

def _keep_tag_word(currated_list, word, partner):
    """Append `[word]` to `currated_list` and record (word, partner) both ways in `correlated_terms`."""
    currated_list.append([word])
    correlated_terms.add((word, partner))
    correlated_terms.add((partner, word))


def find_new_related_word(df2, word_description, bt3_emb_full, ind, wd_e, match_words_with_tf_idf_valuess, i, currated_list, hasContext_reverse_relationship, embeddings_index, idx):
    """Keep the words of one business tag that can be tied to a reference word.

    Each word of tag number `ind` of company `idx`
    (``df2['new_business_tags_lemmatized'][idx][ind]``) is compared with the
    reference words in `word_description`. A word is appended to
    `currated_list` (as a one-element list) when:

    * its best similarity exceeds 0.5, or
    * `generate_list_context` links it to its best-matching reference word in
      either direction, or
    * `words_related_to_key_words` + `is_list_relevant` link it to that
      reference word, or - when the reference word is a compound joined with
      ``_`` - to the part of the compound closest to the tag word.

    Context-based matches are also recorded in the global `correlated_terms`.

    Parameters
    ----------
    df2 : indexable so that ``df2['new_business_tags_lemmatized'][idx][ind]``
        yields the words of the tag.
    word_description : list of str
        Reference words, aligned with the rows of `wd_e`.
    bt3_emb_full : array
        ``bt3_emb_full[ind]`` is multiplied with ``wd_e.T``; its rows are
        presumably the embeddings of the tag's words - confirm with the caller.
    ind : int
        Tag position inside the company.
    wd_e : array
        Embeddings of `word_description`.
    match_words_with_tf_idf_valuess : dict
        ``(word, i) -> TF-IDF``; words scoring <= 0.1 (also checked with the
        last character stripped) are skipped.
    i : int
        Company position used in the TF-IDF keys.
    currated_list : list
        Accumulator; mutated and returned.
    hasContext_reverse_relationship, embeddings_index : mappings
        Forwarded to the helper look-ups.
    idx : company label in `df2`.

    Returns
    -------
    list
        `currated_list`, possibly extended.
    """
    global correlated_terms

    our_values = np.dot(bt3_emb_full[ind,:,:], wd_e.T)
    if np.max(our_values) < 0.1:
        # Nothing in the tag is even remotely close to the reference words.
        return currated_list

    def has_low_tf_idf(token):
        key = (token, i)
        return key in match_words_with_tf_idf_valuess and match_words_with_tf_idf_valuess[key] <= 0.1

    for idx2, word in enumerate(df2['new_business_tags_lemmatized'][idx][ind]):
        max_value_current = np.max(our_values[idx2])
        index_val = int(np.argmax(our_values[idx2]))

        if len(word) == 1 or has_low_tf_idf(word) or has_low_tf_idf(word[:-1]):
            continue

        if max_value_current > 0.5:
            currated_list.append([word])
            continue

        partner = word_description[index_val]

        # Context graph, tag word -> reference word, then the other way round.
        if generate_list_context(word, partner, hasContext_reverse_relationship, embeddings_index, idx) != []:
            _keep_tag_word(currated_list, word, partner)
            continue
        if generate_list_context(partner, word, hasContext_reverse_relationship, embeddings_index, idx) != []:
            _keep_tag_word(currated_list, word, partner)
            continue

        # Related-word graph against the whole reference word.
        list_set_to_consider = list(words_related_to_key_words(word, partner))
        list_set_to_consider = is_list_relevant(list_set_to_consider, partner, embeddings_index)
        if list_set_to_consider != []:
            _keep_tag_word(currated_list, word, partner)
            continue

        # Compound reference word: retry with the part closest to the tag word.
        list_splitted_words = partner.split("_")
        if len(list_splitted_words) > 1:
            list_splitted_emb = np.array(
                [embeddings_index.get(part, np.zeros(300)) for part in list_splitted_words],
                dtype=np.float32,
                order='C',
            )
            faiss.normalize_L2(list_splitted_emb)
            # Fix: this used to read an undefined/stale name `wd`; the vector
            # compared with the parts must be the tag word's own embedding.
            word1_embeddings = embeddings_index.get(word, np.zeros(300))
            closest_part = list_splitted_words[int(np.argmax(np.dot(list_splitted_emb, word1_embeddings.T)))]

            list_set_to_consider = list(words_related_to_key_words(word, closest_part))
            list_set_to_consider = is_list_relevant(list_set_to_consider, closest_part, embeddings_index)
            if list_set_to_consider != []:
                _keep_tag_word(currated_list, word, closest_part)

    return currated_list

### Refining Business Tags Based on Description and Context

This part further refines the `business_tags` for each company by comparing them to the description and niche information using embeddings and relationship graphs.

### Main Steps

- **Embedding Similarity**:
  - Compare business tag embeddings with description embeddings.
  - If similarity is too low (< 0.50), attempt to validate or replace tags using contextual relationships and related terms.

- **Handling Short Descriptions**:
  - If the description is short (three tokens or fewer), instead of comparing each tag individually, we compare the **whole list** of `business_tags` embeddings together.
  - `convertPluralToSingular` is used here to convert tags into alternative forms that may exist in the embeddings index, improving matching reliability.

- **Context and Related Words**:
  - Use `HasContext` and `IsA` graphs to search for related or supportive words when direct similarity fails.
  - Split compound words when necessary to find partial matches.

- **Fallback to Niche**:
  - If matching with the description fails, fallback to matching against the company's niche.
  - Companies requiring this fallback are tracked in `do_exception_for_niche_for_these_companies`.

- **Final Cleaning**:
  - Correct spelling variations (e.g., `er` → `re`, `ou` → `o`).
  - Filter out stopwords, punctuation, and irrelevant terms.
  - Update the `business_tags` with cleaned words, bigrams, and trigrams.

- **Correlations Tracking**:
  - Contextually matched pairs are stored in a global `correlated_terms` set for future use.

### Goal

Ensure that each company has accurate and meaningful business tags, even for short or weak descriptions, by using fallback logic, relationship graphs, and smart term corrections.

In [92]:
# Lookup table of sector terms; every term gets its own fresh [0] list.
keys_sector_elements = {elements: [0] for elements in set_sector_elements}

In [93]:
which_terms_are_associated_with_value = {}

def find_values_that_may_be_related(word, lemmatized_business_tags, bt3_emb, wd_e, idx):
    """Count the business tags of company `idx` that support keeping `word`.

    A tag supports the word when it contains it, its best similarity to the
    description (`bt3_emb` x `wd_e`) is above 0.4 - or above 0.25 when the
    word is a sector term or scores above 0.6 against the company's sector -
    and the word is tagged as an adjective (JJ/JJR/JJS) or is sector-backed.
    The other words of each supporting tag are stored in the global
    `which_terms_are_associated_with_value` under ``(word, idx)``.

    Returns the number of supporting tags.
    """
    global which_terms_are_associated_with_value

    values_emb = np.dot(bt3_emb, wd_e.T)
    sector = df.at[idx, 'sector']
    # False when the company has no sector, otherwise the word/sector similarity.
    max_value = sector != '' and np.max(np.dot(embeddings_index[word], embeddings_index[sector.replace(" ","_").lower()].T))

    adjective_tags = ("JJ", "JJR", "JJS")
    association_key = (word, idx)
    supporting_tags = 0

    for position, tag_words in enumerate(lemmatized_business_tags):
        best_score = np.max(values_emb[position])
        sector_backed = keys_sector_elements.get(word, None) is not None or max_value > 0.6

        if word not in tag_words:
            continue
        if not (best_score > 0.4 or (sector_backed and best_score > 0.25)):
            continue

        word_index = tag_words.index(word)
        pos_tags = df2.at[idx, 'tags_for_bt'][position]
        if word_index >= len(pos_tags):
            # The POS tags do not cover this word; nothing to check.
            continue
        if not (pos_tags[word_index] in adjective_tags or sector_backed):
            continue

        if association_key not in which_terms_are_associated_with_value:
            companions = tag_words.copy()
            companions.remove(word)
            which_terms_are_associated_with_value[association_key] = companions
        else:
            companions = which_terms_are_associated_with_value[association_key]
            companions.extend(tag_words.copy())
            companions.remove(word)
            which_terms_are_associated_with_value[association_key] = list(set(companions))
        supporting_tags += 1

    return supporting_tags

In [94]:
# Refine every company's business tags against its description (long
# descriptions) or against description/niche as a whole (short descriptions).
# `idx` is the DataFrame label of the company; `i` is its running position and
# indexes the per-company embedding lists.
i=0
ok3=0
correlated_terms = set()
do_exception_for_niche_for_these_companies = set()
for idx, rows in df.iterrows():
    de2 = np.array(description_embeddings_original[i], dtype= np.float32, order='C')
    faiss.normalize_L2(de2)
    bte2 = np.array(business_tags_embeddings2[i], dtype= np.float32, order='C')
    faiss.normalize_L2(bte2)

    # Row j holds the similarity of business tag j to every description entry.
    val_business_tags = np.einsum('jk,lk->jl', bte2, de2)
    list_business_tags = [bt for bt in rows['business_tags']]
    copy_df_business = df['business_tags'][idx].copy()

    our_list = [word for word in df['business_tags'][idx]]
    our_list_original = [word for word in df_temp['description'][idx]]

    maximum_value_list = []
    currated_list = []

    if len(df_temp['description'][idx])>3:
        # ---- Long description: validate each weakly matching tag on its own ----
        bt3_emb_lemma_=  np.array(business_tags_embeddings3_lemmatized[i], dtype= np.float32, order='C')
        wd_e = np.array([embeddings_index[wd] for wd in df['description'][idx]], dtype= np.float32, order='C')

        for j in range(len(df['business_tags'][idx])):

            if (max(val_business_tags[j]) < 0.50):
                index_val_single = list(val_business_tags[j]).index(np.max(val_business_tags[j]))

                if np.max(val_business_tags[j]) == 0:
                    # No similarity at all: drop the tag outright.
                    del copy_df_business[list_business_tags[j]]
                    continue

                # Context graph, tag -> closest description word ...
                word1 = list_business_tags[j]
                word2 = our_list_original[index_val_single]

                similar_items = generate_list_context(word1, word2, hasContext_reverse_relationship, embeddings_index)

                if similar_items != list():
                    correlated_terms.add((word1, word2))
                    correlated_terms.add((word2, word1))
                    continue

                # ... and closest description word -> tag.
                word2 = list_business_tags[j]
                word1 = our_list_original[index_val_single]

                similar_items = generate_list_context(word1, word2, hasContext_reverse_relationship, embeddings_index)

                if similar_items != list():
                    correlated_terms.add((word1, word2))
                    correlated_terms.add((word2, word1))
                    continue

                # Related-word graph against the whole description word.
                list_set_to_consider = list(words_related_to_key_words(list_business_tags[j], our_list_original[index_val_single]))
                list_set_to_consider = is_list_relevant(list_set_to_consider, our_list_original[index_val_single], embeddings_index)

                if (list_set_to_consider == list() and len(our_list_original[index_val_single].split("_"))>1):
                    # Compound description word: retry with the part closest to the tag.
                    list_splitted_words1 = our_list_original[index_val_single].split("_")
                    list_splitted_emb1 = np.array([embeddings_index.get(wd, np.zeros(300)) for wd in list_splitted_words1],  dtype= np.float32, order='C')
                    faiss.normalize_L2(list_splitted_emb1)
                    # Fix: this used to read `wd`, a stale loop variable left over from
                    # another cell/iteration; the parts must be compared with the tag itself.
                    word1_embeddings = embeddings_index.get(list_business_tags[j], np.zeros(300))

                    our_valuezzz = np.dot(list_splitted_emb1, word1_embeddings.T)

                    max_value_current = np.max(our_valuezzz)
                    list_converted = list(our_valuezzz)
                    index_val_current = list_converted.index(max_value_current)

                    list_set_to_consider = list(words_related_to_key_words(list_business_tags[j], our_list_original[index_val_single].split("_")[index_val_current]))
                    list_set_to_consider = is_list_relevant(list_set_to_consider, our_list_original[index_val_single].split("_")[index_val_current], embeddings_index)

                    if(list_set_to_consider == list()):
                        # Last resort: is the tag supported by the other tags it appears with?
                        val = find_values_that_may_be_related(list_business_tags[j], df2['new_business_tags_lemmatized'][idx], bt3_emb_lemma_, wd_e, idx)

                        if val == 0:
                            del copy_df_business[list_business_tags[j]]
                        else:
                            copy_df_business[list_business_tags[j]] = val
                    else:
                        correlated_terms.add((list_business_tags[j], our_list_original[index_val_single].split("_")[index_val_current]))
                        correlated_terms.add((our_list_original[index_val_single].split("_")[index_val_current], list_business_tags[j]))
                elif list_set_to_consider == list():
                    val = find_values_that_may_be_related(list_business_tags[j], df2['new_business_tags_lemmatized'][idx], bt3_emb_lemma_, wd_e, idx)
                    if val == 0:
                        del copy_df_business[list_business_tags[j]]
                    else:
                        copy_df_business[list_business_tags[j]] = val
                else:
                    correlated_terms.add((list_business_tags[j], our_list_original[index_val_single]))
                    correlated_terms.add((our_list_original[index_val_single], list_business_tags[j]))

            # With at most one tag left, a weak tag is not worth keeping.
            # (Deleting a key that is already gone is a no-op on a Counter.)
            if len(copy_df_business) <= 1:
                if (max(val_business_tags[j]) < 0.50):
                    del copy_df_business[list_business_tags[j]]

        df.at[idx, 'business_tags'] = copy_df_business

    elif len(df_temp['description'][idx]) <=3 and df_temp['business_tags'][idx]!=Counter():
        # ---- Short description: judge the lemmatized tags as whole units ----
        wd_e = np.array([embeddings_index[wd] for wd in df['description'][idx]], dtype= np.float32, order='C')
        bt_emb =  np.array([embeddings_index[wd] for wd in df['business_tags'][idx]], dtype= np.float32, order='C')
        bt3_emb =  np.array(business_tags_embeddings3_lemmatized[i], dtype= np.float32, order='C')
        bt3_emb_full =  np.array(business_tags_embeddings_full_lemmatized[i], dtype= np.float32, order='C')

        faiss.normalize_L2(bt_emb)
        faiss.normalize_L2(bt3_emb)

        currated_list = []
        word_list_new = [wd for wd in df2['new_business_tags_lemmatized'][idx]]

        if df['description'][idx] != Counter():
            faiss.normalize_L2(wd_e)
            word_description = [wd for wd in df['description'][idx]]
            list_a = []
            for ind in range(len(df2['new_business_tags_lemmatized'][idx])):
                a= max(np.dot(bt3_emb, wd_e.T)[ind])

                maximum_value_list.append(a)

                if (a> 0.35 and len(word_list_new[ind])!=1) or a> 0.5:
                    currated_list.append(word_list_new[ind])
                else:
                    currated_list3 = currated_list.copy()
                    currated_list = find_new_related_word( df2, word_description, bt3_emb_full, ind, wd_e, match_words_with_tf_idf_valuess, i, currated_list3, hasContext_reverse_relationship, embeddings_index, idx)

                list_a.append(a)

        if currated_list == list():
            # Nothing survived the description check: fall back to the niche.
            ne2 = np.array(niche_embeddings2[i], dtype= np.float32, order='C')
            faiss.normalize_L2(ne2)
            val_business_tags = np.einsum('jk,lk->jl', bte2, ne2)
            word_business_tags = [wd for wd in df['business_tags'][idx]]
            word_niche = [wd for wd in df_temp['niche'][idx]]

            # NOTE(review): unlike wd_e above, n_e is not L2-normalised before the
            # dot products below - confirm whether the index stores unit vectors.
            n_e = np.array([embeddings_index[wd] for wd in df_temp['niche'][idx]], dtype= np.float32, order='C')

            # Pass 1: tags against the niche embeddings in niche_embeddings2.
            for ind in range(len(df2['new_business_tags_lemmatized'][idx])):
                a= max(np.dot(bt3_emb, ne2.T)[ind])

                maximum_value_list.append(a)

                if (a> 0.35 and len(word_list_new[ind])!=1) or a> 0.5:
                    currated_list.append(word_list_new[ind])

            # Pass 2: tags against the individual niche words, with the contextual
            # fallback. Fix: this loop used to be nested inside pass 1 while reusing
            # the same loop variable `ind`, so it ran once per tag and appended every
            # match repeatedly. The set of kept words is unchanged; only the
            # duplicate entries (and the quadratic work) are gone.
            for ind in range(len(df2['new_business_tags_lemmatized'][idx])):
                a= max(np.dot(bt3_emb, n_e.T)[ind])

                maximum_value_list.append(a)

                if (a> 0.35 and len(word_list_new[ind])!=1) or a> 0.5:
                    currated_list.append(word_list_new[ind])
                else:
                    currated_list3 = currated_list.copy()
                    currated_list = find_new_related_word( df2, word_niche, bt3_emb_full, ind, n_e, match_words_with_tf_idf_valuess, i, currated_list3, hasContext_reverse_relationship, embeddings_index, idx)

            if(currated_list==list()):
                do_exception_for_niche_for_these_companies.add(idx)

        word_list = [wd for wd in df['business_tags'][idx]]
        # Flatten the list of word lists and restore acronym casing.
        currated_list = sum(currated_list, [])
        currated_list = list(map(lambda x: replace_upper_word([x], acronym_dict)[0], currated_list))
        for idx1, wd in enumerate(currated_list):
            # Prefer the spelling/number variant that actually occurs in the description.
            if wd not in embeddings_index.keys() and wd in convertPluralToSingular and convertPluralToSingular[wd] in embeddings_index.keys():
                currated_list[idx1] =convertPluralToSingular[wd]
            elif wd.replace("er", "re") in df.at[idx, 'description'] and wd.replace("er","re")!=wd and cosine_similarity(embeddings_index[wd.replace("er","re")].reshape(1,-1), embeddings_index[wd].reshape(1,-1))>0.85:
                currated_list[idx1] = wd.replace("er", "re")
            elif wd.replace("ou", "o") in  df.at[idx, 'description'] and wd.replace("ou","o")!=wd and cosine_similarity(embeddings_index[wd.replace("ou","o")].reshape(1,-1), embeddings_index[wd].reshape(1,-1))>0.85:
                currated_list[idx1] = wd.replace("ou", "o")
            elif wd in convertPluralToSingular and convertPluralToSingular[wd] in  df.at[idx, 'description'] and cosine_similarity(embeddings_index[convertPluralToSingular[wd]].reshape(1,-1), embeddings_index[wd].reshape(1,-1))>0.75:
                currated_list[idx1] =convertPluralToSingular[wd]

        # Drop stop words, out-of-vocabulary words, punctuation, person/location/
        # irrelevant terms and anything starting with a number.
        # Fix: the old filter also consulted `col`, a variable leaked from another
        # cell; that clause was redundant because the irrelevant-term exclusion
        # was applied unconditionally right afterwards.
        tokens = [
            x for x in currated_list
            if x
            and x.lower() not in stop_words.STOP_WORDS
            and x.lower() in embeddings_index.keys()
            and x.lower() not in string.punctuation
            and x.lower() not in is_person_or_location_or_irrevalent_terms2
            and not x[0].isnumeric()
            and not re.match(pattern_float, x[0])
        ]

        if tokens != [] or len(df['description'][idx])>3:
            df.at[idx, 'business_tags'] = Counter(tokens)
            df.at[idx, 'business_tags'].update(new_ngrams(tokens, 2))
            df.at[idx, 'business_tags'].update(new_ngrams(tokens, 3))

    i+=1



In [95]:
# Snapshot of the business tags before the TF-IDF based filtering below; a later
# cell walks this snapshot to restore tags that the filters removed.
copy_business_tags = df['business_tags'].copy()

### Standardizing and Filtering Business Tags with TF-IDF Scores

This section further refines `business_tags` by correcting word forms and validating tags based on TF-IDF scores.

### Main Steps

- **Word Form Correction**:
  - Automatically adjusts common word variations like:
    - Removing suffixes (`-ing`, `-ies`, `-es`, `-s`, `-ed`, `-ly`) when appropriate.
    - Replaces the original word with its corrected form if it exists in `embeddings_index` and has a valid TF-IDF score.
  - Falls back to **lemmatization** if no direct match is found.

- **TF-IDF Filtering**:
  - Each business tag is validated against `match_words_with_tf_idf_valuess`.
  - Tags without a valid TF-IDF score are removed from the business tags list.

- **Tracking TF-IDF Extremes**:
  - For each company, the minimum and maximum TF-IDF values among their business tags are stored in `min_val_list` and `max_val_list`.

- **Fallback for Plural Forms**:
  - If a tag's TF-IDF value is missing, attempts are made to look it up through the `convertPluralToSingular` mapping.

### Goal

Ensure business tags are clean, normalized, and strongly relevant according to TF-IDF importance, improving the quality of downstream text analysis.

In [96]:
min_val_list = [2] * len(df['business_tags'])
max_val_list = [-2] * len(df['business_tags'])

# Suffix rules tried in order: (suffix, characters to strip, text to append).
# "ing"/4 covers a doubled consonant before -ing, "ed"/1 keeps the trailing
# "e", "ly"/3 also drops the character before -ly.
_SUFFIX_RULES = (
    ("ing", 3, ""),
    ("ing", 4, ""),
    ("ies", 3, "y"),
    ("es", 2, ""),
    ("s", 1, ""),
    ("ed", 1, ""),
    ("ly", 2, ""),
    ("ly", 3, ""),
)


def _scored_base_form(tag, company_pos):
    """Return a base form of `tag` that has both an embedding and a TF-IDF score, else None.

    The suffix rules are tried first (first hit wins); the lemma produced by
    `nlp` is the fallback.
    """
    for suffix, n_strip, appended in _SUFFIX_RULES:
        if len(tag) > max(len(suffix), n_strip) and tag.endswith(suffix):
            candidate = tag[:-n_strip] + appended
            if candidate in embeddings_index and (candidate, company_pos) in match_words_with_tf_idf_valuess:
                return candidate

    lemma = nlp(tag)[0].lemma_
    if (lemma, company_pos) in match_words_with_tf_idf_valuess and lemma in embeddings_index:
        return lemma
    return None


z=0
for idx, row in df.iterrows():
    # `z` is the company's position; it is the second element of the TF-IDF keys.
    min_val = 2
    max_val = -2
    copy_df_business = df.at[idx, 'business_tags'].copy()
    row_new = [wd for wd in df.at[idx, 'business_tags'].most_common()]

    # Pass 1: replace every tag without a TF-IDF score by a scored base form,
    # or drop it when no such form exists.
    # Fixes: the "-es" rule compared a 3-character slice with "es" and could
    # never fire; the doubled-consonant "-ing" rule validated tag[:-4] but
    # inserted tag[:-3], which has no score and was deleted again by pass 2.
    for i in row_new:
        if (i[0], z) in match_words_with_tf_idf_valuess:
            continue
        base_form = _scored_base_form(i[0], z)
        if base_form is not None:
            copy_df_business.update(Counter([base_form]))
        del copy_df_business[i[0]]

    df.at[idx, 'business_tags'] = copy_df_business
    row_new = [wd for wd in df.at[idx, 'business_tags'].most_common()]

    # Pass 2: drop whatever still has no score and track the TF-IDF extremes.
    for i in row_new:
        if (i[0], z) not in match_words_with_tf_idf_valuess:
            del copy_df_business[i[0]]
            continue
        value = match_words_with_tf_idf_valuess[(i[0], z)]

        if min_val > value:
            min_val = value
        if max_val < value:
            max_val = value

    df.at[idx, 'business_tags'] = copy_df_business
    row_new = [wd for wd in df.at[idx, 'business_tags'].most_common()]

    # Pass 3: re-check the extremes over the surviving tags. Every survivor has
    # a score after pass 2, so the plural -> singular lookup is only a safety net.
    for i in row_new:
        if (i[0], z) in match_words_with_tf_idf_valuess:
            value = match_words_with_tf_idf_valuess[(i[0], z)]
        else:
            value = match_words_with_tf_idf_valuess[(convertPluralToSingular[i[0]], z)]

        if min_val > value:
            min_val = value
        if max_val < value:
            max_val = value

    min_val_list[z] = min_val
    max_val_list[z] = max_val
    z+=1


In [97]:
# Keep only the tags whose TF-IDF score for the company (keyed by the company's
# position) is at least 0.06 and that are not among the most used sector terms.
df['business_tags'] = [
    Counter({
        tag: count
        for tag, count in company_tags.items()
        if match_words_with_tf_idf_valuess[(tag, position)] >= 0.06
        and tag not in most_used_sector_terms
    })
    for position, company_tags in enumerate(df['business_tags'])
]

### Final Business Tags Filtering Using TF-IDF and Embedding Similarity

This final step filters weak business tags by analyzing their TF-IDF scores and embedding similarities compared to stronger words.

### Main Steps

- **TF-IDF Thresholding**:
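  - Compute two dynamic cut-offs per company from its minimum and maximum TF-IDF scores: a low cut-off 25% of the way up from the minimum, and a high cut-off at the midpoint (50%).
  - Business tags scoring below the low cut-off become **low TF-IDF words**; business tags and description words scoring above the high cut-off become **high TF-IDF words**.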

- **Similarity Checking**:
  - For each low TF-IDF tag:
    - Compare its embedding against the high TF-IDF tags.
    - If similarity is low (< 0.15), further checks are performed:
      - Check if parts of compound words (split by "_") match better.
      - Check semantic relations using `IsA` relationships.
      - Compare to "most important words" embeddings if necessary.

- **Tag Deletion**:
  - If no strong relationship is found after all checks, the low-quality tag is **deleted** from the business tags list.

- **Special Handling**:
  - Companies flagged in `do_exception_for_niche_for_these_companies` are skipped during this phase.

### Goal

Aggressively clean business tags by removing weak or loosely related terms, ensuring that final tags are highly relevant and semantically connected to strong description keywords.

In [98]:
# Remove low TF-IDF business tags that cannot be tied to any high TF-IDF word.
# `index_cur` is the company's position; it indexes min_val_list / max_val_list,
# most_important_words_embeddings and the TF-IDF keys.
index_cur = 0
for idx, row in df.iterrows():

    # Low cut-off: 25% of the way from the minimum to the maximum TF-IDF score.
    min25th = (min_val_list[index_cur] + 0.25 * (max_val_list[index_cur]-min_val_list[index_cur]))

    # High cut-off: despite the name, this is the midpoint (max - 50% of the range).
    max25th = (max_val_list[index_cur] - 0.50 * (max_val_list[index_cur]-min_val_list[index_cur]))
    high_tf_idf_word = []
    low_tf_idf_word = []

    # High TF-IDF words are drawn from the business tags and the description combined.
    for word in (df['business_tags'][idx]+df['description'][idx]):
        if match_words_with_tf_idf_valuess[(word, index_cur)] > max25th:
            high_tf_idf_word.append(word)
        
    copy_df_business = df.at[idx, 'business_tags'].copy()

    # Companies that needed the niche fallback are left untouched.
    if idx in do_exception_for_niche_for_these_companies:
        index_cur+=1
        continue

    for word in (df['business_tags'][idx]):    
        if match_words_with_tf_idf_valuess[(word, index_cur)] < min25th:
            low_tf_idf_word.append(word)
 
    if low_tf_idf_word != [] and high_tf_idf_word != []:
        low_emb = np.array([embeddings_index[wd] for wd in low_tf_idf_word], dtype= np.float32, order='C')
        faiss.normalize_L2(low_emb)

        high_emb =  np.array([embeddings_index[wd] for wd in high_tf_idf_word], dtype= np.float32, order='C')
        faiss.normalize_L2(high_emb)

        # Row i: similarity of low word i to every high word.
        values_score = np.dot(low_emb, high_emb.T)

        for i in range(len(low_tf_idf_word)):
            value_maximum = np.max(values_score[i])
            # Position of the closest high TF-IDF word.
            index_1 = list(values_score[i]).index(value_maximum)
            
            # Only low words that are far from every high word are candidates for removal.
            if value_maximum < 0.15:

                value_element_max = None
                list_elements = []

                # Check 1: for a compound low word, any part scoring above 0.25
                # against the closest high word keeps the tag.
                if (len(low_tf_idf_word[i].split("_"))>1):
                    list_splitted_words1 = low_tf_idf_word[i].split("_")
                    list_splitted_emb1 = np.array([embeddings_index.get(wd, np.zeros(300)) for wd in list_splitted_words1],  dtype= np.float32, order='C')
                    faiss.normalize_L2(list_splitted_emb1)
                    word1_embeddings = embeddings_index.get(high_tf_idf_word[index_1], np.zeros(300))
                    
                    our_valuezzz = np.dot(list_splitted_emb1, word1_embeddings.T)

                    max_value_current = np.max(our_valuezzz)

                    if max_value_current > 0.25:
                        continue

                # Check 2: compare the isA_relationship entries of both words
                # (or the words themselves when they have no entry).
                if high_tf_idf_word[index_1] in isA_relationship.keys():
                    list_elements_high = [wd2 for wd2 in isA_relationship[high_tf_idf_word[index_1]]]
                    value_element_max = np.array([embeddings_index[wd] for wd in list_elements_high], dtype= np.float32, order='C')
                    
                    faiss.normalize_L2(value_element_max)
                else:
                    # NOTE(review): this vector is not L2-normalised, unlike the branch
                    # above - confirm the index already stores unit vectors.
                    value_element_max = embeddings_index[high_tf_idf_word[index_1]].reshape(1,-1)

                current_element = None
                if low_tf_idf_word[i]  in isA_relationship.keys():
                    list_elements = list(isA_relationship[low_tf_idf_word[i]])

                    current_element = np.array([embeddings_index[wd1] for wd1 in list_elements], dtype= np.float32, order='C')
                    faiss.normalize_L2(current_element)
                   
                else:
                    # NOTE(review): not normalised either - same question as above.
                    current_element = embeddings_index[low_tf_idf_word[i]].reshape(1,-1)
                    list_elements = [low_tf_idf_word[i]]
                
                # Best score over every (low-side entry, high-side entry) pair.
                values_score2 = np.dot(current_element, value_element_max.T)
                max_v = -1
                for j, _ in enumerate(list_elements):
                    val_ = np.max(values_score2[j])

                    if max_v < val_:
                        max_v =val_
                if max_v < 0.3:
                    # Check 3: a score above 0.35 against the company's most
                    # important words still keeps the tag; otherwise it is removed.
                    miw_e =  np.array(most_important_words_embeddings[index_cur], dtype= np.float32, order='C')
                    faiss.normalize_L2(miw_e)
                    our_value = np.max(np.dot(low_emb, miw_e.T)[i])

                    if our_value > 0.35:
                        continue
                    del copy_df_business[low_tf_idf_word[i]]

    
    df.at[idx, 'business_tags'] = copy_df_business
    index_cur+=1







In [99]:
# Walk the saved tag lists in step with the dataframe rows and put back tags
# that are no longer present in `business_tags` when they still look relevant.
# `idx_z` is used as the row label, so it is advanced manually.
idx_z = 0
for lists_x in copy_business_tags:
    # One embedding row per description token of the current company.
    wd_e = np.array(
        [embeddings_index[wd] for wd in df['description'][idx_z]],
        dtype=np.float32,
        order='C',
    )

    for x in lists_x:
        # Tags that are still present need no restoration.
        if x in df.at[idx_z, 'business_tags']:
            continue

        words_list = which_terms_are_associated_with_value.get((x, idx_z), [])
        if words_list == []:
            continue

        for word1 in words_list:
            if (
                x in set_sector_elements
                and np.dot(
                    embeddings_index[x],
                    embeddings_index[df.at[idx_z, 'sector'].replace(" ", "_").lower()].T,
                ) < 0.4
                and np.max(np.dot(embeddings_index[x], wd_e.T)) > 0.4
            ):
                # NOTE(review): this branch does not depend on `word1` and has no
                # `break`, so the tag count is incremented once per associated
                # word. Confirm whether a single increment was intended.
                df.at[idx_z, 'business_tags'].update({x: 1})
            elif word1 in df.at[idx_z, 'business_tags']:
                # One of the words tied to this tag survived: restore the tag once.
                df.at[idx_z, 'business_tags'].update({x: 1})
                break

    idx_z += 1

In [100]:
# Re-embed every company's (now edited) business tags, one entry per row of df.
business_tags_embeddings_edited = np.array(
    [get_sentence_embeddings_for_categories(tags) for tags in df['business_tags']]
)

In [101]:
# Domain words treated as generic: the refinement cells lower their similarity
# threshold for these terms and the label builder tags them with "__1".
generic_domain_terms = {
    "building",
    "commercial",
    "corporate",
    "family",
    "general",
    "industrial",
    "ornamental",
    "precast",
    "residential",
    "tilt",
}

### Refining Niche and Category Tags Based on Embedding Similarity

This step further cleans the `niche` and `category` fields by comparing their relevance to the description, business tags, and each other using embeddings and adaptive thresholds.

### Main Steps

- **Embedding Similarity Comparisons**:
  - Compare `niche` and `category` embeddings with description and business tags embeddings.
  - Compute similarity scores using FAISS and cosine similarity.

- **Adaptive Thresholding**:
  - Dynamic thresholds (around 0.3–0.375) are set based on overall similarity scores for each company.
  - Thresholds are lowered by 0.05 for generic domain terms, which makes the filter more lenient for them.

- **Niche Tag Filtering**:
  - Remove niche tags with low maximum similarity to description or business tags.
  - If some niche terms are borderline, they are reconsidered based on the average similarity across the set.

- **Category Tag Filtering**:
  - Remove category tags with weak similarity to description, business tags, or niche terms.
  - Special fallback handling for companies needing niche-based matching.
  
- **Special Handling**:
  - Companies listed in `do_exception_for_niche_for_these_companies` are treated differently by relying more on niche-based comparisons.
  - Words deleted can be reconsidered if the average similarity across terms is still strong.

### Goal

To ensure that both `niche` and `category` tags are contextually meaningful and strongly aligned with the company’s description and business tags, improving label precision.

In [102]:

def _max_informative_score(scores, words):
    """Return the best score among positions whose word is informative, else 0.0.

    ``scores[k]`` is taken to belong to ``words[k]``; positions at or beyond
    ``len(words)`` are ignored. A word is kept when
    ``match_average_words_with_tf_idf_valuess[word] /
    match_number_words_with_tf_idf_valuess[word]`` exceeds 0.065
    (presumably its mean TF-IDF over the corpus).
    """
    best = 0.0
    for position, score in enumerate(scores):
        if position >= len(words):
            break
        word = words[position]
        mean_tf_idf = (
            match_average_words_with_tf_idf_valuess[word]
            / match_number_words_with_tf_idf_valuess[word]
        )
        if mean_tf_idf > 0.065 and score > best:
            best = score
    return best


# `i` is the positional row counter used for the pre-computed embedding lists
# and similarity matrices; `idx` is the dataframe label of the same row.
i = 0
for idx, rows in df.iterrows():
    # L2-normalise each embedding matrix so the products below are cosine similarities.
    de3 = np.array(description_embeddings_original[i], dtype=np.float32, order='C')
    faiss.normalize_L2(de3)
    bte3 = np.array(business_tags_embeddings_edited[i], dtype=np.float32, order='C')
    faiss.normalize_L2(bte3)
    ce3 = np.array(category_embeddings2[i], dtype=np.float32, order='C')
    faiss.normalize_L2(ce3)
    ne2 = np.array(niche_embeddings2[i], dtype=np.float32, order='C')
    faiss.normalize_L2(ne2)

    # Pairwise similarity tables: rows = category/niche terms, columns = description/tag tokens.
    val_category = np.einsum('jk,lk->jl', ce3, de3)
    val_category_bt = np.einsum('jk,lk->jl', ce3, bte3)
    val_niche_bt = np.einsum('jk,lk->jl', ne2, bte3)
    val_niche = np.einsum('jk,lk->jl', ne2, de3)

    list_category = list(rows['category'])
    list_niche = list(rows['niche'])
    list_description = list(df_temp['description'][idx])
    list_business = list(rows['business_tags'])

    avg_number_list = []
    words_to_consider_again = []
    # FIX: reset for every company. It used to be initialised only inside the
    # niche branch below, so rows that skipped that branch either raised a
    # NameError (first row) or re-used the previous company's sector words.
    sector_word_to_consider = []

    # ------------------------------------------------------------------
    # Niche terms
    # ------------------------------------------------------------------
    if idx not in do_exception_for_niche_for_these_companies and (df.at[idx, 'business_tags'] + df.at[idx, 'description']) != Counter():
        mean_val = None
        # NOTE(review): the niche pass is gated on `similarity_matrix3` and the
        # category pass on `similarity_matrix2`; the export cell saves them as
        # "desc_to_category" and "desc_to_niche" respectively - confirm the two
        # matrices are not swapped.
        if similarity_matrix3[i][i] > 0.2:
            threshold = 0.3
        else:
            threshold = 0.375

        copy_df_niche = df['niche'][idx].copy()
        for j in range(len(df['niche'][idx])):
            # Scores of niche term j against the description tokens only.
            val_niche_2 = val_niche[j][0:len(df['description'][idx])]

            if threshold != 0.3:
                mean_val = np.mean(val_niche_2) if len(val_niche_2) != 0 else 0

            # NOTE(review): zeros are dropped before positions are matched with
            # `list_business`, so a zero in the middle shifts later positions.
            val_niche_2bt = [score for score in val_niche_bt[j] if score != 0]

            # Best similarity against an informative description or tag token.
            mav_val_niche = max(
                _max_informative_score(val_niche_2, list_description),
                _max_informative_score(val_niche_2bt, list_business),
            )

            if df.at[idx, 'sector'] != '':
                cos_similarity_sector = np.dot(
                    embeddings_index[list_niche[j]],
                    embeddings_index[df.at[idx, 'sector'].lower().replace(" ", "_")].T,
                )
            else:
                cos_similarity_sector = 0

            avg_number_list.append(mav_val_niche)

            # Generic terms get a temporarily lowered threshold (restored below).
            if list_niche[j] in generic_domain_terms and threshold == 0.3:
                threshold -= 0.05

            if cos_similarity_sector > 0.85:
                sector_word_to_consider.append(list_niche[j])

            if (mav_val_niche < threshold and threshold <= 0.3) or (threshold > 0.3 and (mean_val is not None and mav_val_niche < threshold or mean_val < 0.15)):
                if (mav_val_niche < 0.7 or (mav_val_niche < 0.85 and mean_val is not None and mean_val < 0.13)):
                    del copy_df_niche[list_niche[j]]
            elif (mav_val_niche < threshold + 0.05 and threshold <= 0.3):
                # Borderline: drop now, possibly re-add after the loop.
                del copy_df_niche[list_niche[j]]
                words_to_consider_again.append(list_niche[j])

            if list_niche[j] in generic_domain_terms and threshold == 0.25:
                threshold += 0.05

        # Only replace the stored Counter when at least one term was examined,
        # as the per-iteration assignment did before.
        if len(list_niche) != 0:
            df.at[idx, 'niche'] = copy_df_niche

    if len(avg_number_list) != 0:
        mean_avg_number_list = np.mean(avg_number_list)
    else:
        mean_avg_number_list = 0
    # Re-admit borderline niche terms when the company's niche terms match well on average.
    if words_to_consider_again != list() and avg_number_list != list() and mean_avg_number_list >= 0.55:
        df.at[idx, 'niche'].update(Counter(words_to_consider_again))
    # Niche terms that are near-synonyms of the sector are kept whenever any niche term survives.
    if df.at[idx, 'niche'] != Counter():
        sector_word_to_consider = list(filter(lambda x: x not in df.at[idx, 'niche'], sector_word_to_consider))
        df.at[idx, 'niche'].update(Counter(sector_word_to_consider))

    # ------------------------------------------------------------------
    # Category terms
    # ------------------------------------------------------------------
    avg_number_list = []
    words_to_consider_again = []
    if similarity_matrix2[i][i] < 0.5:
        mean_val = None
        max_niche_value = None
        copy_df_category = df['category'][idx].copy()

        if idx in do_exception_for_niche_for_these_companies:
            # Exception companies are compared against their niche terms instead of tags.
            ne3 = np.array([embeddings_index[wd] for wd in df['niche'][idx]], dtype=np.float32, order='C')
            val_category_nc = np.einsum('jk,lk->jl', ce3, ne3)

        if similarity_matrix2[i][i] > 0.2:
            threshold = 0.3
        else:
            threshold = 0.375
            if df['niche'][idx] != Counter():
                ne3 = np.array([embeddings_index[wd] for wd in df['niche'][idx]], dtype=np.float32, order='C')
            else:
                # No niche terms left: a single zero vector keeps the shapes valid.
                ne3 = np.array([np.zeros(300)], dtype=np.float32, order='C')
            val_category_nc = np.einsum('jk,lk->jl', ce3, ne3)
            list_niche = list(df.at[idx, 'niche'])

        for j in range(len(df['category'][idx])):
            y = val_category[j]
            y_list = list_description
            if idx not in do_exception_for_niche_for_these_companies:
                x = val_category_bt[j]
                x_list = list_business
                if len(df.at[idx, 'description']) == 0:
                    y = [0]
            else:
                x = val_category_nc[j]
                x_list = list_niche
                if len(df['description'][idx]) == 0:
                    # NOTE(review): this blanks the niche scores (x), whereas the
                    # branch above blanks the description scores (y). Looks like
                    # a copy/paste slip - confirm before changing.
                    x = [0]

            x = [score for score in x if score != 0]
            y = [score for score in y if score != 0]

            if threshold != 0.3:
                mean_val = np.mean(y) if len(y) != 0 else 0
                max_niche_value = np.max(val_category_nc[j])

            mav_val_category = max(
                _max_informative_score(x, x_list),
                _max_informative_score(y, y_list),
            )

            if list_category[j] in generic_domain_terms and threshold == 0.3:
                threshold -= 0.05

            # FIX: the first clause compared against 0.03, which no threshold
            # (0.25 / 0.3 / 0.375) can satisfy; the parallel niche rule uses 0.3.
            if (mav_val_category < threshold and threshold <= 0.3) or (threshold > 0.3 and mean_val is not None and (mav_val_category < threshold or mean_val < 0.15)):
                if (mav_val_category < 0.7 or (mav_val_category < 0.85 and mean_val is not None and mean_val < 0.13)) and (max_niche_value is None or max_niche_value < 0.55):
                    del copy_df_category[list_category[j]]
            elif (mav_val_category < threshold + 0.05 and threshold <= 0.3):
                del copy_df_category[list_category[j]]
                words_to_consider_again.append(list_category[j])

            if list_category[j] in generic_domain_terms and threshold == 0.25:
                threshold += 0.05

        df.at[idx, 'category'] = copy_df_category

        # NOTE(review): nothing appends to `avg_number_list` in the category
        # pass, so this reconsideration step can never fire. The niche pass
        # appends its best score per term - confirm whether the same was intended.
        if len(avg_number_list) != 0:
            mean_avg_number_list = np.mean(avg_number_list)
        else:
            mean_avg_number_list = 0

        if words_to_consider_again != list() and avg_number_list != list() and mean_avg_number_list >= 0.55:
            df.at[idx, 'category'].update(Counter(words_to_consider_again))

    i += 1

    

In [103]:
# Copy the refined columns between the two working frames (target <- source),
# in the same order as before, then make every `most_important_words` entry a Counter.
for _target_frame, _source_frame, _column in (
    (df_temp, df, 'business_tags'),
    (df, df_temp, 'description'),
    (df_temp, df, 'category'),
    (df_temp, df, 'niche'),
):
    _target_frame[_column] = _source_frame[_column]

df['most_important_words'] = [Counter(words) for words in df['most_important_words']]


In [104]:
# Embed, per company, the union of every textual field (Counters are summed).
all_overall_embeddings2 = np.array(
    [
        get_sentence_embeddings_for_categories(
            company['description']
            + company['business_tags']
            + company['niche']
            + company['category']
            + company['most_important_words']
        )
        for _, company in df.iterrows()
    ]
)

### Filtering Title Description Tags Based on Embedding Similarity

This final step filters out weak title-description words by comparing their embeddings against all available company embeddings.

### Main Steps

- **Embedding Similarity**:
  - Compare `title_description` embeddings to the overall embeddings (`all_overall_embeddings2`) for each company.
  - Use cosine similarity to measure relevance.

- **Thresholding**:
  - Words with maximum similarity below a threshold (0.35) are **deleted** from the `title_description` field.

- **Final Cleaning**:
  - Only title-description words strongly connected to the overall context are retained.

### Goal

Ensure that the `title_description` field contains only highly relevant and semantically aligned words, improving overall quality and consistency.

In [105]:

# Drop title words whose best cosine similarity to the company's overall
# vocabulary falls below 0.35. `i` is the positional counter for the
# pre-computed embedding lists; `idx` is the dataframe label.
i = 0
for idx, rows in df_temp.iterrows():
    allovce = np.array(all_overall_embeddings2[i], dtype=np.float32, order='C')
    faiss.normalize_L2(allovce)

    te2 = np.array(title_embeddings2[i], dtype=np.float32, order='C')
    faiss.normalize_L2(te2)

    # Rows = title words, columns = tokens of the combined company fields.
    val_title = np.einsum('jk,lk->jl', te2, allovce)

    list_title = list(rows['title_description'])
    stay_in_loop = 1
    threshold = 0.35
    copy_df_temp_title = df_temp['title_description'][idx].copy()

    for j, title_word in enumerate(list_title):
        mav_val_title = max(val_title[j])
        if mav_val_title < threshold:
            del copy_df_temp_title[title_word]

    df_temp.at[idx, 'title_description'] = copy_df_temp_title
    i += 1

    

In [106]:
# Element-wise Counter sum: fold the surviving title words into each description.
df['description'] = df_temp['description'].add(df_temp['title_description'])

### Building the Final Combined Feature Column

This step creates a new enriched column combining all important fields for each company, preparing the data for the final model.

### Main Steps

- **Combining Fields**:
  - Merge `description`, `business_tags`, `category`, and `niche` fields together.
  - `category` and `niche` are repeated to emphasize their importance.
  - Add `most_important_words`, giving them extra weight if they aren't already present.

- **Weighting Words**:
  - The combined list is transformed into a `Counter`, keeping track of how many times each word appears.

- **Final Adjustments**:
  - Sector information is appended twice to strengthen its presence.
  - A final cleaned version (`new_col`) is stored as a **set** (for uniqueness).
  - A backup version (`df_old['new_col']`) retains the **full counts** before conversion to set form.

### Goal

To build a rich, weighted feature representation of each company that combines descriptions, tags, important words, and sector information, ready for downstream machine learning or matching tasks.

In [107]:
# Build the weighted bag of words per company: description and tags once,
# category and niche twice, then the most important words on top.
for idx in df.index:
    df.at[idx, 'new_col'] = (
        df.at[idx, 'description']
        + df.at[idx, 'business_tags']
        + df.at[idx, 'category']
        + df.at[idx, 'niche']
        + df.at[idx, 'category']
        + df.at[idx, 'niche']
    )

    our_current_list = list(df.at[idx, 'new_col'])
    list_to_add = []
    for word in df['most_important_words'][idx]:
        # A word not seen in any other field is added with double weight.
        repeats = 1 if word in our_current_list else 2
        list_to_add.extend([word] * repeats)

    df.at[idx, 'new_col'].update(Counter(list_to_add))

In [108]:
# Turn every Counter into a list of (word, count) pairs.
df['new_col'] = [list(word_counts.items()) for word_counts in df['new_col']]

# `list_elements` collects every fully upper-case token seen along the way.
list_elements = []
for idx, row in df.iterrows():
    list__ = []
    for token, count in row['new_col']:
        if token.isupper():
            list_elements.append(token)
        list__.extend([token] * count)

    # The sector is appended twice to give it extra weight.
    sector_value = df_old_copy['sector'][idx]
    list__.extend([sector_value, sector_value])

    # `new_col2` keeps the counts; `new_col` keeps only the distinct non-empty tokens.
    df.at[idx, 'new_col2'] = Counter(list__)
    set_List__ = set(list__)
    set_List__.discard('')
    df.at[idx, 'new_col'] = set_List__

In [109]:
# Snapshot of df in which `new_col` carries the counted version (`new_col2`)
# instead of the de-duplicated set.
df_old = df.copy()
df_old['new_col'] = df_old['new_col2'].copy()


In [110]:
# Per-company Counter sum of the niche and category terms.
df_old['count_niche_cat'] = df['niche'].add(df['category'])

In [111]:
# Rebuild `new_col2` from the de-duplicated token sets, so every token counts once.
df['new_col2'] = list(map(Counter, df['new_col']))

In [112]:
# TF-IDF over the de-duplicated per-company token Counters.
_tfidf_outputs = get_tfidf_sparse_matrix(df['new_col2'])
X_csr, feature_names, tf_idf, X = _tfidf_outputs

In [113]:
# Pass the freshly computed TF-IDF outputs to the dictionary builder.
# NOTE(review): presumably this refreshes the module-level TF-IDF lookup
# dictionaries read by later cells - confirm against the helper's definition.
get_dictionary_for_tfidif(X_csr, feature_names, tf_idf, X)

In [114]:
import faiss

zz_o = 0
list_a = []
high_correlation_scores = {}

# The label embeddings are the same for every company: normalise them once
# instead of once per row.
le1 = np.array(label_embeddings, dtype=np.float32, order='C')
faiss.normalize_L2(le1)
label_dim = label_embeddings[0].shape[0]

for idx, rows in df.iterrows():
    zz_o += 1

    word_list = list(rows['new_col'])
    if not word_list:
        # Nothing to filter. Previously an empty row could not be normalised
        # and, had it got further, the relaxation loop below would never end.
        continue

    # Word vectors, zero-padded on the right to the label embedding width.
    padded_vectors = []
    for wd in word_list:
        vector = embeddings_index.get(wd.lower(), np.zeros(300))
        padded_vectors.append(np.pad(vector, (0, label_dim - len(vector)), mode='constant'))
    we1 = np.array(padded_vectors, dtype=np.float32, order='C')
    faiss.normalize_L2(we1)

    # Cosine similarity of every token with every label; keep each token's best.
    el = np.dot(we1, le1.T)
    best_label_score = el.max(axis=1)

    len_current_row_original = len(word_list)
    threshold = 0.30

    # Remove tokens whose best label similarity is <= threshold. If that leaves
    # the row empty (or with a single token when it started with more), relax
    # the threshold by 0.1 and try again from the full row.
    while True:
        current_row = rows['new_col'].copy()
        for word, score in zip(word_list, best_label_score):
            if score <= threshold:
                current_row.remove(word)

        # FIX: emptiness used to be tested with `current_row == Counter()`,
        # which is always False for the sets stored in `new_col`, so a
        # single-token row could end up empty. A stray bare `x` statement that
        # depended on a variable leaked from earlier cells was also removed.
        if not current_row or (len(current_row) <= 1 and len_current_row_original != 1):
            threshold -= 0.1
        else:
            break

    df.at[idx, 'new_col'] = current_row
              

In [115]:
# Mirror the label-filtered token sets into df2. `.copy()` duplicates the
# Series container only; the set objects inside are still shared with df.
df2['new_col'] = df['new_col'].copy()

### Filtering Low-Similarity Tokens

To reduce noise and improve the relevance of our input, we filter out **tokens that have low similarity to any label**.

This step ensures that only tokens with meaningful connections to the labels are kept, helping the classifier focus on the most important features.

Instead of using `cosine_similarity` (which can be slower for large datasets), we use **`np.dot()`** for computing similarity, since our embeddings are already **L2-normalized**. This gives us the same result as cosine similarity but with **better performance**.

### Token Embedding with TF-IDF Weighting

For each company, we compute **embeddings for its individual tokens**. This step prepares the data for efficient similarity search using **FAISS** later on.

We incorporate **TF-IDF weighting** into the embeddings to emphasize more informative terms. Each token's embedding is scaled according to its TF-IDF weight, using the following formula:  
weighted_embedding = (embedding_term × tf-idf_weight) / sum(tf-idf_weights_per_row)     

This ensures that tokens with higher importance (based on TF-IDF) have a greater influence on the final representation of the company, which improves both accuracy and efficiency in label matching and similarity search.

In [116]:
# Placeholder; replaced by the real embedding matrix right after the helper is defined.
vec_embeddings = list()
# Largest number of tokens any company has in `new_col`.
MAX_TOKENS = max(df['new_col'].map(len))
# No-op at module level; kept as in the original cell.
global rows_
def get_sentence_embeddings(word_list, row, target_dim=400):
    """Return one TF-IDF-weighted embedding vector for a collection of tokens.

    Each token vector from ``embeddings_index`` (a 300-zero vector when the
    token is unknown) is scaled by ``tf_idf(token, row) / sum(tf_idf of all
    tokens in word_list)``; the scaled vectors are averaged and the result is
    zero-padded on the right up to ``target_dim``.

    Parameters
    ----------
    word_list : iterable of str
        Tokens of one company (set, list or Counter).
    row : hashable
        Second component of the ``match_words_with_tf_idf_valuess`` keys.
    target_dim : int, default 400
        Output length. Vectors that are already at least this long are
        returned unpadded.

    Returns
    -------
    numpy.ndarray
        1-D vector; all zeros when ``word_list`` is empty or the weights sum to 0.
    """
    # Works for any empty container. The old `== Counter()` test missed empty
    # sets/lists, which then crashed on len() of a NaN mean.
    if not word_list:
        return np.zeros(target_dim)

    weights = [match_words_with_tf_idf_valuess[(word, row)] for word in word_list]
    sum_list = sum(weights)
    if sum_list == 0:
        # Avoid dividing by zero and emitting NaN/inf vectors.
        return np.zeros(target_dim)

    embeddings_ = [
        (weight * embeddings_index.get(word, np.zeros(300))) / sum_list
        for word, weight in zip(word_list, weights)
    ]
    embeddings_ = np.mean(embeddings_, axis=0)

    # Pad against the requested width. This used to be gated on the unrelated
    # TF-IDF vocabulary size `X.shape[1]`, which only worked while it was large.
    if len(embeddings_) < target_dim:
        embeddings_ = np.pad(embeddings_, (0, target_dim - len(embeddings_)), mode='constant')

    return embeddings_
# One weighted vector per company: first for the full token set, then for the
# combined category + niche terms. The `rows` column supplies the TF-IDF row key.
vec_embeddings = np.array(
    [get_sentence_embeddings(company['new_col'], company['rows']) for _, company in df.iterrows()]
)
cat_niche_embeddings = np.array(
    [
        get_sentence_embeddings(company['category'] + company['niche'], company['rows'])
        for _, company in df.iterrows()
    ]
)

### Selecting the Strongest Terms for Each Label

To identify the **strongest term for each label**, we compare all label terms against the tokens of **every company** using semantic similarity.

For each label, we select the **term that achieves the highest similarity score** with any company token. This ensures that the chosen term is not just statistically unique, but also **semantically meaningful and relevant**.

#### Why We Changed the Approach:
Previously, we selected the top term based solely on **TF-IDF score**. However, this often led to choosing **rare adjectives** that had high TF-IDF values but were **not representative** of the label's actual meaning.

By switching to a **semantic similarity-based method**, we now capture terms that are **both meaningful and more generalizable** across companies.

In [117]:
def replace_word_with_right_one(core_word):
    """Strip a trailing "es" or "s" when the shortened word is a known term.

    "es" is tried first, then "s"; the first shortened form that is a key of
    ``match_average_words_with_tf_idf_valuess`` is returned. If neither
    matches, ``core_word`` is returned unchanged.
    """
    for suffix in ("es", "s"):
        if not core_word.endswith(suffix):
            continue
        stem = core_word[:-len(suffix)]
        if stem in match_average_words_with_tf_idf_valuess:
            return stem
    return core_word

### Generating Final Labels with Domain, Modifier, and Core Structure

This step restructures each label by identifying a domain, modifier, and core term to create more consistent, structured class names, which will later be used to boost important terms during classification.

### Main Steps

- **Splitting Labels**:
  - Parse the `refined_labels2` field using NLP to extract:
    - **Domain** = first word,
    - **Core** = last word,
    - **Modifier** = best middle word (chosen by highest TF-IDF score).

- **Handling Special Cases**:
  - Simplified rules are applied for labels with only 2–3 words.
  - Bigrams and trigrams are considered to select the best modifier candidate.

- **Marking Generic Terms**:
  - Generic or very generic domain/core/modifier words are marked (e.g., `__1`, `::1`) to help during boosting.

- **Final Label Construction**:
  - Create `new_label_with_categories` by combining domain, modifier, and core.

- **Later Usage**:
  - During classification, terms extracted from this structured label are **boosted** to increase their influence and improve prediction accuracy.

### Goal

Create a systematic, structured version of each label that can be leveraged during classification to better guide and boost the most meaningful terms.

In [118]:
our_classes['refined_labels'] = our_classes['new_label']
our_classes['refined_labels2'] = our_classes['new_label']

# Drop multi-word terms ("a_b...") whose first two parts both already appear
# as stand-alone terms of the same label; empty strings are dropped as well.
for idx, row in our_classes.iterrows():
    value = row['refined_labels'][-1]
    label_terms = list(row['new_label'])

    kept_terms = []
    for term in row['new_label']:
        parts = term.split("_")
        is_redundant = (
            len(parts) != 1
            and parts[0] in label_terms
            and parts[1] in label_terms
        )
        if term and not is_redundant:
            kept_terms.append(term)

    # Never leave a label empty: fall back to its last original term.
    if kept_terms == []:
        kept_terms.append(value)

    our_classes.at[idx, 'refined_labels2'] = kept_terms


In [119]:
core = [""] * len(our_classes['new_label2'])
modifier = [""] * len(our_classes['new_label'])
domain = [""] * len(our_classes['new_label'])

# Split every refined label into domain (first token), core (last token) and,
# when there are tokens in between, a modifier.
for i in range(len(our_classes['refined_labels2'])):
    doc = nlp(" ".join(our_classes.at[i, 'refined_labels2']))
    list_elements = [token.text for token in doc]
    token_count = len(doc)

    if token_count > 3:
        domain[i] = list_elements[0]
        core[i] = list_elements[-1]

        # Candidate modifiers: the middle tokens plus the n-grams built from them.
        # NOTE(review): the 3-gram call runs on the list that already contains
        # the 2-grams - confirm this is what `new_ngrams` expects.
        list_elements = list_elements[1:-1]
        list_elements.extend(new_ngrams(list_elements, 2))
        list_elements.extend(new_ngrams(list_elements, 3))

        # Keep the candidate with the highest average/count ratio.
        best_value_term = -1
        best_word = ""
        for word in list_elements:
            candidate_value = (
                match_average_words_with_tf_idf_valuess[word]
                / match_number_words_with_tf_idf_valuess[word]
            )
            if best_value_term < candidate_value:
                best_value_term = candidate_value
                best_word = word
        modifier[i] = best_word
    elif token_count == 3:
        domain[i], modifier[i], core[i] = list_elements
    elif token_count == 2:
        domain[i], core[i] = list_elements
    else:
        # Single-token label: only the domain slot is filled.
        domain[i] = doc[0].text


In [120]:
# Core words so common that they are tagged with "::1" in the structured label.
very_generic_core_terms = {"promotion", "production", "services", "service"}

In [121]:

# Tag generic parts and replace empty slots with "0" (lists are edited in place).
for i, (core_term, domain_term, modifier_term) in enumerate(zip(core, domain, modifier)):
    if core_term in generic_core_terms:
        core_term = core_term + "__1"
    # Checked after the "__1" tag, so a term already tagged above no longer
    # matches unless its tagged form is itself listed.
    if core_term in very_generic_core_terms:
        core_term = core_term + "::1"
    if core_term == "":
        core_term = "0"
    core[i] = core_term

    if domain_term in generic_domain_terms:
        domain[i] = domain_term + "__1"

    if modifier_term == "":
        modifier[i] = "0"

In [122]:
# Store "<domain> <modifier> <core>" for every label.
for i, label_parts in enumerate(zip(domain, modifier, core)):
    our_classes.at[i, 'new_label_with_categories'] = " ".join(label_parts)


### Saving Relevant Data to a New File

The Python program was getting too long and slow to run all at once.  
To make things easier and faster, we saved only the **important data** we found into **separate files**.

This helped us keep the project organized and continue working in smaller, manageable steps.

In [123]:
df['niche_plus_cat'] = df['niche']

# Per company: list of distinct niche + category terms, and the sector
# rewritten as lower-case with underscores instead of spaces.
for idx in df.index:
    merged_terms = df.at[idx, 'niche'] + df.at[idx, 'category']
    df.at[idx, 'niche_plus_cat'] = list(merged_terms)

    sector_name = df.at[idx, 'sector']
    df.at[idx, 'sector'] = sector_name.replace(" ", "_").lower()

In [124]:
# Replace every counted bag with a plain dict that has no '' key.
for idx in df_old.index:
    cleaned_counts = dict(df_old.at[idx, 'new_col'].items())
    cleaned_counts.pop('', None)
    df_old.at[idx, 'new_col'] = cleaned_counts

In [125]:
# Companies without a description get the mean of the two label-based
# similarity rows in place of their context-similarity row.
for idx, description_text in enumerate(df2['description']):
    if description_text != '':
        continue
    label_row_sum = similarity_matrix_label2[idx] + similarity_matrix_label3[idx]
    similarity_matrix[idx] = label_row_sum / 2


In [126]:
# Numeric artefacts, written as text with 12 decimals (same order as before).
_matrix_exports = (
    ("vec_embeddings.txt", vec_embeddings),
    ("label_embeddings.txt", label_embeddings),
    ("cat_niche_embeddings.txt", cat_niche_embeddings),
    ("context_matrix.txt", similarity_matrix),
    ("niche_matrix.txt", similarity_matrix_label2),
    ("category_matrix.txt", similarity_matrix_label3),
    ("similarity_desc_to_niche.txt", similarity_matrix2),
    ("similarity_desc_to_category.txt", similarity_matrix3),
    ("first_sentence_matrix_total1.txt", similarity_matrix_first_sentence),
)
for _file_name, _matrix in _matrix_exports:
    np.savetxt(_file_name, _matrix, fmt="%.12f")

# Tabular artefacts, one CSV per column.
_column_exports = (
    (df['new_col'], 'tokens_vector.csv'),
    (df_old['new_col'], 'counted_elements.csv'),
    (df_old['count_niche_cat'], 'counted_categories.csv'),
    (df['sector'], 'sectors.csv'),
    (df['niche_plus_cat'], 'our_current_categories.csv'),
    (our_classes['new_label_with_categories'], 'new_label_with_categories.csv'),
)
for _column, _csv_name in _column_exports:
    _column.to_csv(_csv_name)

In [127]:
# Highest TF-IDF weight among each company's tokens (-1.0 for an empty row).
# NOTE(review): `max_v1` is overwritten on every row and not read in this
# cell - this looks like a leftover check.
ddd = 0
for idx, row in enumerate(df['new_col']):
    row_weights = [match_words_with_tf_idf_valuess[(wd, ddd)] for wd in row]
    max_v1 = max([-1.0] + row_weights)
    ddd += 1

In [128]:
# One line per (word, row) key: "<word> <row> <tf-idf value>".
fout = "match_words_with_tf_idf.txt"

# The context manager closes the file even if a write fails part-way.
with open(fout, "w") as fo:
    for k, v in match_words_with_tf_idf_valuess.items():
        fo.write(str(k[0]) + " " + str(k[1]) + ' ' + str(v) + '\n')

In [129]:
# One line per company: its most important words plus title words, each
# followed by a single space.
fout = "most_important_words.txt"

# The context manager closes the file even if a write fails part-way.
with open(fout, "w") as fo:
    for idx, _ in df.iterrows():
        # NOTE(review): this reads `title_description` from df, while the
        # similarity filter earlier wrote its result to df_temp - confirm df
        # holds the filtered version.
        list_cur = df['most_important_words'][idx] + df['title_description'][idx]
        for i in list(list_cur):
            fo.write(i)
            fo.write(" ")
        fo.write("\n")

In [130]:
# One line per word: "<word> <accumulated value / occurrence count>".
fout = "average_tf_idf_per_word.txt"

# The context manager closes the file even if a write fails part-way.
with open(fout, "w") as fo:
    for k, v in match_average_words_with_tf_idf_valuess2.items():
        fo.write(str(k) + ' ' + str(v / (match_number_words_with_tf_idf_valuess2[k])) + '\n')

In [131]:
# One line per correlated pair: "<first term> <second term>".
fout = "correlated_terms.txt"

# The context manager closes the file even if a write fails part-way.
with open(fout, "w") as fo:
    for k in correlated_terms:
        fo.write(str(k[0]) + " " + str(k[1]) + '\n')