In [36]:
import pandas as pd
import os
import spacy
import networkx as nx
from sklearn.metrics.cluster import silhouette_score
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [4]:
# Determine the location of the dataframe containing the typo-corrected text
file_location = "../data/example_data/output/1k_subset/translated_products.parquet"

# Read the dataframe
full_df = pd.read_parquet(file_location)

# Display dataframe
display(full_df)

Unnamed: 0,products_id,products_and_services,language (ISO-code),typo_corrected,translated_text
0,164399edbf8e880dc2e856f50d51e720bd0a8abe,"fish, frozen and deep-frozen",en,"fish, frozen and deep-frozen","fish, frozen and deep-frozen"
1,b0d3c55743b1b858ec2843c8870116bb8af543fd,drilling and test boring - equipment,en,drilling and test boring - equipment,drilling and test boring - equipment
2,b14c038972e6a52bfbf3ffbe77def57a62c5b9cf,well-management services,nl,well-management services,well-management services
3,abadc2542b4b5c1ecfe41c22afb2347b1d9b65af,electronic data processing - software,en,electronic data processing - software,electronic data processing - software
4,60c58ad2ef34d96fae028f1039fab03dec9eb9a2,communication,it,communication,communication
...,...,...,...,...,...
995,ecfa87628d5e5249db7730bf2bed09f8c4419dc9,roofing materials,en,looking materials,roofing materials
996,1e15564c0be3b13bb11516b0499fe75d8d7f976b,"energy conservation, consultants",en,"energy conservation, consultant","energy conservation, consultants"
997,eb4f959622941f4dd9b40355669cba97858ba66b,infrared heaters,en,inflamed theaters,infrared heaters
998,08ea29fcbe3c2f14f62b96e5af2f7e838fff0773,bakery and confectionery industry - machinery ...,en,baker and confectioner industry - machinery an...,bakery and confectionery industry - machinery ...


In [6]:
documents = full_df["translated_text"].to_list()

In [8]:
nlp = spacy.load('en_core_web_lg')

def lemmatize(txt):
    lemmatised_list = [token.lemma_.lower() for token in nlp(txt) if not (token.is_stop or token.is_punct)]
    return(lemmatised_list)

In [9]:
texts = [[text for text in lemmatize(doc)] for doc in documents]

In [10]:
from gensim import corpora
from gensim.similarities import MatrixSimilarity
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(docString) for docString in texts]

In [41]:
corpus

[[(0, 1), (1, 1), (2, 2)],
 [(3, 1), (4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1)],
 [(14, 1), (15, 1), (16, 1)],
 [(17, 1), (18, 1), (19, 1), (20, 1)],
 [(19, 1), (20, 1), (21, 1)],
 [(19, 1), (20, 1), (22, 1), (23, 1), (24, 1)],
 [(25, 1), (26, 1), (27, 1), (28, 1)],
 [(27, 1), (28, 1), (29, 1)],
 [(30, 1), (31, 1), (32, 1), (33, 1)],
 [(34, 1)],
 [(35, 1), (36, 1), (37, 1)],
 [(38, 1), (39, 1), (40, 1), (41, 1)],
 [(42, 1)],
 [(43, 1), (44, 1)],
 [(45, 1), (46, 1), (47, 1)],
 [(8, 1), (48, 1), (49, 1)],
 [(50, 1), (51, 1)],
 [(52, 1), (53, 1)],
 [(14, 1), (53, 1)],
 [(54, 1), (55, 1)],
 [(51, 1), (53, 1), (56, 1)],
 [(57, 1), (58, 1)],
 [(59, 1), (60, 1)],
 [(59, 1), (61, 1)],
 [(62, 1), (63, 1)],
 [(64, 1), (65, 1)],
 [(66, 1), (67, 1)],
 [(68, 1), (69, 1), (70, 1)],
 [(35, 1), (70, 1)],
 [(71, 1), (72, 1)],
 [(5, 1), (73, 1), (74, 1), (75, 1)],
 [(5, 1), (33, 1), (35, 1), (76, 1), (77, 1)],
 [(78, 1), (79, 1), (80, 1), (81, 1)],
 [(5,

In [11]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x246983365c0>

In [12]:
n_best = 20
index = MatrixSimilarity(corpus=corpus,
                   num_features=len(dictionary),
                   num_best = n_best)

In [39]:
index.

<gensim.similarities.docsim.MatrixSimilarity at 0x246957cfbb0>

In [13]:
doc_id = 0
similar_docs = {}

for similarities in index:
    similar_docs[doc_id] = list(enumerate(similarities))
    doc_id += 1
    

In [14]:
row_original_list = []
row_match_list = []
similarity_list = []

for doc_id, sim_doc_tuple_list in similar_docs.items():
  for sim_doc_tuple in sim_doc_tuple_list:
    
     row_match = sim_doc_tuple[1][0]
     similarity = sim_doc_tuple[1][1]
     
     row_original_list.append(doc_id)
     row_match_list.append(row_match)
     similarity_list.append(similarity)
     

In [15]:
df_temp = pd.DataFrame({"row_original": row_original_list, "row_match": row_match_list, "similarity": similarity_list})
df_temp  = df_temp[df_temp["row_original"] != df_temp["row_match"]]

In [16]:
lookup = full_df[["translated_text"]]

In [17]:
lookup

Unnamed: 0,translated_text
0,"fish, frozen and deep-frozen"
1,drilling and test boring - equipment
2,well-management services
3,electronic data processing - software
4,communication
...,...
995,roofing materials
996,"energy conservation, consultants"
997,infrared heaters
998,bakery and confectionery industry - machinery ...


In [18]:
df_temp = df_temp.merge(lookup, how = "inner", left_on = "row_original", right_index = True)
df_temp = df_temp.merge(lookup, how = "inner", left_on = "row_match", right_index = True)

In [19]:
df_temp

Unnamed: 0,row_original,row_match,similarity,translated_text_x,translated_text_y
1,0,887,0.866025,"fish, frozen and deep-frozen",frozen fish
1534,114,887,0.707107,fish,frozen fish
1877,143,887,0.577350,frozen and deep-frozen foods,frozen fish
3584,273,887,0.577350,"meats, frozen and deep-frozen",frozen fish
11626,901,887,0.353553,"fish, seafood and snails, preserved",frozen fish
...,...,...,...,...,...
12547,979,980,0.408248,antivibration mats,anti-slip mats
12621,988,980,0.577350,mats,anti-slip mats
12550,980,979,0.408248,anti-slip mats,antivibration mats
12620,988,979,0.707107,mats,antivibration mats


In [23]:
cluster_threshold = 10

df_continue = df_temp
df_accepted = pd.DataFrame(columns = ["cluster_id", "id", "sim_threshold", "cluster_threshold"])

sim_threshold = 0.5
cluster_id = 0

In [24]:
while sim_threshold < 1.01 and len(df_continue.index) > 0:
  
  print("Starting new round with a similarity threshold above", sim_threshold, ".")
 
  df_continue_temp = df_continue[df_continue["similarity"] > sim_threshold]
  print("-- Inferring clusters from", len(df_continue_temp.index), "rows.")
  duplicate_tuples_list = list(zip(df_continue_temp.row_original, df_continue_temp.row_match))
  G = nx.Graph()
  G.add_edges_from(duplicate_tuples_list)
  print("-- Created connected components.")
  cluster_list = [connected_component for connected_component in nx.connected_components(G)]
  clusters_list = []
  ids_list = []
  
  for cluster in cluster_list:
    for ids in cluster:
      clusters_list.append(cluster_id)
      ids_list.append(ids)
    cluster_id += 1
  
  df_w_clusters = pd.DataFrame({'cluster_id': clusters_list, 'id': ids_list})
  df_w_clusters["sim_threshold"] = sim_threshold
  df_w_clusters["cluster_threshold"] = cluster_threshold
  
  print("-- Identified", len(set(df_w_clusters["cluster_id"])), "clusters in this iteration.")
   
  cluster_size = df_w_clusters.groupby(['cluster_id']).size().reset_index(name='counts')
  big_cluster_ids = cluster_size[cluster_size["counts"] > cluster_threshold]["cluster_id"]
  print("-- From these clusters", len(set(big_cluster_ids)), "have a size higher than the cluster threshold", cluster_threshold,".")
  
  df_accepted_temp = df_w_clusters[~df_w_clusters["cluster_id"].isin(big_cluster_ids)]
  print("-- Accepted", len(df_accepted_temp.index), "new clustered rows in", len(set(df_accepted_temp["cluster_id"])), "clusters in this iteration.")
  
  df_accepted = pd.concat([df_accepted, df_accepted_temp], verify_integrity = True, ignore_index = True)
  print("-- This makes a temporary total of", len(df_accepted.index), "rows in", len(set(df_accepted["cluster_id"])), "clusters in this iteration.")
  
  ids_continue = df_w_clusters[df_w_clusters["cluster_id"].isin(big_cluster_ids)]["id"]
  df_continue = df_continue_temp[df_continue_temp["row_original"].isin(ids_continue) | df_continue_temp["row_match"].isin(ids_continue)]
  print("-- Continuing with", len(df_continue.index), "rows for which cluster sizes where above cluster threshold", cluster_threshold, "when using a minimal similarity of", sim_threshold,".")
  
  cluster_id += 1
  print("-- Using cluster id", cluster_id, "in the next round.")
  
  sim_threshold += 0.05
  print("-- Similarity threshold increased to", sim_threshold, ".")
#> Starting new round with a similarity threshold above 0.5 .
#> -- Inferring clusters from 10 rows.
#> -- Created connected components.
#> -- Identified 4 clusters in this iteration.
#> -- From these clusters 0 have a size higher than the cluster threshold 10 .
#> -- Accepted 9 new clustered rows in 4 clusters in this iteration.
#> -- This makes a temporary total of 9 rows in 4 clusters in this iteration.
#> -- Continuing with 0 rows for which cluster sizes where above cluster threshold 10 when using a minimal similarity of 0.5 .
#> -- Using cluster id 5 in the next round.
#> -- Similarity threshold increased to 0.55 .

Starting new round with a similarity threshold above 0.5 .
-- Inferring clusters from 1951 rows.
-- Created connected components.
-- Identified 77 clusters in this iteration.
-- From these clusters 3 have a size higher than the cluster threshold 10 .
-- Accepted 209 new clustered rows in 74 clusters in this iteration.
-- This makes a temporary total of 209 rows in 74 clusters in this iteration.
-- Continuing with 1631 rows for which cluster sizes where above cluster threshold 10 when using a minimal similarity of 0.5 .
-- Using cluster id 78 in the next round.
-- Similarity threshold increased to 0.55 .
Starting new round with a similarity threshold above 0.55 .
-- Inferring clusters from 1428 rows.
-- Created connected components.
-- Identified 15 clusters in this iteration.
-- From these clusters 8 have a size higher than the cluster threshold 10 .
-- Accepted 21 new clustered rows in 7 clusters in this iteration.
-- This makes a temporary total of 230 rows in 81 clusters in this ite

In [27]:
df_accepted

Unnamed: 0,cluster_id,id,sim_threshold,cluster_threshold,translated_text_x,products_id_x,translated_text_y,products_id_y
0,0,0,0.50,10,"fish, frozen and deep-frozen",164399edbf8e880dc2e856f50d51e720bd0a8abe,"fish, frozen and deep-frozen",164399edbf8e880dc2e856f50d51e720bd0a8abe
1,0,143,0.50,10,frozen and deep-frozen foods,acb528300eeaed4754de62290cd90abce083f815,frozen and deep-frozen foods,acb528300eeaed4754de62290cd90abce083f815
2,0,273,0.50,10,"meats, frozen and deep-frozen",8a96bec838c62b2fcc32ab96ec79819dc8eae0ba,"meats, frozen and deep-frozen",8a96bec838c62b2fcc32ab96ec79819dc8eae0ba
3,0,114,0.50,10,fish,64875fcccaac069fcb3e0e201e7d5b9166641608,fish,64875fcccaac069fcb3e0e201e7d5b9166641608
4,0,887,0.50,10,frozen fish,6b44506820dfed44bcb8beb00d2c614e7be73070,frozen fish,6b44506820dfed44bcb8beb00d2c614e7be73070
...,...,...,...,...,...,...,...,...
404,146,884,0.75,10,stainless steel,9ffdf6f70bed29d2d7e786b6222c0ae8599eed25,stainless steel,9ffdf6f70bed29d2d7e786b6222c0ae8599eed25
405,150,67,0.85,10,forming - steels and metals,3d9f448965b55e5d68cabe4429ae1100b6d41383,forming - steels and metals,3d9f448965b55e5d68cabe4429ae1100b6d41383
406,150,6,0.85,10,steels and metals - forming and cutting,d379f2ced32e7806893e4232fdbd0f9544146ae8,steels and metals - forming and cutting,d379f2ced32e7806893e4232fdbd0f9544146ae8
407,151,936,0.85,10,screw cutting - steels and metals,acdf7d1952cf9a95ff2500e6b90cab8dd724a170,screw cutting - steels and metals,acdf7d1952cf9a95ff2500e6b90cab8dd724a170


In [28]:
lookup = full_df[["translated_text", "products_id"]]

df_accepted = df_accepted.merge(lookup, how = "inner", left_on = "id", right_index = True)

df_accepted_w_clusters = df_accepted[["products_id", "translated_text", "cluster_id", "sim_threshold", "cluster_threshold"]]

In [30]:
df_accepted_w_clusters = df_accepted_w_clusters.sort_values(by = ["cluster_id", "translated_text"])

In [32]:
df_accepted_w_clusters

Unnamed: 0,products_id,translated_text,cluster_id,sim_threshold,cluster_threshold
3,64875fcccaac069fcb3e0e201e7d5b9166641608,fish,0,0.50,10
0,164399edbf8e880dc2e856f50d51e720bd0a8abe,"fish, frozen and deep-frozen",0,0.50,10
1,acb528300eeaed4754de62290cd90abce083f815,frozen and deep-frozen foods,0,0.50,10
4,6b44506820dfed44bcb8beb00d2c614e7be73070,frozen fish,0,0.50,10
2,8a96bec838c62b2fcc32ab96ec79819dc8eae0ba,"meats, frozen and deep-frozen",0,0.50,10
...,...,...,...,...,...
403,690befac258d46ab0c47fca50216ad50bd48c22d,stainless steels,146,0.75,10
405,3d9f448965b55e5d68cabe4429ae1100b6d41383,forming - steels and metals,150,0.85,10
406,d379f2ced32e7806893e4232fdbd0f9544146ae8,steels and metals - forming and cutting,150,0.85,10
408,8a5e46a4e4dde6021faefdb40ef635ad26b75129,cutting - steels and metals,151,0.85,10


In [37]:
x = df_accepted_w_clusters.merge(full_df, how = "inner", left_on = "products_id", right_on = "products_id").drop(columns=["sim_threshold", "cluster_threshold", "products_and_services", "typo_corrected", "translated_text_y"])

In [38]:
x

Unnamed: 0,products_id,translated_text_x,cluster_id,language (ISO-code)
0,64875fcccaac069fcb3e0e201e7d5b9166641608,fish,0,sq
1,164399edbf8e880dc2e856f50d51e720bd0a8abe,"fish, frozen and deep-frozen",0,en
2,acb528300eeaed4754de62290cd90abce083f815,frozen and deep-frozen foods,0,en
3,6b44506820dfed44bcb8beb00d2c614e7be73070,frozen fish,0,en
4,8a96bec838c62b2fcc32ab96ec79819dc8eae0ba,"meats, frozen and deep-frozen",0,en
...,...,...,...,...
404,690befac258d46ab0c47fca50216ad50bd48c22d,stainless steels,146,fi
405,3d9f448965b55e5d68cabe4429ae1100b6d41383,forming - steels and metals,150,da
406,d379f2ced32e7806893e4232fdbd0f9544146ae8,steels and metals - forming and cutting,150,en
407,8a5e46a4e4dde6021faefdb40ef635ad26b75129,cutting - steels and metals,151,en
