# 3. Deduplication

## 3.1 Import the required libraries

In [1]:
from unidecode import unidecode
from deduplipy.deduplicator import Deduplicator
import pandas as pd
import numpy as np
import datetime
import dedupe
import pickle
import uuid
import re
import os

# Namespace passed to uuid.uuid5 when the product IDs are generated below.
# Any UUID can serve as the namespace, but uuid5 is deterministic in (namespace, name):
# changing this value changes every generated product ID.
namespace = uuid.NAMESPACE_DNS

## 3.2 Load Data

### Mandatory input files

In [2]:
# Load the translated products table (mandatory input).
#
# Only the local, file-based source is implemented. The Databricks branch used to be a
# bare `pass`, which left `translated_df` undefined and made the next cell fail with an
# unrelated-looking NameError. It now fails with an explicit message, unless
# `translated_df` has already been provided some other way.
if 'DATABRICKS_RUNTIME_VERSION' in os.environ:
    if 'translated_df' not in globals():
        raise NotImplementedError(
            "Loading the translated products on Databricks is not implemented: "
            "define `translated_df` before running the deduplication cells."
        )
else:
    translated_file_location = "../../data/example_data/output/tilt_products_translated.csv"
    tilt_products_deduped = "../../data/example_data/output/tilt_products_deduped.csv"
    # Read the data and discard rows without a product description; the description
    # is the input to uuid.uuid5 in the next cell.
    translated_df = pd.read_csv(translated_file_location).dropna(subset=['products_and_services'])
    # Drop the first column by position.
    # NOTE(review): presumably the index column written by an earlier to_csv — confirm.
    translated_df = translated_df.drop(translated_df.columns[0], axis=1)

In [3]:
def _product_uuid(product_name):
    """Return the uuid5 of a product description within `namespace`."""
    return uuid.uuid5(namespace, product_name)


# Identical descriptions hash to the same UUID, so the ID is reproducible across runs.
translated_df['new_product_id'] = translated_df['products_and_services'].apply(_product_uuid)

In [4]:
# Preview the translated products together with the newly derived new_product_id column.
translated_df

Unnamed: 0,products_id,raw_products_and_services,products_and_services,new_product_id
0,1841bbb0-07cc-524a-a696-0f1aac447089,agricultural production,agricultural production,1841bbb0-07cc-524a-a696-0f1aac447089
1,b5570a99-7759-5c83-955a-b36b37e84eed,water absorption plants,water absorption plants,b5570a99-7759-5c83-955a-b36b37e84eed
2,c89d7884-9568-55d1-a79f-9030cf7c7677,moisture absorption,moisture absorption,c89d7884-9568-55d1-a79f-9030cf7c7677
3,f78816f2-b354-59d4-8175-59a2c6500f9a,accessories for watering systems,accessories for watering systems,f78816f2-b354-59d4-8175-59a2c6500f9a
4,b06ae426-94d7-5186-90f8-99c70c0c09b3,agricultural products,agricultural products,b06ae426-94d7-5186-90f8-99c70c0c09b3
...,...,...,...,...
54519,8cd89ead-558e-5b13-9876-c6585d4245a6,dental veneers,dental veneers,8cd89ead-558e-5b13-9876-c6585d4245a6
54520,97803d28-f232-5cf8-8873-9ed8a1fe9a2b,invisalign,invisalign,97803d28-f232-5cf8-8873-9ed8a1fe9a2b
54521,bfeb0cb1-b4ef-5b43-83a8-14b5aaf91a11,sheet veneer,sheet veneer,bfeb0cb1-b4ef-5b43-83a8-14b5aaf91a11
54522,1084e16f-a815-5075-bdfb-d5d8acabe2e9,invisible braces,invisible braces,1084e16f-a815-5075-bdfb-d5d8acabe2e9


In [5]:
# Company-to-product mapping; the first CSV column is used as the index.
company_products_mapping = pd.read_csv(
    "../../data/example_data/output/tilt_company_products.csv",
    index_col=0,
)

In [6]:
# Attach the product details (including new_product_id) to every company/product row.
# A left join keeps every row of the mapping.
company_product_complete = pd.merge(
    company_products_mapping,
    translated_df,
    how='left',
    on="products_id",
)

In [7]:
# Re-key the company-to-product mapping onto the regenerated product IDs.
# NOTE(review): the left merge above gives a missing new_product_id to any mapping row
# whose products_id is not in translated_df (e.g. rows dropped for a missing
# description); those rows are kept here — confirm that is intended.
company_products_mapping = (
    company_product_complete
    .loc[:, ["id", "new_product_id"]]
    .drop_duplicates()
    .rename(columns={"new_product_id": "products_id"})
)

In [8]:
# Replace the IDs read from the CSV with the regenerated ones, keep only the two
# columns needed downstream, and remove repeated rows. Written as one chain that
# rebinds `translated_df` rather than editing the frame in place.
translated_df = (
    translated_df
    .drop("products_id", axis=1)
    .rename(columns={"new_product_id": "products_id"})
    .loc[:, ['products_id', 'products_and_services']]
    .drop_duplicates()
)

In [29]:
# Preview the reduced frame: one row per (products_id, products_and_services) pair.
translated_df

Unnamed: 0,products_id,products_and_services
0,1841bbb0-07cc-524a-a696-0f1aac447089,agricultural production
1,b5570a99-7759-5c83-955a-b36b37e84eed,water absorption plants
2,c89d7884-9568-55d1-a79f-9030cf7c7677,moisture absorption
3,f78816f2-b354-59d4-8175-59a2c6500f9a,accessories for watering systems
4,b06ae426-94d7-5186-90f8-99c70c0c09b3,agricultural products
...,...,...
54519,8cd89ead-558e-5b13-9876-c6585d4245a6,dental veneers
54520,97803d28-f232-5cf8-8873-9ed8a1fe9a2b,invisalign
54521,bfeb0cb1-b4ef-5b43-83a8-14b5aaf91a11,sheet veneer
54522,1084e16f-a815-5075-bdfb-d5d8acabe2e9,invisible braces


## 3.3 Deduplication

In [None]:
# Train a DedupliPy model on the product descriptions.
# NOTE(review): interaction=True presumably makes fit() ask for manual labels — confirm
# in the DedupliPy documentation before including this cell in an unattended run.
myDedupliPy = Deduplicator(col_names=['products_and_services'], interaction=True, verbose=1)
myDedupliPy.fit(translated_df)

In [25]:
# Persist the fitted deduplicator so later runs can reuse it instead of refitting.
with open('myDeduplipy.pkl', 'wb') as model_file:
    pickle.dump(myDedupliPy, model_file)

In [9]:
# Load the previously saved deduplicator.
# Unpickling can execute arbitrary code: only load a 'myDeduplipy.pkl' you produced yourself.
with open('myDeduplipy.pkl', 'rb') as model_file:
    pre_trained_deduplipy = pickle.load(model_file)

In [10]:
# Score threshold handed to predict().
# NOTE(review): presumably only pairs scoring at least this much are linked — confirm
# in the DedupliPy documentation.
SCORE_THRESHOLD = 0.95
deduplipy_result = pre_trained_deduplipy.predict(translated_df, score_threshold=SCORE_THRESHOLD)

blocking started
blocking finished
Nr of pairs: 22201845
scoring started
scoring finished
Nr of filtered pairs: 98
Clustering started
Clustering finished


In [11]:
# Preview the prediction result; the recorded output shows a deduplication_id per description.
deduplipy_result

Unnamed: 0,products_and_services,deduplication_id
0,agricultural production,184
1,water absorption plants,185
2,moisture absorption,186
3,accessories for watering systems,187
4,agricultural products,188
...,...,...
53592,dental veneers,53592
53593,invisalign,53593
53594,sheet veneer,53594
53595,invisible braces,53595


In [33]:
# Bring the product IDs back next to the deduplication result, matching on the description.
# merge returns a new frame, so deduplipy_result itself is left untouched.
combined_df = pd.merge(
    deduplipy_result,
    translated_df,
    how='left',
    on='products_and_services',
)

In [34]:
# Preview the deduplication result joined with the product IDs.
combined_df

Unnamed: 0,products_and_services,deduplication_id,products_id
0,agricultural production,184,1841bbb0-07cc-524a-a696-0f1aac447089
1,water absorption plants,185,b5570a99-7759-5c83-955a-b36b37e84eed
2,moisture absorption,186,c89d7884-9568-55d1-a79f-9030cf7c7677
3,accessories for watering systems,187,f78816f2-b354-59d4-8175-59a2c6500f9a
4,agricultural products,188,b06ae426-94d7-5186-90f8-99c70c0c09b3
...,...,...,...
53592,dental veneers,53592,8cd89ead-558e-5b13-9876-c6585d4245a6
53593,invisalign,53593,97803d28-f232-5cf8-8873-9ed8a1fe9a2b
53594,sheet veneer,53594,bfeb0cb1-b4ef-5b43-83a8-14b5aaf91a11
53595,invisible braces,53595,1084e16f-a815-5075-bdfb-d5d8acabe2e9


In [35]:
# Within each deduplication group, use the products_id of the group's first row as the
# "prime" ID and store it on every row of that group.
combined_df["prime_products_id"] = (
    combined_df
    .groupby("deduplication_id")["products_id"]
    .transform(lambda group_ids: group_ids.iloc[0])
)

In [36]:
# Preview the company-to-product mapping before it is re-keyed onto the prime product IDs.
company_products_mapping

Unnamed: 0,id,products_id
0,agrobiogel-gmbh_00000005421207-757715001,1841bbb0-07cc-524a-a696-0f1aac447089
1,agrobiogel-gmbh_00000005421207-757715001,b5570a99-7759-5c83-955a-b36b37e84eed
2,agrobiogel-gmbh_00000005421207-757715001,c89d7884-9568-55d1-a79f-9030cf7c7677
3,agrobiogel-gmbh_00000005421207-757715001,f78816f2-b354-59d4-8175-59a2c6500f9a
4,agrobiogel-gmbh_00000005421207-757715001,a3d5a68b-912a-5bdf-bd38-dd42e23cac11
...,...,...
2127426,castlewood-floors_00000004567199-400676001,7bc7c13e-a741-52f1-90fe-e8acb2ae7453
2127427,castlewood-floors_00000004567199-400676001,2b8cf3dc-99c9-52b3-9314-c2f4cf16634a
2127428,floorsave_00000003927703-208335001,bfabacaf-d3b4-5874-8812-2664e3f91a82
2127429,floorsave_00000003927703-208335001,2c94f677-3b36-5b1d-803a-f8775c8b7257


In [37]:
# Right join: keep every company/product row and attach its prime product ID.
full_listing = pd.merge(
    combined_df,
    company_products_mapping,
    how="right",
    on="products_id",
)

In [38]:
# Point every company at the prime product ID of its product and drop the rows that
# become identical once duplicates share one ID.
company_products_mapping = (
    full_listing
    .loc[:, ["id", "prime_products_id"]]
    .drop_duplicates()
    .rename(columns={"prime_products_id": "products_id"})
)

In [39]:
# Preview the company-to-product mapping after re-keying onto the prime product IDs.
company_products_mapping

Unnamed: 0,id,products_id
0,agrobiogel-gmbh_00000005421207-757715001,1841bbb0-07cc-524a-a696-0f1aac447089
1,agrobiogel-gmbh_00000005421207-757715001,b5570a99-7759-5c83-955a-b36b37e84eed
2,agrobiogel-gmbh_00000005421207-757715001,c89d7884-9568-55d1-a79f-9030cf7c7677
3,agrobiogel-gmbh_00000005421207-757715001,f78816f2-b354-59d4-8175-59a2c6500f9a
4,agrobiogel-gmbh_00000005421207-757715001,a3d5a68b-912a-5bdf-bd38-dd42e23cac11
...,...,...
2127398,castlewood-floors_00000004567199-400676001,7bc7c13e-a741-52f1-90fe-e8acb2ae7453
2127399,castlewood-floors_00000004567199-400676001,2b8cf3dc-99c9-52b3-9314-c2f4cf16634a
2127400,floorsave_00000003927703-208335001,bfabacaf-d3b4-5874-8812-2664e3f91a82
2127401,floorsave_00000003927703-208335001,2c94f677-3b36-5b1d-803a-f8775c8b7257


In [41]:
# One row per deduplicated product: the prime ID plus the description of the first row
# that carries it. Built from `combined_df`; the cell previously read from `test`, a name
# that is never defined in this notebook, so it raised NameError on a fresh kernel.
deduplicated_products = (
    combined_df[["prime_products_id", "products_and_services"]]
    .rename(columns={"prime_products_id": "products_id"})
    .drop_duplicates(subset=["products_id"])
)
deduplicated_products

Unnamed: 0,products_id,products_and_services
0,1841bbb0-07cc-524a-a696-0f1aac447089,agricultural production
1,b5570a99-7759-5c83-955a-b36b37e84eed,water absorption plants
2,c89d7884-9568-55d1-a79f-9030cf7c7677,moisture absorption
3,f78816f2-b354-59d4-8175-59a2c6500f9a,accessories for watering systems
4,b06ae426-94d7-5186-90f8-99c70c0c09b3,agricultural products
...,...,...
53592,8cd89ead-558e-5b13-9876-c6585d4245a6,dental veneers
53593,97803d28-f232-5cf8-8873-9ed8a1fe9a2b,invisalign
53594,bfeb0cb1-b4ef-5b43-83a8-14b5aaf91a11,sheet veneer
53595,1084e16f-a815-5075-bdfb-d5d8acabe2e9,invisible braces


In [27]:
# Write the deduplicated company-to-product mapping (the index is written as the first column).
company_products_mapping.to_csv(
    path_or_buf="../../data/example_data/output/tilt_company_products_mapping_deduped.csv",
)

In [28]:
# Write the deduplicated products table (the index is written as the first column).
deduplicated_products.to_csv(
    path_or_buf="../../data/example_data/output/tilt_products_deduped.csv",
)