# 3. Deduplication

## 3.1 Import the required libraries

In [15]:
from unidecode import unidecode
import pandas as pd
import datetime
import dedupe
import re
import os

## 3.2 Load Data

### Mandatory input files

In [16]:
# Scraped EP product catalogue (only consumed by the currently disabled linkage step)
ep_catalogue = "../../data/example_data/input/scraped_data/scraped_EP_products_catalogue.csv"

# dedupe artefacts for the deduplication module: learned settings + labelled training pairs
dedup_settings_file = "../../dedupe_files/dedup_learned_settings"
dedup_training_file = "../../dedupe_files/dedup_training.json"

# dedupe artefacts for the record-linkage module: learned settings + labelled training pairs
rl_settings_file = "../../dedupe_files/record_linkage_learned_settings"
rl_training_file = "../../dedupe_files/record_linkage_training.json"

### Base Data

In [17]:
# Base data: translated input, deduplicated output, and linked output locations
base_data_file_location = "../../data/example_data/output/base_data/base_translated_products.csv"
base_data_deduped_file = "../../data/example_data/output/base_data/base_deduped.csv"
base_data_output_file = "../../data/example_data/output/base_data/base_linked_data.csv"

### New Italy Data

In [27]:
# Italy data: translated input, deduplicated output, and linked output locations
italy_data_file_location = "../../data/example_data/output/italy_data/italy_translated_products.csv"
italy_deduped_file = "../../data/example_data/output/italy_data/italy_deduped.csv"
italy_output_file = "../../data/example_data/output/italy_data/italy_linked_data.csv"

## 3.3 Deduplication

### Helper functions

In [19]:
def preProcess(column):
    """
    Normalise one text value before it is compared by dedupe.

    The value is transliterated with ``unidecode``, a fixed sequence of
    punctuation substitutions is applied, runs of spaces are collapsed,
    surrounding whitespace/quotes are stripped and the text is lower-cased.

    Args:
        column (str): The raw cell value.
    Returns:
        str or None: The cleaned text, or None when nothing is left after cleaning.
    """
    # (pattern, replacement) pairs; the order matters because the final
    # pattern collapses the spaces introduced by the earlier replacements.
    substitutions = (
        ('\n', ' '),
        ('-', ''),
        ('/', ' '),
        ("'", ''),
        (",", ''),
        (":", ' '),
        ('  +', ' '),
    )

    cleaned = unidecode(column)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    cleaned = cleaned.strip().strip('"').strip("'").lower().strip()

    # An empty result is reported as None rather than as an empty string
    return cleaned or None

In [20]:
def convert_pandas_to_dict(dataframe, which = "None", type = "dedup"):
    """
    Convert a DataFrame into a ``{record_id: {column: cleaned_value}}`` mapping.

    Every non-missing cell is stringified and cleaned with ``preProcess``.
    Missing cells (NaN / None / NaT) are kept as None instead of being
    stringified, so they do not turn into the literal text "nan" / "none".

    Args:
        dataframe (pd.DataFrame): The table to convert.
        which (str): Prefix for the record ids; only used when ``type`` is not "dedup".
        type (str): "dedup" keys each record by its index label; any other
            value keys it by ``which + str(index label)``.
    Returns:
        dict: One entry per row, mapping the record id to a dict of cleaned column values.
    """
    data_d = {}
    for i, row in dataframe.iterrows():
        clean_row = {}
        for k, v in zip(row.index, row.values):
            # str(NaN) is "nan": without this guard all missing cells would
            # look like the same real string to the comparison step.
            if pd.api.types.is_scalar(v) and pd.isna(v):
                clean_row[k] = None
            else:
                clean_row[k] = preProcess(str(v))
        if type != "dedup":
            data_d[which + str(i)] = clean_row
        else:
            data_d[i] = clean_row
    return data_d

In [21]:
def seconds_conversion(seconds):
    """
    Split a duration in seconds into whole hours, minutes and seconds.

    Args:
        seconds (int or float): The duration in seconds; fractions of a second are discarded.
    Returns:
        tuple: ``(hours, minutes, seconds)``. Hours are not capped at 23, so a
        duration of a day or longer is reported in full (e.g. 25 hours).
    """
    # Convert the time difference to a timedelta object
    time_delta = datetime.timedelta(seconds=seconds)

    # timedelta.seconds only holds the part below one day, so whole days
    # have to be added back to avoid silently dropping them.
    total_seconds = time_delta.days * 86400 + time_delta.seconds

    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return (hours, minutes, secs)

### Dedupe modules

#### Deduplication module

In [22]:
def deduplication(file, settings, training, write = False, out = "None"):
    """
    Deduplicate a product table using the dedupe library.

    Records are compared on the 'products_and_services' column. When a
    settings file exists it is loaded; otherwise a new model is trained via
    interactive console labelling and both the training pairs and the learned
    settings are written to disk. Within each cluster of duplicates the first
    record is kept and the remaining ones are dropped.

    Args:
        file (str or pd.DataFrame): The path to the csv file to be deduplicated, or a pandas dataframe.
        settings (str): The path to the learned settings file (read if present, written otherwise).
        training (str): The path to the training file (read if present; written after training).
        write (bool): Indicates whether to post-process the result and write it to ``out``.
        out (str): The path to the output file. If it contains the word
            'base', the result is additionally merged with the manual clustering file.
    Returns:
        df (pd.DataFrame): The deduplicated dataframe. With ``write=False`` the
        input columns are returned unchanged; with ``write=True`` the frame is
        returned exactly as it was written to ``out``.
    """
    # Read the csv file (or use the dataframe that was passed in)
    print('Importing data ...')
    if isinstance(file, str):
        df = pd.read_csv(file)
    else:
        df = file

    # stage 1: Deduplication using dedupe library
    print("----Start of stage 1----")
    print('Preparing dedupe data ...')
    # Record ids are the dataframe's index labels
    dedup_data = convert_pandas_to_dict(df, type="dedup")
    if os.path.exists(settings):
        print('Settings file found! Reading settings from "{}"'.format(settings))
        with open(settings, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf)
    # If no settings file exists, create and train a new deduper object
    else:
        # Define the fields that will be used for the deduplication
        fields = [
            {'field': 'products_and_services', 'type': 'String'}] # consider Text type instead of String

        # Create a new deduper object and pass the fields to it
        deduper = dedupe.Dedupe(fields)
        print("Preparing training...")
        if os.path.exists(training):
            print('Reading labeled examples from ', training)
            with open(training) as tf:
                deduper.prepare_training(dedup_data,
                                         training_file=tf)
        else:
            # Prepare the deduper object for training using the data only
            deduper.prepare_training(dedup_data)
        # Start the active labeling
        print('Starting active labeling...')
        dedupe.console_label(deduper)
        # Train the deduper object using the active labeling as additional input
        print("Training...")
        deduper.train()
        print("Training finished!")
        # write the labelled training examples to disk
        with open(training, 'w') as tf:
            deduper.write_training(tf)
        # write the settings file to disk
        with open(settings, 'wb') as sf:
            deduper.write_settings(sf)

    print('Clustering..')
    clustered_dupes = deduper.partition(dedup_data, 0.5)
    print('Clustering finished!. {} duplicates found'.format(len(df)-len(clustered_dupes)))

    print('Dropping duplicates...')
    # Keep the first record of every cluster, drop the rest
    rows_to_drop = [record_id
                    for records, _scores in clustered_dupes
                    for record_id in records[1:]]
    # The record ids are index labels (see convert_pandas_to_dict), so drop by
    # label; positional lookup is only correct for a default 0..n-1 index.
    df = df.drop(index=rows_to_drop)

    print ("Duplicates dropped!")
    print("----Finished stage 1----")
    # NOTE: no column selection here. The manual/automatic columns only exist
    # after the rename and merge below, and callers using write=False still
    # need the original 'products_and_services' column.

    if write:
        # rename products_and_services column to automatic_processed_products_and_services
        # and drop the leftover csv index column when present
        df = df.rename(columns={'products_and_services': 'automatic_processed_products_and_services'}).drop(columns=['Unnamed: 0'], errors='ignore')
        # if the output path contains the word base then we are also merging it with the manual clustered data
        if 'base' in out:
            # read in manual clustered data
            manual_clustered_df = pd.read_csv('../../data/example_data/input/manual_clustering.csv')
            # merge the two dataframes on products_id
            df = pd.merge(manual_clustered_df, df, on='products_id')
            # drop the clustering id columns and rename the manual cluster label
            df = df.drop(['clustered_id', 'clustered_delimited_id', 'delimited_id', 'delimited_products_id'], axis=1).rename(columns = {"clustered": "manual_processed_products_and_services"})
            # reorder columns
            df = df[['products_id','raw_products_and_services', 'manual_processed_products_and_services', 'automatic_processed_products_and_services']]
        print('Writing deduplicated output to file...')
        df.to_csv(out, index=False)

    return df

#### Record Linkage module

In [23]:
# def record_linkage(left_df, right_df, settings, training, write = False, out = "None"):
#     """
#     This function performs record linkage on the two dataframes using the dedupe library.

#     Args:
#         left_df (pd.DataFrame): The left dataframe.
#         right_df (pd.DataFrame): The right dataframe.
#         settings (str): The path to the settings file.
#         training (str): The path to the training file.
#         write (bool): Indicates whether to write the deduplicated output to file.
#         out (str): The path to the output file.
#     Returns:
#         merged_df (pd.DataFrame): The merged dataframe.
#     """
#     print('Importing data ...')
#     if isinstance(left_df, str):
#         root_l_df = pd.read_csv(left_df)
#         root_r_df = pd.read_csv(right_df)
#     else:
#         root_l_df = left_df.copy()
#         root_r_df = right_df.copy()

#     # Stage 1: Direct products_and_services linkage using merging
#     print("----Start of stage 1----")
#     print('Directly merging data...')
#     # Merge the two dataframes based on the 'products_and_services' column
#     merged_df = root_l_df.merge(root_r_df, on='products_and_services', how='left', suffixes=['_x', '_y']).drop(columns="ID")
#     merged_df = merged_df.merge(root_r_df, left_on='products_id_y', right_on='products_id', how="left").drop(columns=["ID","products_id"])
#     # Create a new dataframe that contains rows from company_based_p_and_s that could not be directly matched
#     non_matched_products = merged_df[merged_df.isna().any(axis=1)].drop(columns=["products_id_y", "products_and_services_y"]).rename(columns={"products_and_services_x": "products_and_services"})
#     # Get the percentage of products_and_services that could be directly matched
#     percentage_matched = len(merged_df.dropna())/len(root_l_df)*100
#     print('Percentage of products_and_services that could be directly matched: {0:.2f}%'.format(percentage_matched))
#     print("----Finished stage 1----\n")

#     # Stage 2: Remaining products_and_services linkage using dedupe
#     print("----Start of stage 2----")
#     print('Preparing record linkage data...')
#     # Convert the dataframes to dictionaries
#     linkage_data_1 = convert_pandas_to_dict(non_matched_products, "left", "linkage")
#     linkage_data_2 = convert_pandas_to_dict(root_r_df, "right", "linkage")
#     print('Attempting products_and_services linkage on the remainder using dedupe...')
#     # Check if a settings file already exists and use if can be found
#     if os.path.exists(settings):
#         print('Settings file found! Reading settings from "{}"'.format(settings))
#         with open(settings, 'rb') as sf:
#             linker = dedupe.StaticRecordLink(sf)
#     # If no settings file exists, create train a new linker object
#     else:
#         # Define the fields that will be used for the record linkage
#         fields = [
#                 {'field': 'products_and_services', 'type': 'String'}] # consider Text type instead of String
        
#         # Create a new linker object and pass the fields to it
#         linker = dedupe.RecordLink(fields)
#         print("Preparing training...")
#         if os.path.exists(training):
#             print('Reading labeled examples from ', training)
#             with open(training) as tf:
#                 linker.prepare_training(linkage_data_1,
#                                         linkage_data_2,
#                                         training_file=tf,
#                                         sample_size=10000)
#         else:
#             # Prepare the linker object for training using the two datasets
#             linker.prepare_training(linkage_data_1, linkage_data_2, sample_size=10000)
#         # Start the active labeling
#         print('Starting active labeling...')
#         dedupe.console_label(linker)
#         # Train the linker object using the active labeling as additional input
#         print("Training...")
#         linker.train()
#         print("Training finished!")
#         # write the labelled training examples to disk
#         with open(training, 'w') as tf:
#             linker.write_training(tf)
#         # write the settings file to disk
#         with open(settings, 'wb') as sf:
#             linker.write_settings(sf)
#     # Perform the record linkage
#     print('Performing linking...')
#     linked_records = linker.join(linkage_data_1, linkage_data_2, 0.0)
#     print('Succesfully linked {} records'.format(len(linked_records)))
#     for _, (cluster, score) in enumerate(linked_records):
#         non_matched_products.loc[int(re.search(r"\d+", cluster[0]).group()), 'products_and_services_y'] = root_r_df.loc[int(re.search(r"\d+", cluster[1]).group()), 'products_and_services']
#         non_matched_products.loc[int(re.search(r"\d+", cluster[0]).group()), 'products_id_y'] = root_r_df.loc[int(re.search(r"\d+", cluster[1]).group()), 'products_id']
    
#     merged_df = merged_df.fillna(non_matched_products)
#     merged_df = merged_df.rename(columns = {"products_id_x": "products_id", 
#                                                                          "products_and_services_x": "automatic_processed_products_and_services",
#                                                                          "products_id_y": "linked_EP_products_id",
#                                                                          "products_and_services_y": "linked_EP_products_and_services"})
#     print("Coverage increased to {0:.2f}%".format(len(merged_df.dropna())/len(root_l_df)*100))
#     print("----Finished stage 2----\n")
#     if write:
#         print('Writing results to "{}"'.format(out))
#         merged_df.to_csv(out, index=False)

#     return merged_df

In [24]:
# def dedup_and_link(df, ep_df_path, out, dedup_settings_file, dedup_training_file, linking_settings_file, linking_training_file):
#     # Start timer
#     print("\n/=========== Dedup x Record Linkage started ===========/")
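#     # NOTE: `time` is not imported in section 3.1; add `import time` there before re-enabling this cell.
#     start_time = time.time()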
#     # Phase 1: applying deduplication module to the data
#     print("/=========== Start of phase 1: Deduplication ===========/")
#     deduped_data = deduplication(df, dedup_settings_file, dedup_training_file)

#     # Phase 2: applying record linkage module to the data
#     print("\n\n/=========== Start of phase 2: Record Linkage ===========/")
#     linked_data = record_linkage(deduped_data, pd.read_csv(ep_df_path), linking_settings_file, linking_training_file)
#     end_time = time.time()

#     print("\n/=========== Dedup x Record Linkage finished. Duration: {} hours, {} minutes, {} seconds ===========/".format(*seconds_conversion(end_time - start_time)))

#     print('Writing results to "{}"'.format(out))
#     linked_data.to_csv(out, index=False)


## 3.3.1 Duplicate removal and Europages linkage

### Base Data

In [25]:
#dedup_and_link(base_data_file_location, ep_catalogue, base_data_output_file, dedup_settings_file, dedup_training_file, rl_settings_file, rl_training_file)

In [26]:
# Deduplicate the translated base products and write the result to disk;
# the returned dataframe is displayed as the cell output.
deduplication(
    base_data_file_location,
    dedup_settings_file,
    dedup_training_file,
    write=True,
    out=base_data_deduped_file,
)

Importing data ...
----Start of stage 1----
Preparing dedupe data ...


Preparing training...


KeyboardInterrupt: 

### New Italy Data

In [None]:
#dedup_and_link(italy_data_file_location, ep_catalogue, italy_output_file, dedup_settings_file, dedup_training_file, rl_settings_file, rl_training_file)

In [None]:
# Deduplicate the translated Italy products and write the result to disk;
# the returned dataframe is displayed as the cell output.
deduplication(
    italy_data_file_location,
    dedup_settings_file,
    dedup_training_file,
    write=True,
    out=italy_deduped_file,
)

Importing data ...
----Start of stage 1----
Preparing dedupe data ...
Settings file found! Reading settings from "../../dedupe_files/dedup_learned_settings"
Clustering..
Clustering finished!. 4 duplicates found
Dropping duplicates...
Duplicates dropped!
----Finished stage 1----
Writing deduplicated output to file...


Unnamed: 0,raw_products_and_services,products_id,automatic_processed_products_and_services
0,persian blue salt,50abde66-58b7-4fb3-a007-1077fa41a010,Persian blue salt.
1,organic saffron bio,0ea739bc-c3fb-4476-b8d6-96ba5085aa00,organic saffron bio.
2,property sale and purchase,126e3677-bb68-481d-96e0-ae0fd8a68db3,Property sale and purchase
3,tomato processing machines,46b2f19e-f238-4b99-be02-d0fbe46d1e4c,Tomato processing machines.
4,christmas gift packages,6ced5115-29ba-4477-87e9-93699dce44c9,Christmas gift packages
...,...,...,...
3430,stampa offset e digitale,f6ebf929-0e99-4ff0-8f83-e46a30520a47,offset and digital printing
3431,pile turner,6762bafd-8627-4894-a113-686adbc2602a,pile turner
3432,synthetic fibre yarns,856223ec-54f2-44ec-ab37-3c438164fa6b,synthetic fibre yarns
3433,polyester microfibre,2d10565b-21f6-4687-b49c-cf06e0fedabd,polyester microfibre
