# 3. Deduplication

## 3.1 Import the required libraries

In [177]:
from unidecode import unidecode
import pandas as pd
import numpy as np
import datetime
import dedupe
import csv
import re
import os
import time

## 3.2 Load Data

### Mandatory input files

In [176]:
# EuroPages product catalogue (CSV) used as the right-hand side of the record linkage.
ep_catalogue = "../../data/example_data/input/scraped_EP_products_catalogue.csv"

# Learned settings and labelled training examples for the deduplication model.
dedup_settings_file = "../../dedupe_files/dedup_learned_settings"
dedup_training_file = "../../dedupe_files/dedup_training.json"

# Learned settings and labelled training examples for the record-linkage model.
rl_settings_file = "../../dedupe_files/record_linkage_learned_settings"
rl_training_file = "../../dedupe_files/record_linkage_training.json"

### Base Data

In [174]:
# Base data: processed products/services CSV (input) and the CSV its linked result goes to.
base_data_file_location = "../../data/example_data/output/base_data/products_and_services_processed.csv"
base_data_output_file = "../../data/example_data/output/base_data/linked_data.csv"

### New Italy Data

In [175]:
# Italy data: translated products CSV (input), its deduplicated copy, and the linked result.
italy_data_file_location = "../../data/example_data/output/italy_data/italy_translated_products.csv"
italy_deduped_file = "../../data/example_data/output/italy_data/italy_deduped.csv"
italy_output_file = "../../data/example_data/output/italy_data/linked_data.csv"

## 3.3 Deduplication

### Helper functions

In [173]:
def preProcess(column):
    """Normalise one raw text value before it is compared.

    The value is transliterated with ``unidecode``, a fixed sequence of
    regex substitutions is applied (newline, '/', ':' become a space; '-',
    "'" and ',' are removed; runs of spaces are collapsed), and the result is
    stripped of surrounding whitespace/quotes and lower-cased.

    Returns the cleaned string, or ``None`` when nothing is left.
    """
    # (pattern, replacement) pairs; they are applied strictly in this order.
    substitutions = (
        ('\n', ' '),
        ('-', ''),
        ('/', ' '),
        ("'", ''),
        (",", ''),
        (":", ' '),
        ('  +', ' '),
    )

    text = unidecode(column)
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    text = text.strip().strip('"').strip("'").lower().strip()
    return text if text else None

In [192]:
def convert_pandas_to_dict(dataframe, which = "None", type = "dedup"):
    """Turn a DataFrame into a ``{record_id: {column: cleaned_value}}`` mapping.

    Every cell is stringified and passed through ``preProcess``. Missing cells
    (``None`` / NaN) are mapped straight to ``None`` instead: stringifying them
    would yield the literal text "nan" / "none", which looks like a real value
    and makes all missing cells compare as identical.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to convert; each row's index label becomes (part of) its record id.
    which : str
        Prefix put in front of the index label when ``type`` is not "dedup"
        (e.g. "left" gives ids such as "left12").
    type : str
        "dedup" keeps the bare index label as the key; any other value makes
        the key ``which + str(index_label)``. The name shadows the builtin but
        is kept for interface compatibility.

    Returns
    -------
    dict
        One entry per row of ``dataframe``.
    """
    def clean(value):
        # is_scalar guards pd.isna against list-like cells, for which it
        # would return an array rather than a single truth value.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return preProcess(str(value))

    prefixed = type != "dedup"
    data_d = {}
    for label, row in dataframe.iterrows():
        clean_row = {column: clean(value) for column, value in zip(row.index, row.values)}
        key = which + str(label) if prefixed else label
        data_d[key] = clean_row
    return data_d

In [171]:
def seconds_conversion(seconds):
    """Split a duration in seconds into whole ``(hours, minutes, seconds)``.

    Hours are not capped at 24, so a 25-hour duration yields ``(25, 0, 0)``.
    Fractions of a second are truncated. The input is expected to be
    non-negative.

    Parameters
    ----------
    seconds : int or float
        Duration in seconds.

    Returns
    -------
    tuple of int
        ``(hours, minutes, seconds)``.
    """
    time_delta = datetime.timedelta(seconds=seconds)

    # timedelta.seconds only holds the part below one day (0..86399) and would
    # silently drop whole days; total_seconds() covers the full duration.
    total = int(time_delta.total_seconds())

    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    return (hours, minutes, secs)

In [211]:
# FIXME(review): this cell originally read `x = ` with no right-hand side, which is a
# SyntaxError and stops Restart-&-Run-All. The intended value cannot be recovered from
# the notebook and no visible cell reads a global `x`, so a neutral placeholder is used.
# Delete this cell if it was only scratch.
x = None

In [219]:
# Scratch check of the duration banner, using a fixed 101-second duration.
print(
    "\n\n/=========== Dedup x Record Linkage finished. Duration: {} hours, {} minutes, {} seconds ===========/".format(
        *seconds_conversion(101)
    )
)





### Dedupe modules

#### Deduplication module

In [205]:
def deduplication(file, settings, training, write = False, out = "None"):
    """Remove near-duplicate rows from a CSV file with the ``dedupe`` library.

    A trained model is loaded from ``settings`` when that file exists.
    Otherwise a new model is trained interactively (console labelling),
    seeded with the labelled pairs in ``training`` if that file exists, and
    both the labelled pairs and the learned settings are written to disk.

    Parameters
    ----------
    file : str
        Path of the CSV file to deduplicate. A newly trained model compares
        the ``products_and_services`` column.
    settings : str
        Path of the learned settings file (read if present, otherwise written).
    training : str
        Path of the labelled-examples JSON file (read if present; written
        after training).
    write : bool
        When true, the deduplicated frame is also saved to ``out``.
    out : str
        Destination CSV path; required when ``write`` is true.

    Returns
    -------
    pandas.DataFrame
        The input rows with every cluster reduced to its first record.

    Raises
    ------
    ValueError
        If ``write`` is true but ``out`` was left at its placeholder default.
    """
    # Fail before any expensive work instead of silently writing a file named "None".
    if write and out in (None, "None"):
        raise ValueError('write=True requires an output path in "out"')

    # Read the csv file
    print('Importing data ...')
    df = pd.read_csv(file)

    # stage 1: Deduplication using dedupe library
    print("----Start of stage 1----")
    print('Preparing dedupe data ...')
    # Passed by keyword: positionally, "dedup" would land in `which` (the key
    # prefix) rather than `type`.
    dedup_data = convert_pandas_to_dict(df, type="dedup")

    if os.path.exists(settings):
        print('Settings file found! Reading settings from "{}"'.format(settings))
        with open(settings, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf)
    # If no settings file exists, train a new deduper object
    else:
        # Define the fields that will be used for the deduplication
        fields = [
                {'field': 'products_and_services', 'type': 'String'}] # consider Text type instead of String

        # Create a new deduper object and pass the fields to it
        deduper = dedupe.Dedupe(fields)
        print("Preparing training...")
        if os.path.exists(training):
            print('Reading labeled examples from ', training)
            with open(training) as tf:
                deduper.prepare_training(dedup_data,
                                         training_file=tf)
        else:
            # Prepare the deduper object for training from the data alone
            deduper.prepare_training(dedup_data)
        # Start the active labeling
        print('Starting active labeling...')
        dedupe.console_label(deduper)
        # Train the deduper object using the active labeling as additional input
        print("Training...")
        deduper.train()
        print("Training finished!")
        # write the labelled training examples to disk
        with open(training, 'w') as tf:
            deduper.write_training(tf)
        # write the settings file to disk
        with open(settings, 'wb') as sf:
            deduper.write_settings(sf)

    print('Clustering..')
    clustered_dupes = deduper.partition(dedup_data, 0.5)
    print('Clustering finished!. {} duplicates found'.format(len(df)-len(clustered_dupes)))

    print('Dropping duplicates...')
    # Keep the first record of every cluster; everything after it is a duplicate.
    rows_to_drop = [record_id
                    for records, _scores in clustered_dupes
                    for record_id in records[1:]]
    # Record ids are the frame's index labels (see convert_pandas_to_dict),
    # so drop by label rather than by position.
    df = df.drop(index=rows_to_drop)

    print ("Duplicates dropped!")
    print("----Finished stage 1----")

    if write:
        print('Writing deduplicated output to file...')
        df.to_csv(out, index=False)

    return df

#### Record Linkage module

In [202]:
def record_linkage(left_df, right_df, settings, training, write = False, out = "None"):
    """Link the rows of ``left_df`` to rows of ``right_df`` on ``products_and_services``.

    Stage 1 links rows whose ``products_and_services`` text is identical via a
    pandas merge. Stage 2 runs the ``dedupe`` record linker on the rows that
    stage 1 left with missing values and fills the gaps with its matches.

    A trained linker is loaded from ``settings`` when that file exists.
    Otherwise a new one is trained interactively (console labelling), seeded
    with ``training`` if that file exists, and both files are written to disk.

    Parameters
    ----------
    left_df, right_df : pandas.DataFrame or str or os.PathLike
        The two tables, each given either as a DataFrame (copied, never
        mutated) or as the path of a CSV file. As written, the merges expect
        ``products_and_services`` and ``products_id`` in both tables and an
        ``ID`` column in the right table only -- confirm against the data.
    settings : str
        Path of the learned settings file (read if present, otherwise written).
    training : str
        Path of the labelled-examples JSON file (read if present; written
        after training).
    write : bool
        When true, the linked frame is also saved to ``out``.
    out : str
        Destination CSV path; required when ``write`` is true.

    Returns
    -------
    pandas.DataFrame
        The merged frame, with the matched right-hand columns renamed to
        "EuroPages products_and_services" and "EuroPages products_id".

    Raises
    ------
    ValueError
        If ``write`` is true but ``out`` was left at its placeholder default.
    """
    # Fail before any expensive work (and before the settings/training files
    # can be written) instead of silently saving to a file named "None".
    if write and out in (None, "None"):
        raise ValueError('write=True requires an output path in "out"')

    # Each input may be a CSV path or a DataFrame. The type is checked before
    # .copy() is called, because a path string has no .copy() method.
    path_types = (str, os.PathLike)
    if isinstance(left_df, path_types) or isinstance(right_df, path_types):
        print('Importing data ...')
    root_l_df = pd.read_csv(left_df) if isinstance(left_df, path_types) else left_df.copy()
    root_r_df = pd.read_csv(right_df) if isinstance(right_df, path_types) else right_df.copy()

    # Stage 1: Direct products_and_services linkage using merging
    print("----Start of stage 1----")
    print('Directly merging data...')
    # Merge the two dataframes based on the 'products_and_services' column
    merged_df = root_l_df.merge(root_r_df, on='products_and_services', how='left', suffixes=['_x', '_y']).drop(columns="ID")
    merged_df = merged_df.merge(root_r_df, left_on='products_id_y', right_on='products_id', how="left").drop(columns=["ID","products_id"])
    # Rows that could not be directly matched.
    # NOTE(review): a row counts as unmatched when ANY of its columns is NaN,
    # not only the right-hand ones -- confirm that is intended.
    non_matched_products = merged_df[merged_df.isna().any(axis=1)].drop(columns=["products_id_y", "products_and_services_y"]).rename(columns={"products_and_services_x": "products_and_services"})
    # Get the percentage of products_and_services that could be directly matched
    percentage_matched = len(merged_df.dropna())/len(root_l_df)*100
    print('Percentage of products_and_services that could be directly matched: {0:.2f}%'.format(percentage_matched))
    print("----Finished stage 1----\n")

    # Stage 2: Remaining products_and_services linkage using dedupe
    print("----Start of stage 2----")
    print('Preparing record linkage data...')
    # Convert the dataframes to dictionaries keyed "left<label>" / "right<label>"
    linkage_data_1 = convert_pandas_to_dict(non_matched_products, "left", "linkage")
    linkage_data_2 = convert_pandas_to_dict(root_r_df, "right", "linkage")
    print('Attempting products_and_services linkage on the remainder using dedupe...')
    # Check if a settings file already exists and use it if it can be found
    if os.path.exists(settings):
        print('Settings file found! Reading settings from "{}"'.format(settings))
        with open(settings, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    # If no settings file exists, train a new linker object
    else:
        # Define the fields that will be used for the record linkage
        fields = [
                {'field': 'products_and_services', 'type': 'String'}] # consider Text type instead of String

        # Create a new linker object and pass the fields to it
        linker = dedupe.RecordLink(fields)
        print("Preparing training...")
        if os.path.exists(training):
            print('Reading labeled examples from ', training)
            with open(training) as tf:
                linker.prepare_training(linkage_data_1,
                                        linkage_data_2,
                                        training_file=tf,
                                        sample_size=10000)
        else:
            # Prepare the linker object for training using the two datasets
            linker.prepare_training(linkage_data_1, linkage_data_2, sample_size=10000)
        # Start the active labeling
        print('Starting active labeling...')
        dedupe.console_label(linker)
        # Train the linker object using the active labeling as additional input
        print("Training...")
        linker.train()
        print("Training finished!")
        # write the labelled training examples to disk
        with open(training, 'w') as tf:
            linker.write_training(tf)
        # write the settings file to disk
        with open(settings, 'wb') as sf:
            linker.write_settings(sf)
    # Perform the record linkage
    print('Performing linking...')
    linked_records = linker.join(linkage_data_1, linkage_data_2, 0.0)
    print('Succesfully linked {} records'.format(len(linked_records)))
    for cluster, _score in linked_records:
        # Record ids look like "left12" / "right7"; the digits are the index label.
        left_label = int(re.search(r"\d+", cluster[0]).group())
        right_label = int(re.search(r"\d+", cluster[1]).group())
        non_matched_products.loc[left_label, 'products_and_services_y'] = root_r_df.loc[right_label, 'products_and_services']
        non_matched_products.loc[left_label, 'products_id_y'] = root_r_df.loc[right_label, 'products_id']

    # Fill the gaps left by stage 1 with the dedupe matches (aligned on index and column name)
    merged_df = merged_df.fillna(non_matched_products).rename(columns = {"products_and_services_y": "EuroPages products_and_services", "products_id_y": "EuroPages products_id"})
    print("Coverage increased to {0:.2f}%".format(len(merged_df.dropna())/len(root_l_df)*100))
    print("----Finished stage 2----\n")
    if write:
        print('Writing results to "{}"'.format(out))
        merged_df.to_csv(out, index=False)

    return merged_df

In [220]:
def dedup_and_link(df, ep_df_path, out, dedup_settings_file, dedup_training_file, linking_settings_file, linking_training_file):
    """Deduplicate a dataset, link it to the catalogue and save the result.

    Parameters
    ----------
    df : str
        Forwarded unchanged as the ``file`` argument of ``deduplication``,
        which reads it with ``pd.read_csv`` -- so despite the name this is a
        CSV path, not a DataFrame.
    ep_df_path : str
        CSV path of the catalogue used as the right-hand side of the linkage.
    out : str
        CSV path the linked result is written to.
    dedup_settings_file, dedup_training_file : str
        Settings / training files handed to ``deduplication``.
    linking_settings_file, linking_training_file : str
        Settings / training files handed to ``record_linkage``.

    Returns nothing; progress and the elapsed time are printed.
    """
    print("\n/=========== Dedup x Record Linkage started ===========/")
    started_at = time.time()

    # Phase 1: deduplication
    print("/=========== Start of phase 1: Deduplication ===========/")
    deduped_data = deduplication(df, dedup_settings_file, dedup_training_file)

    # Phase 2: record linkage against the catalogue
    print("\n\n/=========== Start of phase 2: Record Linkage ===========/")
    catalogue = pd.read_csv(ep_df_path)
    linked_data = record_linkage(deduped_data, catalogue, linking_settings_file, linking_training_file)

    # The timer stops before the result is written to disk.
    elapsed = time.time() - started_at
    hours, minutes, secs = seconds_conversion(elapsed)
    print("\n\n/=========== Dedup x Record Linkage finished. Duration: {} hours, {} minutes, {} seconds ===========/".format(hours, minutes, secs))

    print('Writing results to "{}"'.format(out))
    linked_data.to_csv(out, index=False)


## 3.3.1 Duplicate removal

### Base Data

### New Italy Data

In [221]:
# Deduplicate the Italy data, link it to the catalogue and write the linked result.
dedup_and_link(
    df=italy_data_file_location,
    ep_df_path=ep_catalogue,
    out=italy_output_file,
    dedup_settings_file=dedup_settings_file,
    dedup_training_file=dedup_training_file,
    linking_settings_file=rl_settings_file,
    linking_training_file=rl_training_file,
)


Importing data ...
----Start of stage 1----
Preparing dedupe data ...
Settings file found! Reading settings from "../../dedupe_files/dedup_learned_settings"
Clustering..
Clustering finished!. 4176 duplicates found
Dropping duplicates...
Duplicates dropped!
----Finished stage 1----


----Start of stage 1----
Directly merging data...
Percentage of products_and_services that could be directly matched: 0.25%
----Finished stage 1----

----Start of stage 2----
Preparing record linkage data...
Attempting products_and_services linkage on the remainder using dedupe...
Settings file found! Reading settings from "../../dedupe_files/record_linkage_learned_settings"
Performing linking...
Succesfully linked 8018 records
Coverage increased to 77.34%
----Finished stage 2----



Writing results to "../../data/example_data/output/italy_data/linked_data.csv"


## 3.3.2 Linking to EuroPages activity catalogue

### Base data

In [None]:
# record_linkage takes (left_df, right_df, settings, training, write, out).
# Passing the output path third would use it as the settings file and push the
# settings/training paths into `training`/`write`, so the arguments are named.
record_linkage(
    base_data_file_location,
    ep_catalogue,
    settings=rl_settings_file,
    training=rl_training_file,
    write=True,
    out=base_data_output_file,
)

### New Italy Data

In [191]:
# record_linkage takes (left_df, right_df, settings, training, write, out).
# Passing the output path third would use it as the settings file and push the
# settings/training paths into `training`/`write`, so the arguments are named.
# NOTE(review): no cell above writes italy_deduped_file (deduplication is only
# run with write=False), which is why the recorded run ends in FileNotFoundError.
# Produce it first, e.g.
#   deduplication(italy_data_file_location, dedup_settings_file,
#                 dedup_training_file, write=True, out=italy_deduped_file)
record_linkage(
    italy_deduped_file,
    ep_catalogue,
    settings=rl_settings_file,
    training=rl_training_file,
    write=True,
    out=italy_output_file,
)

Importing data ...


FileNotFoundError: [Errno 2] No such file or directory: '../../data/example_data/output/italy_data/italy_deduped.csv'