# Deduplication
Verified dataset
Creates embeddings using SentenceTransformer and the 'all-MiniLM-L6-v2' model.

Creates a FAISS index using those embeddings.

For each embedding searches for close neighbour embeddings.

Near-duplicates are removed across the whole combined dataset (the search below is not split into 'bias_term' subsets).

The ids of the removed duplicates are written to CSV.

In [1]:
# Install the third-party packages this notebook imports (notebook shell magics).
!pip install -q datasets
!pip install -U sentence-transformers
!pip install faiss-gpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 

In [2]:
from datasets import load_dataset

# Hugging Face organisation and dataset identifiers
hf_site_id = '2024-mcm-everitt-ryan'
dataset_id = f'{hf_site_id}/job-bias-synthetic-human-verified'

# Load the dataset; the 'train', 'val' and 'test' splits are read from it below
dataset = load_dataset(dataset_id)


Downloading readme:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/988k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1256 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/314 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1046 [00:00<?, ? examples/s]

In [3]:
import os
try:
    import polars as pl
except Exception:
    !pip install polars
    import polars as pl

import pandas as pd

df_train = dataset['train'].to_pandas()
df_val = dataset['val'].to_pandas()
df_test = dataset['test'].to_pandas()

# Keep lists of doc ids so that the combined and deduped dataset can be later split.
ids_train = df_train['id'].tolist()
ids_val = df_val['id'].tolist()
ids_test = df_test['id'].tolist()

combined_df = pd.concat([df_val, df_test, df_train])
df = pl.from_pandas(combined_df)


In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np
# Suppress warnings since they can get verbose.
import warnings
import faiss
import time

In [5]:
# Keep only the rows that actually have text to embed (neither null nor empty)
has_text = pl.col('text').is_not_null() & (pl.col('text') != '')
data = df.filter(has_text)

In [6]:
# Silence every warning from here on.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Create a SentenceTransformer using model 'all-MiniLM-L6-v2'
# For a comparison of models see https://www.sbert.net/docs/pretrained_models.html
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to encode text to embeddings
def encode_text(texts):
    '''
    Encodes the given texts with the module-level SentenceTransformer 'model'.
    The progress bar is switched off to keep the notebook output short.
    Returns the embeddings produced by model.encode for the given texts.
    Arguments:
    texts   - a list of text to be encoded
    '''
    encodings = model.encode(texts, show_progress_bar=False)
    return encodings

def get_embeddings(df):
    '''
    Encodes the 'text' column of the given dataframe chunk by chunk.
    Returns a polars dataframe with the same rows, in the same order, plus an 'embeddings' column.
    Arguments:
    df  - a dataframe containing a 'text' column which contains the text to be encoded.
    '''

    # Rows are embedded in fixed-size slices and stitched back together at the end
    # (the slice size doesn't seem to make much difference)
    chunk_size = 5000
    total_rows = len(df)
    embedded_parts = []
    for offset in range(0, total_rows, chunk_size):
        part = df[offset:offset + chunk_size]
        vectors = encode_text(part['text'].to_list())
        # One embedding per row of the slice, attached as a new column
        embedded_parts.append(part.with_columns(pl.Series("embeddings", list(vectors))))
    return pl.concat(embedded_parts)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
nNN=10    #the number of near neighbours to return in each search
def search(index, df, num_sample_results=0, threshold=0.3):
    '''
    Searches the given dataframe 'embeddings' column using the given FAISS index.
    Returns a set of row positions to be removed and a list of sample results (for test purposes).

    Rows are visited in order. A row that has not itself been marked for removal is kept, and
    every other row returned within the threshold distance of it is marked for removal.

    Arguments:
    index               - the previously constructed FAISS index
    df                  - dataframe with 'embeddings', 'text' and 'id' columns
    num_sample_results  - the maximum number of sample results to return (default=0)
    threshold           - the distance below which search results will be regarded as close neighbours (default=0.3).
                          NOTE(review): FAISS L2 indexes report squared L2 distances - confirm the threshold is on that scale.
    '''

    #The index is queried a batch at a time, but each returned distance still has to be
    #compared against the threshold one-by-one

    n = df.shape[0]
    batch_size = 1000  # Define the size of each batch
    to_remove = set()  # This will store positions of duplicates
    sample_results = []
    warned_k_too_small = False  # The "increase k" warning is printed at most once per call

    # Process in batches
    for start_idx in range(0, n, batch_size):
        end_idx = min(start_idx + batch_size, n)

        print(f'\rProcessing batch starting at index: {start_idx} End index: {end_idx} Size of remove list: {len(to_remove)}', end='')

        # Execute the embedding search
        batch_embeddings = np.vstack(df['embeddings'][start_idx:end_idx])
        distances, indices = index.search(batch_embeddings, k=nNN)

        # Iterate over each result in the batch
        for i in range(distances.shape[0]):
            current_idx = start_idx + i
            #ignore the results if the search index itself has already been marked for removal
            if current_idx in to_remove:
                continue  # Skip if already marked for removal

            # Count the hits under the threshold for THIS query (the query row itself included).
            # The counter must live outside the per-result loop, otherwise it never exceeds 1.
            near_neighbours = 0

            # Check for near-duplicates within the threshold distance
            for dist, idx in zip(distances[i], indices[i]):
                idx = int(idx)
                # A negative label means "no result" in FAISS; never treat it as a row position
                if idx < 0 or not (dist < threshold):
                    continue
                near_neighbours += 1
                if idx == current_idx:
                    continue  # The query row itself is not a duplicate

                to_remove.add(idx)  # Mark for removal

                # Test purposes - save a sample of results for later examination
                if len(sample_results) < num_sample_results:
                    current_text = df['text'][current_idx]
                    current_id = df['id'][current_idx]
                    near_duplicate_text = df['text'][idx]
                    near_duplicate_id = df['id'][idx]
                    sample_results.append({'Original_Index': current_idx,
                        'Original_ID' : current_id,
                        'Original_Text': current_text,
                        'Near_Duplicate_Index': idx,
                        'Near_Duplicate_ID' : near_duplicate_id,
                        'Near_Duplicate_Text': near_duplicate_text,
                        'Distance': dist})

            if near_neighbours == len(distances[i]) and not warned_k_too_small:
                # All k returned are under the threshold, so more duplicates may lie beyond
                # the k-th result - should use a bigger value for k
                print("Warning! Consider increasing number of neighbours returned by FAISS index search.")
                warned_k_too_small = True

    return to_remove, sample_results


In [8]:
# Embedding dimension; get_duplicates() fills this in on its first call and reuses it afterwards.
dim = None

In [9]:
def get_duplicates(df, num_sample_results=0):
    '''
    Gets the positions of duplicates of the given df with a column 'text' that contains the text to be tested
    First creates the embeddings for the df. Then creates a FAISS index, and executes the search
    Returns a set of row positions to be removed and a list of sample results (for test purposes).
    An empty df yields an empty set and an empty list.

    Arguments:
    df  - a dataframe with text in a 'text' column
    num_sample_results  - the number of sample results to return (default=0)
    '''

    # Nothing to embed or index - avoids failing on an empty input
    if len(df) == 0:
        return set(), []

    df_embeddings = get_embeddings(df)

    # The embedding dimension is determined once and cached in the module-level 'dim'
    global dim
    if dim is None:
        dim = len(df_embeddings.get_column("embeddings")[0])
        print (f"Determined dimension:{dim}")

    # Get the embeddings as a 2D float32 array (FAISS works on float32 matrices).
    # Series.to_list() is used rather than the deprecated Series.apply.
    embeddings = np.asarray(np.vstack(df_embeddings['embeddings'].to_list()), dtype='float32')

    # Create a FAISS index - see https://www.pinecone.io/learn/series/faiss/faiss-tutorial/
    # How this is done depends on the size of the search. See above tutorial
    if len(df_embeddings) <= 10000:
        #No need for voronoi optimisation
        index = faiss.IndexFlatL2(dim)
    else:
        #For performance reasons need to first subdivide into voronoi cells
        # NOTE(review): an IVF index only probes 'nprobe' cells per query (FAISS default is 1),
        # so near-duplicates lying in a neighbouring cell can be missed - consider raising index.nprobe.
        quantizer = faiss.IndexFlatL2(dim)
        nlist = 100 #the number of voronoi cells
        if len(df_embeddings) > 100000:
            #This is a bit arbitrary but seems to work ok
            nlist = 1000
        index = faiss.IndexIVFFlat(quantizer, dim, nlist)
        index.train(embeddings)

    # Add embeddings to the new FAISS index
    index.add(embeddings)

    # Execute the search and return the positions to be removed and sample results
    to_remove, sample_results = search(index, df_embeddings, num_sample_results=num_sample_results)
    return to_remove, sample_results


Execute the search.

The embedding creation and the subsequent search are run once over the whole filtered dataset; the data is not divided by 'bias_term' here.

In [10]:
print (f"Length of original dataframe:{len(data)}")

# Number of example duplicate pairs to collect. Need only be non-zero for testing purposes
num_samples_per_term=500

start = time.time()

dd = []
sample_results_all = []
# Find the row positions of near-duplicates across the filtered dataframe
to_remove, sample_results = get_duplicates(data, num_samples_per_term)
print(f" Number of duplicate terms identified {len(to_remove)}")

# Keep only the rows whose position was not marked for removal
keep_positions = [ix for ix in range(len(data)) if ix not in to_remove]
df_dd = data[keep_positions]

# Collect the sample pairs so they can be inspected and exported later
sample_results_all.extend(sample_results)

df_dup = pl.DataFrame(sample_results_all)
print (f"Length of deduplicated dataframe:{len(df_dd)}")

end = time.time()
print(f"Search execution time: {end - start}")

Length of original dataframe:2616
Determined dimension:384
Processing batch starting at index: 2000 End index: 2616 Size of remove list: 30 Number of duplicate terms identified 32
Length of deduplicated dataframe:2584
Search execution time: 5.673119306564331


Export the deduplicated dataframe

In [11]:
# For each original split, collect the ids that survived deduplication:
# keep the df_dd rows whose 'id' belongs to that split and read the 'id' column back out.

ids_dd_train = df_dd.filter(pl.col("id").is_in(ids_train)).get_column("id").to_list()
ids_dd_val = df_dd.filter(pl.col("id").is_in(ids_val)).get_column("id").to_list()
ids_dd_test = df_dd.filter(pl.col("id").is_in(ids_test)).get_column("id").to_list()


In [12]:
# For each split, get the ids that are in the original id list but not in the
# deduplicated id list, i.e. the ids that were removed as near-duplicates.
# Sets give constant-time membership tests; the original split order is preserved.

kept_train = set(ids_dd_train)
kept_val = set(ids_dd_val)
kept_test = set(ids_dd_test)

train_dup_ids = [doc_id for doc_id in ids_train if doc_id not in kept_train]
val_dup_ids = [doc_id for doc_id in ids_val if doc_id not in kept_val]
test_dup_ids = [doc_id for doc_id in ids_test if doc_id not in kept_test]


In [13]:
# Report how many ids were removed from each split
print (f"Number of duplicate train ids: {len(train_dup_ids)}")
print (f"Number of duplicate val ids: {len(val_dup_ids)}")
print (f"Number of duplicate test ids: {len(test_dup_ids)}")


Number of duplicate train ids: 23
Number of duplicate val ids: 2
Number of duplicate test ids: 7


In [14]:
# Export the ids of the train rows that were removed as duplicates to csv

import polars as pl

# Convert the list of removed train IDs to a single-column ('id') Polars DataFrame
df_train_ids = pl.DataFrame({"id": train_dup_ids})

# Export the DataFrame to a CSV file
df_train_ids.write_csv("train_dup_ids.csv")


In [15]:
# Convert the list of removed test IDs to a single-column ('id') Polars DataFrame
df_test_ids = pl.DataFrame({"id": test_dup_ids})

# Export the DataFrame to a CSV file
df_test_ids.write_csv("test_dup_ids.csv")

# The val split also has removed ids (val_dup_ids); export them as well so that
# every split's removed ids end up in a CSV file.
df_val_ids = pl.DataFrame({"id": val_dup_ids})
df_val_ids.write_csv("val_dup_ids.csv")

In [18]:
# Only write the sample file when at least one duplicate pair was collected
if df_dup.height > 0:
    df_dup.write_json("duplicate-samples.json")

In [17]:
# Display the sampled duplicate pairs for inspection
df_dup

Original_Index,Original_ID,Original_Text,Near_Duplicate_Index,Near_Duplicate_ID,Near_Duplicate_Text,Distance
i64,str,str,i64,str,str,f64
9,"""Kaggle::techma…","""Graduate Teach…",47,"""Kaggle::techma…","""Graduate Teach…",0.16521
20,"""Kaggle::techma…","""Provide mentor…",2146,"""Kaggle::techma…","""Qualifications…",0.263974
38,"""Kaggle::techma…","""Role Responsib…",43,"""Kaggle::techma…","""Description M…",0.035165
39,"""Synthetic:gpt-…","""Company Backgr…",927,"""Synthetic:gpt-…","""Company: Houst…",0.286639
66,"""Synthetic:meta…","""Company: Johns…",2036,"""Synthetic:meta…","""Company Backgr…",0.279248
80,"""Synthetic:gpt-…","""Animal Breeder…",1981,"""Synthetic:gpt-…","""Company Backgr…",0.252409
123,"""Kaggle::techma…","""Proven operati…",510,"""Kaggle::techma…","""Proven operati…",0.008866
146,"""Kaggle::techma…","""Requirements: …",2353,"""Kaggle::techma…","""Requirements …",0.201041
197,"""Synthetic:meta…","""Job Title: Pul…",386,"""Synthetic:meta…","""Company: Nicho…",0.229251
269,"""Synthetic:gpt-…","""Company Backgr…",1956,"""Synthetic:gpt-…","""Company Backgr…",0.291669


In [21]:
# prompt: export each row of the df_dup to a separate text file

# One file per sampled duplicate pair, named text_<row position>.txt,
# holding one "column: value" line per column.
for index in range(df_dup.height):
    filename = f"text_{index}.txt"
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding and non-ASCII characters in the texts cannot break the write.
    with open(filename, "w", encoding="utf-8") as f:
        for col in df_dup.columns:  # Iterate through each column
            f.write(f"{col}: {df_dup[col][index]}\n")  # Write column name and value

In [22]:
# prompt: wrap each of the text_*.txt files in a zip

# Bundle the per-row text files written above into a single archive (notebook shell magic).
!zip -r text_files.zip text_*.txt


  adding: text_0.txt (deflated 65%)
  adding: text_10.txt (deflated 61%)
  adding: text_11.txt (deflated 61%)
  adding: text_12.txt (deflated 68%)
  adding: text_13.txt (deflated 70%)
  adding: text_14.txt (deflated 70%)
  adding: text_15.txt (deflated 70%)
  adding: text_16.txt (deflated 59%)
  adding: text_17.txt (deflated 63%)
  adding: text_18.txt (deflated 55%)
  adding: text_19.txt (deflated 56%)
  adding: text_1.txt (deflated 60%)
  adding: text_20.txt (deflated 59%)
  adding: text_21.txt (deflated 60%)
  adding: text_22.txt (deflated 60%)
  adding: text_23.txt (deflated 59%)
  adding: text_24.txt (deflated 59%)
  adding: text_25.txt (deflated 67%)
  adding: text_26.txt (deflated 59%)
  adding: text_27.txt (deflated 59%)
  adding: text_28.txt (deflated 59%)
  adding: text_29.txt (deflated 58%)
  adding: text_2.txt (deflated 74%)
  adding: text_30.txt (deflated 72%)
  adding: text_31.txt (deflated 66%)
  adding: text_3.txt (deflated 59%)
  adding: text_4.txt (deflated 58%)
  addi