# Data Deduplication and Cleaning Techniques

This notebook covers multiple methods to identify and remove duplicate or similar records using techniques such as hashing, fuzzy matching, cosine similarity, and clustering.

## BASIC METHOD

In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv("complex_sales_data.csv")


# Check for duplicate records
print("Exact Duplicates:", df.duplicated().sum())
print("Duplicate Order_IDs:", df.duplicated(subset=['Order_ID']).sum())
print("Duplicate Customer Orders:", df.duplicated(subset=['Customer_Name', 'Product_Name', 'Order_Date']).sum())


Exact Duplicates: 1
Duplicate Order_IDs: 5
Duplicate Customer Orders: 2


In [32]:
# Trim spaces and convert names to lowercase
df["Customer_Name"] = df["Customer_Name"].str.strip().str.lower()

# Standardize date format
df["Order_Date"] = pd.to_datetime(df["Order_Date"], errors="coerce", dayfirst=True)  # Handle different formats

# Display cleaned data
print(df.head())


   Order_ID       Customer_Name Product_Category Product_Name  Quantity  \
0  0844e15d      jonathan perez      Electronics   Smartphone         2   
1  b3472ab1      william nelson        Groceries         Eggs         5   
2  d1dbe108     lawrence arnold        Groceries        Bread         2   
3  42ec140d  dr. robert johnson           Beauty    Sunscreen         2   
4  c8266aea           juan snow             Toys         Doll         8   

   Price_Per_Unit  Total_Amount Order_Date  
0          435.64        871.28 2024-11-24  
1          132.37        661.85 2023-10-29  
2          237.93        475.86 2024-05-15  
3          249.63        499.26 2023-05-31  
4          450.45       3603.60 2024-05-17  


  df["Order_Date"] = pd.to_datetime(df["Order_Date"], errors="coerce", dayfirst=True)  # Handle different formats


In [33]:
df = df.drop_duplicates()
print("After removing exact duplicates:", df.shape)


After removing exact duplicates: (9, 8)


In [34]:
df = df.groupby(["Customer_Name", "Product_Name", "Order_Date"], as_index=False).agg({
    "Quantity": "sum",
    "Total_Amount": "sum",
    "Price_Per_Unit": "first",  # Assuming price remains the same
    "Order_ID": "first"  # Keep one order ID
})
print("After merging partial duplicates:", df.shape)


After merging partial duplicates: (7, 7)


### fuzzywuzzy

In [35]:
from fuzzywuzzy import fuzz, process

# Function to find and replace similar names
def standardize_names(df, threshold=85):
    unique_names = df["Customer_Name"].unique()
    name_mapping = {}

    for name in unique_names:
        match = process.extractOne(name, unique_names, scorer=fuzz.token_sort_ratio)
        if match[1] >= threshold and match[0] != name:
            name_mapping[name] = match[0]

    df["Customer_Name"] = df["Customer_Name"].replace(name_mapping)
    return df

# Apply fuzzy deduplication
df = standardize_names(df)
print("After fixing fuzzy duplicates:", df.shape)


After fixing fuzzy duplicates: (7, 7)




In [36]:
df.to_csv("cleaned_sales_data.csv", index=False)
print("Cleaned data saved successfully!")


Cleaned data saved successfully!


## Advance methods

#### Dataset

In [10]:
import pandas as pd
data = pd.read_csv("steam-200k.csv")
data.columns = ["id","game_title","use_type","play_hr","x"]

data_cosine = data

In [11]:
data.head(10)

Unnamed: 0,id,game_title,use_type,play_hr,x
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0
5,151603712,Fallout New Vegas,purchase,1.0,0
6,151603712,Fallout New Vegas,play,12.1,0
7,151603712,Left 4 Dead 2,purchase,1.0,0
8,151603712,Left 4 Dead 2,play,8.9,0
9,151603712,HuniePop,purchase,1.0,0


#### Hash-Based Deduplication


In [12]:
import hashlib

# Define columns to hash
cols_to_hash = ['id', 'game_title']

# Generate hash for each row based on specific columns
data['hash'] = data.apply(lambda x: hashlib.md5(''.join(str(x[col]) for col in cols_to_hash).encode()).hexdigest(), axis=1)

# Sort so that 'play' comes before 'purchase'
data.sort_values(by='use_type', ascending=False, inplace=True)

# Count duplicates (excluding the first occurrence, which is 'play' due to sorting)
duplicate_count = data.duplicated(subset=['hash']).sum()
print(f"Total duplicates found: {duplicate_count}")

# Drop duplicates, keeping the 'play' version
data = data.drop_duplicates(subset=['hash']).drop(columns=['hash'])




Total duplicates found: 71195


In [13]:
# Show the result
pd.DataFrame(data)

Unnamed: 0,id,game_title,use_type,play_hr,x
65779,61506388,Burnout Paradise The Ultimate Box,purchase,1.0,0
136815,83770731,Warlock - Master of the Arcane Powerful Lords,purchase,1.0,0
87392,190022459,7 Days to Die,purchase,1.0,0
136812,83770731,Toybox Turbos,purchase,1.0,0
87390,190022459,Goat Simulator,purchase,1.0,0
...,...,...,...,...,...
43653,152959594,Dirty Bomb,purchase,1.0,0
44657,83849502,Medieval II Total War,purchase,1.0,0
184120,214913026,Tactical Intervention,purchase,1.0,0
44633,83849502,Deadlight Original Soundtrack,purchase,1.0,0


In [14]:
# Check for duplicate records
print("Exact Duplicates:", data.duplicated().sum())
# Check for duplicate game_title values within each id group
duplicates_within_id = data.duplicated(subset=['id', 'game_title'])
print("Duplicate game_title within each id:", duplicates_within_id.sum())



Exact Duplicates: 0
Duplicate game_title within each id: 0


####  Cosine Similarity for Text Deduplication

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load your data
# data = pd.read_csv("your_data.csv")

# Example simulated dataset
data = pd.DataFrame({
    'id': np.random.randint(1, 21, 10000),
    'game_title': ['game title ' + str(i % 100) for i in range(10000)]
})

# Get unique ids and chunk size (25%)
unique_ids = data['id'].unique()
num_ids = len(unique_ids)
chunk_size = max(1, int(0.25 * num_ids))

# Final result
final_df = pd.DataFrame(columns=data.columns)

# Threshold for cosine similarity (1 - similarity since NearestNeighbors uses distance)
similarity_threshold = 0.25

for start in range(0, num_ids, chunk_size):
    id_chunk = unique_ids[start:start + chunk_size]
    chunk_data = data[data['id'].isin(id_chunk)].copy()

    if chunk_data.shape[0] < 2:
        final_df = pd.concat([final_df, chunk_data], ignore_index=True)
        continue

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(chunk_data["game_title"])

    # Fit Nearest Neighbors with cosine metric
    nn = NearestNeighbors(metric='cosine', radius=similarity_threshold)
    nn.fit(tfidf_matrix)

    # Find all pairs within the similarity threshold
    distances, indices = nn.radius_neighbors(tfidf_matrix)

    duplicate_indices = set()
    for i, neighbors in enumerate(indices):
        for j in neighbors:
            if i < j:
                duplicate_indices.add(j)

    # Drop identified duplicates
    chunk_data = chunk_data.drop(chunk_data.index[list(duplicate_indices)])

    final_df = pd.concat([final_df, chunk_data], ignore_index=True)

# Show deduplicated result
print(final_df)


#####  Clustering-Based Deduplication (DBSCAN)

In [None]:
#####  Clustering-Based Deduplication (DBSCAN)

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
import numpy as np

data = pd.read_csv("steam-200k.csv")
data.columns = ["id","game_title","use_type","play_hr","x"]



# Encode categorical values as numbers
encoder = LabelEncoder()
data["Customer_Encoded"] = encoder.fit_transform(data["game_title"])

# Apply DBSCAN clustering
clustering = DBSCAN(eps=1, min_samples=2).fit(np.array(data["Customer_Encoded"]).reshape(-1, 1))
data["Cluster"] = clustering.labels_

# Remove records that belong to the same cluster
data = data[data["Cluster"] == -1].drop(columns=["Customer_Encoded", "Cluster"])

print(data)

