In [3]:
import pandas as pd

# Load dataset
df = pd.read_csv("complex_sales_data.csv")

# Display basic info
print(df.info())
print(df.head())

# Check for duplicate records
print("Exact Duplicates:", df.duplicated().sum())
print("Duplicate Order_IDs:", df.duplicated(subset=['Order_ID']).sum())
print("Duplicate Customer Orders:", df.duplicated(subset=['Customer_Name', 'Product_Name', 'Order_Date']).sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Order_ID          10 non-null     object 
 1   Customer_Name     10 non-null     object 
 2   Product_Category  10 non-null     object 
 3   Product_Name      10 non-null     object 
 4   Quantity          10 non-null     int64  
 5   Price_Per_Unit    10 non-null     float64
 6   Total_Amount      10 non-null     float64
 7   Order_Date        10 non-null     object 
dtypes: float64(2), int64(1), object(5)
memory usage: 768.0+ bytes
None
   Order_ID       Customer_Name Product_Category Product_Name  Quantity  \
0  0844e15d      Jonathan Perez      Electronics   Smartphone         2   
1  b3472ab1      William Nelson        Groceries         Eggs         5   
2  d1dbe108     Lawrence Arnold        Groceries        Bread         2   
3  42ec140d  Dr. Robert Johnson           Beauty    Sunsc

In [4]:
# Trim spaces and convert names to lowercase
df["Customer_Name"] = df["Customer_Name"].str.strip().str.lower()

# Standardize date format
df["Order_Date"] = pd.to_datetime(df["Order_Date"], errors="coerce", dayfirst=True)  # Handle different formats

# Display cleaned data
print(df.head())


   Order_ID       Customer_Name Product_Category Product_Name  Quantity  \
0  0844e15d      jonathan perez      Electronics   Smartphone         2   
1  b3472ab1      william nelson        Groceries         Eggs         5   
2  d1dbe108     lawrence arnold        Groceries        Bread         2   
3  42ec140d  dr. robert johnson           Beauty    Sunscreen         2   
4  c8266aea           juan snow             Toys         Doll         8   

   Price_Per_Unit  Total_Amount Order_Date  
0          435.64        871.28 2024-11-24  
1          132.37        661.85 2023-10-29  
2          237.93        475.86 2024-05-15  
3          249.63        499.26 2023-05-31  
4          450.45       3603.60 2024-05-17  


  df["Order_Date"] = pd.to_datetime(df["Order_Date"], errors="coerce", dayfirst=True)  # Handle different formats


In [5]:
df = df.drop_duplicates()
print("After removing exact duplicates:", df.shape)


After removing exact duplicates: (9, 8)


In [6]:
df = df.groupby(["Customer_Name", "Product_Name", "Order_Date"], as_index=False).agg({
    "Quantity": "sum",
    "Total_Amount": "sum",
    "Price_Per_Unit": "first",  # Assuming price remains the same
    "Order_ID": "first"  # Keep one order ID
})
print("After merging partial duplicates:", df.shape)


After merging partial duplicates: (7, 7)


In [8]:
from fuzzywuzzy import fuzz, process

# Function to find and replace similar names
def standardize_names(df, threshold=85):
    unique_names = df["Customer_Name"].unique()
    name_mapping = {}

    for name in unique_names:
        match = process.extractOne(name, unique_names, scorer=fuzz.token_sort_ratio)
        if match[1] >= threshold and match[0] != name:
            name_mapping[name] = match[0]

    df["Customer_Name"] = df["Customer_Name"].replace(name_mapping)
    return df

# Apply fuzzy deduplication
df = standardize_names(df)
print("After fixing fuzzy duplicates:", df.shape)


After fixing fuzzy duplicates: (7, 7)




In [9]:
df.to_csv("cleaned_sales_data.csv", index=False)
print("Cleaned data saved successfully!")


Cleaned data saved successfully!


## Advance methods

##### dataset

In [10]:
import pandas as pd
import numpy as np
import random
import hashlib

# Set seed for reproducibility
random.seed(42)

# Generate random sample data
data = [
    {"Customer_Name": "John Doe", "Email": "johndoe@gmail.com", "Address": "123 Main St", "Product": "Laptop", "Description": "Powerful laptop with 16GB RAM", "Price": 1000},
    {"Customer_Name": "J. Doe", "Email": "jdoe@gmail.com", "Address": "123 Main Street", "Product": "Laptop", "Description": "16GB RAM high-performance laptop", "Price": 1000},
    {"Customer_Name": "Jane Smith", "Email": "janesmith@yahoo.com", "Address": "456 Oak Rd", "Product": "Smartphone", "Description": "5G smartphone with great battery", "Price": 700},
    {"Customer_Name": "Jane S.", "Email": "jsmith@yahoo.com", "Address": "456 Oak Road", "Product": "Smartphone", "Description": "Smartphone with 5G & long battery life", "Price": 700},
    {"Customer_Name": "Alice Brown", "Email": "alice.brown@outlook.com", "Address": "789 Pine Ave", "Product": "Headphones", "Description": "Noise-canceling headphones", "Price": 300},
    {"Customer_Name": "Alice B.", "Email": "aliceb@outlook.com", "Address": "789 Pine Avenue", "Product": "Headphones", "Description": "Premium noise-canceling over-ear headphones", "Price": 300},
]

# Duplicate some records (introducing exact duplicates)
data.extend(random.sample(data, 3))

# Convert to DataFrame
df = pd.DataFrame(data)

# Shuffle the dataset
df = df.sample(frac=1).reset_index(drop=True)

# Display dataset
print(df)


  Customer_Name                    Email          Address     Product  \
0   Alice Brown  alice.brown@outlook.com     789 Pine Ave  Headphones   
1   Alice Brown  alice.brown@outlook.com     789 Pine Ave  Headphones   
2      Alice B.       aliceb@outlook.com  789 Pine Avenue  Headphones   
3    Jane Smith      janesmith@yahoo.com       456 Oak Rd  Smartphone   
4        J. Doe           jdoe@gmail.com  123 Main Street      Laptop   
5      Alice B.       aliceb@outlook.com  789 Pine Avenue  Headphones   
6       Jane S.         jsmith@yahoo.com     456 Oak Road  Smartphone   
7      John Doe        johndoe@gmail.com      123 Main St      Laptop   
8      John Doe        johndoe@gmail.com      123 Main St      Laptop   

                                   Description  Price  
0                   Noise-canceling headphones    300  
1                   Noise-canceling headphones    300  
2  Premium noise-canceling over-ear headphones    300  
3             5G smartphone with great batter

#### Hash-Based Deduplication


In [11]:
### Hash-Based Deduplication

# Generate unique hash for each row
df['hash'] = df.apply(lambda x: hashlib.md5(str(x.values).encode()).hexdigest(), axis=1)

# Drop duplicates based on hash
print(df['hash'])
df = df.drop_duplicates(subset=['hash']).drop(columns=['hash'])

print(df)


0    5f13addccb7dd3a034fd94a3a8fbbd64
1    5f13addccb7dd3a034fd94a3a8fbbd64
2    5a68b07bd3a5c7867b60fd5193bbb630
3    405403b55dbdd0bc9e357e9b094ffa8a
4    cda41a99f6d90f112a46f48b17c69023
5    5a68b07bd3a5c7867b60fd5193bbb630
6    94e7a1d368797f5a0e060035dd8dc50f
7    22adf030eab26379827de26b35153749
8    22adf030eab26379827de26b35153749
Name: hash, dtype: object
  Customer_Name                    Email          Address     Product  \
0   Alice Brown  alice.brown@outlook.com     789 Pine Ave  Headphones   
2      Alice B.       aliceb@outlook.com  789 Pine Avenue  Headphones   
3    Jane Smith      janesmith@yahoo.com       456 Oak Rd  Smartphone   
4        J. Doe           jdoe@gmail.com  123 Main Street      Laptop   
6       Jane S.         jsmith@yahoo.com     456 Oak Road  Smartphone   
7      John Doe        johndoe@gmail.com      123 Main St      Laptop   

                                   Description  Price  
0                   Noise-canceling headphones    300  
2  Premi

####  Cosine Similarity for Text Deduplication

In [12]:
####  Cosine Similarity for Text Deduplication

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize text descriptions
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["Description"])

# Compute cosine similarity
cos_sim = cosine_similarity(tfidf_matrix)

# Identify duplicate indices (threshold = 0.85)
duplicate_indices = set()
for i in range(len(cos_sim)):
    for j in range(i + 1, len(cos_sim)):
        if cos_sim[i, j] > 0.85:
            duplicate_indices.add(j)

# Drop duplicate descriptions
df = df.drop(list(duplicate_indices))

print(df)


  Customer_Name                    Email          Address     Product  \
0   Alice Brown  alice.brown@outlook.com     789 Pine Ave  Headphones   
2      Alice B.       aliceb@outlook.com  789 Pine Avenue  Headphones   
3    Jane Smith      janesmith@yahoo.com       456 Oak Rd  Smartphone   
4        J. Doe           jdoe@gmail.com  123 Main Street      Laptop   
6       Jane S.         jsmith@yahoo.com     456 Oak Road  Smartphone   
7      John Doe        johndoe@gmail.com      123 Main St      Laptop   

                                   Description  Price  
0                   Noise-canceling headphones    300  
2  Premium noise-canceling over-ear headphones    300  
3             5G smartphone with great battery    700  
4             16GB RAM high-performance laptop   1000  
6       Smartphone with 5G & long battery life    700  
7                Powerful laptop with 16GB RAM   1000  


#####  Clustering-Based Deduplication (DBSCAN)

In [13]:
#####  Clustering-Based Deduplication (DBSCAN)

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Encode categorical values as numbers
encoder = LabelEncoder()
df["Customer_Encoded"] = encoder.fit_transform(df["Customer_Name"])

# Apply DBSCAN clustering
clustering = DBSCAN(eps=1, min_samples=2).fit(np.array(df["Customer_Encoded"]).reshape(-1, 1))
df["Cluster"] = clustering.labels_

# Remove records that belong to the same cluster
df = df[df["Cluster"] == -1].drop(columns=["Customer_Encoded", "Cluster"])

print(df)



Empty DataFrame
Columns: [Customer_Name, Email, Address, Product, Description, Price]
Index: []
