# 🧹 Data Deduplication and Cleaning Techniques
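This notebook shows how to identify and remove duplicate or near-duplicate records. The steps below cover exact-duplicate removal, merging of partial duplicates on key columns, and fuzzy matching of customer names; related techniques such as hashing, cosine similarity, and clustering are mentioned for context but are not demonstrated in these steps.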

## 📥 Step 1: Load the Dataset

In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv("complex_sales_data.csv")

# Display basic info.
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line after the summary.
df.info()
print(df.head())

## 🔍 Step 2: Check for Duplicates

In [None]:
# Count repeated rows under each definition of "duplicate":
# every column, the order id alone, and the customer/product/date combination.
# A subset of None makes duplicated() compare all columns.
duplicate_checks = [
    ("Exact Duplicates:", None),
    ("Duplicate Order_IDs:", ['Order_ID']),
    ("Duplicate Customer Orders:", ['Customer_Name', 'Product_Name', 'Order_Date']),
]
for label, key_columns in duplicate_checks:
    print(label, df.duplicated(subset=key_columns).sum())

## 🧽 Step 3: Clean and Standardize Data

In [None]:
# Trim spaces and convert names to lowercase
df["Customer_Name"] = df["Customer_Name"].str.strip().str.lower()

# Standardize date format.
# errors="coerce" turns every unparseable value into NaT without raising, so
# count the values that were present before parsing but are missing afterwards
# and report them instead of letting them vanish silently.
raw_order_dates = df["Order_Date"]
df["Order_Date"] = pd.to_datetime(raw_order_dates, errors="coerce", dayfirst=True)
unparsed_dates = int((raw_order_dates.notna() & df["Order_Date"].isna()).sum())
if unparsed_dates:
    print(f"Warning: {unparsed_dates} Order_Date value(s) could not be parsed and were set to NaT")

print(df.head())

## ❌ Step 4: Remove Exact Duplicates

In [None]:
# Flag every row that repeats an earlier, fully identical row and keep the rest
# (the first occurrence of each row survives).
is_repeat = df.duplicated(keep="first")
df = df.loc[~is_repeat].copy()
print("After removing exact duplicates:", df.shape)

## 🔄 Step 5: Merge Partial Duplicates

In [None]:
# Group and aggregate similar records.
# Rows sharing customer, product and date are merged: quantities and amounts
# are summed, the first unit price and order id are kept. Columns not listed
# in agg_spec are not carried into the result.
#
# groupby() drops rows whose group key contains a missing value (a NaN name or
# product, or an Order_Date that was coerced to NaT). To avoid losing those
# records silently, only rows with a complete key are merged; rows with an
# incomplete key are carried over unchanged.
key_cols = ["Customer_Name", "Product_Name", "Order_Date"]
agg_spec = {
    "Quantity": "sum",
    "Total_Amount": "sum",
    "Price_Per_Unit": "first",
    "Order_ID": "first"
}
has_full_key = df[key_cols].notna().all(axis=1)
merged = df[has_full_key].groupby(key_cols, as_index=False).agg(agg_spec)
# Restrict carried-over rows to the same columns the aggregation produces.
carried_over = df.loc[~has_full_key, key_cols + list(agg_spec)]
df = pd.concat([merged, carried_over], ignore_index=True) if len(carried_over) else merged
print("After merging partial duplicates:", df.shape)

## 🤖 Step 6: Fuzzy Matching for Similar Names

In [None]:
from fuzzywuzzy import fuzz, process

# Function to find and replace similar names
def standardize_names(df, threshold=85):
    """Collapse near-identical customer names onto one canonical spelling.

    Names are visited from most to least frequent (ties keep first-appearance
    order). Each name is fuzzy-matched against the canonical spellings accepted
    so far; if the best ``fuzz.token_sort_ratio`` score reaches ``threshold``
    the name is mapped to that spelling, otherwise it becomes a new canonical
    spelling itself.

    Matching only against *other, already accepted* names matters: when a name
    is scored against a list that contains itself, it matches itself with a
    perfect score, so no genuinely fuzzy match is ever selected. Because
    canonical spellings are never remapped, two similar names cannot be swapped
    with each other either.

    Args:
        df: DataFrame with a ``Customer_Name`` column. Missing names are
            ignored and left untouched.
        threshold: Minimum similarity score (0-100) for two names to be
            treated as the same customer.

    Returns:
        The same DataFrame object, with ``Customer_Name`` updated in place.
    """
    names = df["Customer_Name"].dropna()
    frequency = names.value_counts(sort=False).to_dict()
    # sorted() is stable, so equally frequent names keep first-appearance order.
    candidates = sorted(names.unique(), key=lambda candidate: -frequency[candidate])

    canonical_names = []
    name_mapping = {}
    for name in candidates:
        match = None
        if canonical_names:
            match = process.extractOne(name, canonical_names, scorer=fuzz.token_sort_ratio)
        if match is not None and match[1] >= threshold:
            name_mapping[name] = match[0]
        else:
            canonical_names.append(name)

    if name_mapping:
        df["Customer_Name"] = df["Customer_Name"].replace(name_mapping)
    return df

# Apply fuzzy deduplication
names_before = df["Customer_Name"].nunique()
df = standardize_names(df)
print("Unique customer names:", names_before, "->", df["Customer_Name"].nunique())

# Renaming alone never changes the row count: records that now share the same
# customer/product/date key have to be merged again to actually deduplicate.
# Rows with a missing key value are carried over unchanged, because groupby()
# would otherwise drop them.
key_cols = ["Customer_Name", "Product_Name", "Order_Date"]
agg_spec = {
    "Quantity": "sum",
    "Total_Amount": "sum",
    "Price_Per_Unit": "first",
    "Order_ID": "first",
}
has_full_key = df[key_cols].notna().all(axis=1)
merged = df[has_full_key].groupby(key_cols, as_index=False).agg(agg_spec)
carried_over = df.loc[~has_full_key, key_cols + list(agg_spec)]
df = pd.concat([merged, carried_over], ignore_index=True) if len(carried_over) else merged
print("After fixing fuzzy duplicates:", df.shape)

## 💾 Step 7: Save the Cleaned Data

In [None]:
# Write the cleaned frame to disk without the index column.
output_path = "cleaned_sales_data.csv"
df.to_csv(path_or_buf=output_path, index=False)
print("Cleaned data saved successfully!")