In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("complex_sales_data.csv")

# Display basic info
print(df.info())
print(df.head())

# Check for duplicate records
print("Exact Duplicates:", df.duplicated().sum())
print("Duplicate Order_IDs:", df.duplicated(subset=['Order_ID']).sum())
print("Duplicate Customer Orders:", df.duplicated(subset=['Customer_Name', 'Product_Name', 'Order_Date']).sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Order_ID          10 non-null     object 
 1   Customer_Name     10 non-null     object 
 2   Product_Category  10 non-null     object 
 3   Product_Name      10 non-null     object 
 4   Quantity          10 non-null     int64  
 5   Price_Per_Unit    10 non-null     float64
 6   Total_Amount      10 non-null     float64
 7   Order_Date        10 non-null     object 
dtypes: float64(2), int64(1), object(5)
memory usage: 772.0+ bytes
None
   Order_ID       Customer_Name Product_Category Product_Name  Quantity  \
0  0844e15d      Jonathan Perez      Electronics   Smartphone         2   
1  b3472ab1      William Nelson        Groceries         Eggs         5   
2  d1dbe108     Lawrence Arnold        Groceries        Bread         2   
3  42ec140d  Dr. Robert Johnson           Beauty    Sunsc

In [2]:
# Trim spaces and convert names to lowercase
df["Customer_Name"] = df["Customer_Name"].str.strip().str.lower()

# Standardize date format
df["Order_Date"] = pd.to_datetime(df["Order_Date"], errors="coerce", dayfirst=True)  # Handle different formats

# Display cleaned data
print(df.head())


   Order_ID       Customer_Name Product_Category Product_Name  Quantity  \
0  0844e15d      jonathan perez      Electronics   Smartphone         2   
1  b3472ab1      william nelson        Groceries         Eggs         5   
2  d1dbe108     lawrence arnold        Groceries        Bread         2   
3  42ec140d  dr. robert johnson           Beauty    Sunscreen         2   
4  c8266aea           juan snow             Toys         Doll         8   

   Price_Per_Unit  Total_Amount Order_Date  
0          435.64        871.28 2024-11-24  
1          132.37        661.85 2023-10-29  
2          237.93        475.86 2024-05-15  
3          249.63        499.26 2023-05-31  
4          450.45       3603.60 2024-05-17  


  df["Order_Date"] = pd.to_datetime(df["Order_Date"], errors="coerce", dayfirst=True)  # Handle different formats


In [3]:
df = df.drop_duplicates()
print("After removing exact duplicates:", df.shape)


After removing exact duplicates: (9, 8)


In [4]:
df = df.groupby(["Customer_Name", "Product_Name", "Order_Date"], as_index=False).agg({
    "Quantity": "sum",
    "Total_Amount": "sum",
    "Price_Per_Unit": "first",  # Assuming price remains the same
    "Order_ID": "first"  # Keep one order ID
})
print("After merging partial duplicates:", df.shape)


After merging partial duplicates: (7, 7)


In [6]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from fuzzywuzzy import fuzz, process

# Function to find and replace similar names
def standardize_names(df, threshold=85):
    unique_names = df["Customer_Name"].unique()
    name_mapping = {}

    for name in unique_names:
        match = process.extractOne(name, unique_names, scorer=fuzz.token_sort_ratio)
        if match[1] >= threshold and match[0] != name:
            name_mapping[name] = match[0]

    df["Customer_Name"] = df["Customer_Name"].replace(name_mapping)
    return df

# Apply fuzzy deduplication
df = standardize_names(df)
print("After fixing fuzzy duplicates:", df.shape)


After fixing fuzzy duplicates: (7, 7)




In [8]:
df.to_csv("cleaned_sales_data.csv", index=False)
print("Cleaned data saved successfully!")


Cleaned data saved successfully!
