# Data Generation $\rightarrow$ Inspection $\rightarrow$ Cleaning $\rightarrow$ Analysis


## Creating the Dataset & Saving It as a CSV File

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# 1. Setup
SEED = 42  # fixed seed so a fresh kernel regenerates exactly the same dataset
rows = 5000
categories = ['Smartphone', 'Laptop', 'Headphones', 'Smartwatch', 'Tablet', 'Monitor']
shipping_methods = ['Standard Ground', 'Express', 'Overnight', 'International']

# Seed the stdlib generator (drives the clean columns below) and create one
# NumPy RandomState that is shared by every DataFrame.sample call. Sharing a
# single advancing generator keeps the three "missing value" samples
# independent of each other while still being reproducible.
random.seed(SEED)
rng = np.random.RandomState(SEED)

# 2. Generate Data
data = {
    'Transaction_ID': [f'TRX-{10000+i}' for i in range(rows)],
    # random.randint is inclusive on both ends and 2023 has 365 days, so the
    # valid day offsets are 0..364; an upper bound of 365 would leak
    # 2024-01-01 into a dataset that is meant to cover calendar year 2023.
    'Date': [datetime(2023, 1, 1) + timedelta(days=random.randint(0, 364)) for _ in range(rows)],
    'Product_Category': [random.choice(categories) for _ in range(rows)],
    'Unit_Price': [round(random.uniform(50.0, 2000.0), 2) for _ in range(rows)],
    'Quantity': [random.randint(1, 50) for _ in range(rows)],
    'Shipping_Cost': [round(random.uniform(5.0, 50.0), 2) for _ in range(rows)],
    'Shipping_Method': [random.choice(shipping_methods) for _ in range(rows)],
    'Customer_Rating': [random.randint(1, 5) for _ in range(rows)]
}

df = pd.DataFrame(data)

# 3. Make it "Dirty" (Add Nulls & Duplicates)
df.loc[df.sample(frac=0.1, random_state=rng).index, 'Customer_Rating'] = np.nan   # 10% Missing Ratings
df.loc[df.sample(frac=0.05, random_state=rng).index, 'Shipping_Method'] = np.nan  # 5% Missing Ship Method
df.loc[df.sample(frac=0.02, random_state=rng).index, 'Unit_Price'] = np.nan       # 2% Missing Price
# Re-append 200 randomly chosen existing rows so the file contains exact duplicates.
df = pd.concat([df, df.sample(n=200, random_state=rng)], ignore_index=True)      # Add 200 Duplicates

# 4. Save to your computer (written to the current working directory, without the index column)
df.to_csv('supply_chain_messy_data.csv', index=False)
print("File saved as 'supply_chain_messy_data.csv'")

File saved as 'supply_chain_messy_data.csv'


In [8]:
# Preview the generated frame; for a frame this long pandas shows only the
# first and last rows, separated by an ellipsis row.
# NOTE(review): this cell's saved execution count is out of sequence with its
# neighbours — re-run the notebook top to bottom before sharing.
df

Unnamed: 0,Transaction_ID,Date,Product_Category,Unit_Price,Quantity,Shipping_Cost,Shipping_Method,Customer_Rating
0,TRX-10000,2023-02-22,Smartphone,1407.02,48,21.55,Express,2.0
1,TRX-10001,2023-03-21,Smartphone,777.81,7,48.89,Overnight,3.0
2,TRX-10002,2023-11-08,Headphones,78.68,17,28.87,International,5.0
3,TRX-10003,2023-06-20,Laptop,1380.58,2,40.11,Overnight,
4,TRX-10004,2023-02-17,Monitor,841.01,12,46.92,Standard Ground,
...,...,...,...,...,...,...,...,...
5195,TRX-14945,2023-06-06,Smartphone,409.87,9,17.31,Express,
5196,TRX-14267,2023-04-09,Headphones,1826.91,18,49.76,International,5.0
5197,TRX-11598,2023-01-18,Smartphone,740.31,25,48.87,Standard Ground,5.0
5198,TRX-14415,2023-11-03,Smartwatch,76.98,39,25.91,Overnight,2.0


## PHASE 1

In [2]:
import pandas as pd

# Task 1: Read the previously saved CSV back from disk into a DataFrame
csv_path = 'supply_chain_messy_data.csv'
df = pd.read_csv(csv_path)
print("Dataset Loaded Successfully.")


Dataset Loaded Successfully.


In [3]:
# Task 2: View sample data
first_rows = df.head()  # head() with no argument keeps the first five rows
print(first_rows)

  Transaction_ID        Date Product_Category  Unit_Price  Quantity  \
0      TRX-10000  2023-02-22       Smartphone     1407.02        48   
1      TRX-10001  2023-03-21       Smartphone      777.81         7   
2      TRX-10002  2023-11-08       Headphones       78.68        17   
3      TRX-10003  2023-06-20           Laptop     1380.58         2   
4      TRX-10004  2023-02-17          Monitor      841.01        12   

   Shipping_Cost  Shipping_Method  Customer_Rating  
0          21.55          Express              2.0  
1          48.89        Overnight              3.0  
2          28.87    International              5.0  
3          40.11        Overnight              NaN  
4          46.92  Standard Ground              NaN  


In [4]:
# Task 3: Check structure
# DataFrame.info() writes its report (index range, per-column non-null counts,
# dtypes, memory usage) directly to stdout and returns None, so it must not be
# wrapped in print() — doing so appends a stray "None" line to the output.
df.info()  # Dataset Info (Structure & Types)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction_ID    5200 non-null   object 
 1   Date              5200 non-null   object 
 2   Product_Category  5200 non-null   object 
 3   Unit_Price        5095 non-null   float64
 4   Quantity          5200 non-null   int64  
 5   Shipping_Cost     5200 non-null   float64
 6   Shipping_Method   4941 non-null   object 
 7   Customer_Rating   4680 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 325.1+ KB
None


In [5]:
# Task 4: Get summary stats
numeric_summary = df.describe()  # Statistical Summary (Numerical)
print(numeric_summary)

        Unit_Price     Quantity  Shipping_Cost  Customer_Rating
count  5095.000000  5200.000000    5200.000000      4680.000000
mean   1027.438002    25.513462      27.785460         2.976282
std     562.611045    14.471447      12.893202         1.415148
min      50.100000     1.000000       5.010000         1.000000
25%     536.410000    13.000000      16.597500         2.000000
50%    1028.740000    25.000000      27.950000         3.000000
75%    1514.840000    38.000000      39.000000         4.000000
max    1999.890000    50.000000      50.000000         5.000000


In [6]:
# Task 5: Find missing values
missing_counts = df.isnull().sum()  # number of null cells in each column
print(missing_counts)

Transaction_ID        0
Date                  0
Product_Category      0
Unit_Price          105
Quantity              0
Shipping_Cost         0
Shipping_Method     259
Customer_Rating     520
dtype: int64


In [7]:
# Task 6: Understand categories
# Compute both frequency tables first, then print them under their headings.
category_counts = df['Product_Category'].value_counts()
shipping_counts = df['Shipping_Method'].value_counts(dropna=False)  # dropna=False keeps NaN as its own row

print("--- Top Product Categories ---")
print(category_counts)
print("\n")
print("--- Shipping Method Distribution ---")
print(shipping_counts)

--- Top Product Categories ---
Product_Category
Tablet        894
Smartphone    886
Smartwatch    871
Monitor       863
Laptop        862
Headphones    824
Name: count, dtype: int64


--- Shipping Method Distribution ---
Shipping_Method
International      1290
Express            1236
Standard Ground    1233
Overnight          1182
NaN                 259
Name: count, dtype: int64
