## PHASE 2

In [1]:
import pandas as pd

# Task 1: Load the dataset
# The CSV path is relative, so it is resolved against the notebook's working directory.
csv_path = 'supply_chain_messy_data.csv'
df = pd.read_csv(csv_path)
print("Dataset Loaded Successfully.")

Dataset Loaded Successfully.


In [2]:
# Task 1. Remove Duplicates First (Crucial Step)
# Rows sharing a Transaction_ID are collapsed to their first occurrence.
# This runs BEFORE imputation so repeated rows cannot skew the fill statistics.
df = df.drop_duplicates(subset=['Transaction_ID'], keep='first')
print(f"Duplicates removed. New shape: {df.shape}")


Duplicates removed. New shape: (5000, 8)


In [3]:
# Task 2. Handling Missing Data
# Each column is filled with a statistic suited to its type; the chosen value
# is printed so the imputation is auditable.

# STRATEGY 1: Median Imputation (For Unit_Price)
# We use Median because prices can have outliers (expensive laptops vs cheap cables).
price_median = df['Unit_Price'].median()
df['Unit_Price'] = df['Unit_Price'].fillna(price_median)
print(f"Filled missing Unit_Price with Median: ${price_median}")

# STRATEGY 2: Mean Imputation (For Customer_Rating)
# We use Mean to maintain the overall average satisfaction score.
# The mean is rounded to one decimal place before it is used as the fill value.
rating_mean = round(df['Customer_Rating'].mean(), 1)
df['Customer_Rating'] = df['Customer_Rating'].fillna(rating_mean)
print(f"Filled missing Customer_Rating with Mean: {rating_mean}")

# STRATEGY 3: Mode Imputation (For Shipping_Method)
# Since this is text (categorical), we must fill with the most frequent value.
# The text clean-up (Title Case + strip) only happens in a later cell, so the raw
# column may still hold case/whitespace variants of the same label. Counting those
# variants separately could pick the wrong "most frequent" value, so the mode is
# computed on normalized labels. Missing values stay missing through the .str
# operations and are ignored by mode().
normalized_shipping = df['Shipping_Method'].str.strip().str.title()
shipping_mode = normalized_shipping.mode()[0]
df['Shipping_Method'] = df['Shipping_Method'].fillna(shipping_mode)
print(f"Filled missing Shipping_Method with Mode: {shipping_mode}")

# Verify Cleanliness
print("\n--- Remaining Null Values ---")
print(df.isnull().sum())

Filled missing Unit_Price with Median: $1021.95
Filled missing Customer_Rating with Mean: 3.0
Filled missing Shipping_Method with Mode: Overnight

--- Remaining Null Values ---
Transaction_ID      0
Date                0
Product_Category    0
Unit_Price          0
Quantity            0
Shipping_Cost       0
Shipping_Method     0
Customer_Rating     0
dtype: int64


In [4]:
# Task 3. Fix Inconsistencies in Text
# Both label columns get the same two-step clean-up:
#   1. Title Case  -> 'smartphone', 'SMARTPHONE' become 'Smartphone'
#   2. Strip       -> hidden spaces are removed, e.g. "Laptop " becomes "Laptop"
text_columns = ['Product_Category', 'Shipping_Method']
for column in text_columns:
    df[column] = df[column].str.title().str.strip()

print("Text columns standardized to Title Case and stripped of whitespace.\n")


Text columns standardized to Title Case and stripped of whitespace.



In [5]:
# Task 4. Convert Data Types

# Cleaning Phase Data Types
# Print the dtypes before conversion so the effect of the steps below is visible.
print("---- Original Types: ----\n", df.dtypes)

#  Step 1. Convert Date Column to DateTime
# This allows us to extract Month, Year, or Day later.
df['Date'] = pd.to_datetime(df['Date'])

# Step 2. Convert Text Columns to Category
# This reduces memory usage and sets them up for machine learning.
cols_to_category = ['Product_Category', 'Shipping_Method']
for col in cols_to_category:
    df[col] = df[col].astype('category')

print("\n--- New Data Types ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

---- Original Types: ----
 Transaction_ID       object
Date                 object
Product_Category     object
Unit_Price          float64
Quantity              int64
Shipping_Cost       float64
Shipping_Method      object
Customer_Rating     float64
dtype: object

--- New Data Types ---
<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Transaction_ID    5000 non-null   object        
 1   Date              5000 non-null   datetime64[ns]
 2   Product_Category  5000 non-null   category      
 3   Unit_Price        5000 non-null   float64       
 4   Quantity          5000 non-null   int64         
 5   Shipping_Cost     5000 non-null   float64       
 6   Shipping_Method   5000 non-null   category      
 7   Customer_Rating   5000 non-null   float64       
dtypes: category(2), datetime64[ns](1), float64(3), int64(1), object(1)

### Non-Visual Bivariate Analysis

In [6]:
# 1. Categorical vs Categorical: Cross-Tabulation
# Comparison: Shipping Method (rows) vs. Product Category (columns)
# Goal: See if specific products are shipped differently.
# Each cell of the table is the number of rows with that method/category pair.
shipping_methods = df['Shipping_Method']
product_categories = df['Product_Category']
cat_vs_cat = pd.crosstab(index=shipping_methods, columns=product_categories)
print(cat_vs_cat)

Product_Category  Headphones  Laptop  Monitor  Smartphone  Smartwatch  Tablet
Shipping_Method                                                              
Express                  198     203      185         200         212     191
International            206     192      193         205         198     168
Overnight                246     266      206         262         247     246
Standard Ground          197     206      174         216         188     195


In [7]:
# 2. Categorical vs Numerical: GroupBy Aggregation
# Comparison: Product Category vs. Unit Price & Customer Rating
# Goal: Find the average price and average rating for each product type.
# observed=True limits the groups to categories that actually occur in the data.
metric_columns = ['Unit_Price', 'Customer_Rating']
by_category = df.groupby('Product_Category', observed=True)
cat_vs_num = by_category[metric_columns].mean().round(2)
print(cat_vs_num)

                  Unit_Price  Customer_Rating
Product_Category                             
Headphones           1026.82             3.06
Laptop               1015.85             2.90
Monitor              1014.73             2.96
Smartphone           1016.86             2.93
Smartwatch           1007.53             3.05
Tablet               1037.75             2.94


In [8]:

# 3. Numerical vs Numerical: Correlation Matrix
# Goal: Check if Unit Price, Quantity, Shipping Cost and Customer Rating are related.
# Only the numerical columns relevant to the analysis are selected; the pairwise
# correlation coefficients are rounded to two decimals for display.
numeric_columns = ['Unit_Price', 'Quantity', 'Shipping_Cost', 'Customer_Rating']
numeric_view = df[numeric_columns]
num_vs_num = numeric_view.corr().round(2)
print(num_vs_num)

                 Unit_Price  Quantity  Shipping_Cost  Customer_Rating
Unit_Price             1.00      0.00          -0.01             0.03
Quantity               0.00      1.00          -0.01            -0.01
Shipping_Cost         -0.01     -0.01           1.00             0.02
Customer_Rating        0.03     -0.01           0.02             1.00
