In [1]:
# Cell 1: Import Libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

# Show where relative paths will be resolved from.
working_dir = os.getcwd()
print("Current Working Directory:", working_dir)

# Cell 2: Load the Raw Dataset
# The path is written as a raw string (r'...') so that no character
# sequence in it is interpreted as an escape.
raw_csv_path = r'../data/Supermart_Grocery_Sales.csv'
data = pd.read_csv(raw_csv_path)
print("✅ Raw Data Overview:")
preview = data.head()
print(preview)

# Cell 3: Standardize Column Names (remove extra spaces and commas)
# Trim surrounding whitespace, turn spaces into underscores and drop commas.
# regex=False forces each replacement to be a literal substring match, so the
# result does not depend on the installed pandas version's default for `regex`.
data.columns = (
    data.columns.str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace(',', '', regex=False)
)
print("\n✅ Standardized Columns:")
print(data.columns.tolist())

# Cell 4: Convert 'Order_Date' to datetime format
# errors='coerce' turns every value that cannot be parsed into NaT instead of
# raising; dayfirst=True reads ambiguous dates as day-month-year.
data['Order_Date'] = pd.to_datetime(data['Order_Date'], errors='coerce', dayfirst=True)

# Report how many rows are about to be discarded so that a parsing problem
# does not pass unnoticed.
# NOTE(review): if the file mixes several date formats, a large count here
# likely means valid dates are being coerced to NaT — confirm against the raw data.
invalid_date_count = int(data['Order_Date'].isna().sum())
if invalid_date_count:
    print(f"\n⚠️ Dropping {invalid_date_count} row(s) with missing or unparseable 'Order_Date'")
data.dropna(subset=['Order_Date'], inplace=True)  # drop rows with invalid dates

# Cell 5: Extract Month (full name) and Year
# month_name() returns English month names by default, whereas
# strftime('%B') follows the process locale and could yield localized names.
data['Month'] = data['Order_Date'].dt.month_name()  # e.g., August, December
# Store the year in pandas' nullable integer dtype.
data['Order_Year'] = data['Order_Date'].dt.year.astype('Int64')
print("\n✅ After Date Extraction:")
print(data[['Order_Date', 'Month', 'Order_Year']].head())

# Cell 6: Handle Missing Values in Critical Columns
# A row is removed when any one of these columns is missing.
required_cols = ['Order_ID', 'Customer_Name', 'Sales']
data.dropna(axis=0, how='any', subset=required_cols, inplace=True)

# Cell 7: Encode Categorical Variables (except Month which is used as label for display)
# Each column gets its own fitted LabelEncoder, kept in `label_encoders`, so
# the integer codes can later be mapped back to the original labels with
# label_encoders[col].inverse_transform(...). Refitting a single shared
# encoder would overwrite the mapping of every column but the last.
categorical_cols = ['Category', 'Sub_Category', 'City', 'Region', 'State']
label_encoders = {}
for col in categorical_cols:
    # `le` stays bound after the loop (to the encoder of the last column),
    # as it was when a single encoder was reused.
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Cell 8: Save the Cleaned Data
# Write the processed frame next to the raw file, without the index column.
cleaned_csv_path = '../data/cleaned_supermart_data.csv'
data.to_csv(cleaned_csv_path, index=False)
completion_message = "\n✅ Data Preprocessing Completed Successfully! Cleaned data saved to '../data/cleaned_supermart_data.csv'"
print(completion_message)


Current Working Directory: d:\project1\Supermart_Analytics_Project\notebooks
✅ Raw Data Overview:
  Order ID Customer Name          Category      Sub Category         City  \
0      OD1        Harish      Oil & Masala           Masalas      Vellore   
1      OD2         Sudha         Beverages     Health Drinks  Krishnagiri   
2      OD3       Hussain       Food Grains      Atta & Flour   Perambalur   
3      OD4       Jackson  Fruits & Veggies  Fresh Vegetables   Dharmapuri   
4      OD5       Ridhesh       Food Grains   Organic Staples         Ooty   

   Order Date Region  Sales  Discount  Profit       State  
0  11-08-2017  North   1254      0.12  401.28  Tamil Nadu  
1  11-08-2017  South    749      0.18  149.80  Tamil Nadu  
2  06-12-2017   West   2360      0.21  165.20  Tamil Nadu  
3  10-11-2016  South    896      0.25   89.60  Tamil Nadu  
4  10-11-2016  South   2355      0.26  918.45  Tamil Nadu  

✅ Standardized Columns:
['Order_ID', 'Customer_Name', 'Category', 'Sub_Categor