In [4]:
# sales_analysis.ipynb - CLEAN VERSION

"""
SALES DATA ANALYSIS PROJECT
================================
Python Lead (Data Engineer)
Python Analyst (KPI Calculator)
"""

# ============================================
# DATA ENGINEER - DATA CLEANING
# ============================================

import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Opening banner for the data-engineering half of the notebook.
print("=" * 60, "DATA ENGINEER - DATA CLEANING PROCESS", "=" * 60, sep="\n")
print("\n")

# -----------------------------------------------------------------
# 1. IMPORT THE DATASET
# -----------------------------------------------------------------
print("1. IMPORTING DATASET...", "-" * 40, sep="\n")

# Read the raw sales CSV into the working DataFrame `df`.
df = pd.read_csv(r'C:\Users\CAPACITI-JHB\OneDrive\Desktop\Sales-data-analysis\Sales_Data_Analysis_Project\data\Week-2-Sales-Data.csv')

# Report what was loaded: overall shape, row/column counts, and a preview.
print("Dataset imported successfully!")
print(f"Shape of dataset: {df.shape}")  # (rows, columns)
print(f"Number of rows: {len(df)}")
print(f"Number of columns: {len(df.columns)}")
print("\nFirst 5 rows of the dataset:")
print(df.head())
print("\n")

# -----------------------------------------------------------------
# 2. INITIAL DATA EXPLORATION
# -----------------------------------------------------------------
print("2. INITIAL DATA EXPLORATION...")
print("-" * 40)

# Display column names and data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Column Information:")
df.info()
print("\n")

# Display basic statistics for numerical columns
print("Statistical Summary:")
print(df.describe())
print("\n")

# Display the distinct values of each categorical column, plus the number of
# distinct order IDs.
print("Unique values in categorical columns:")
print(f"Products: {df['Product'].unique()}")
print(f"Regions: {df['Region'].unique()}")
print(f"Sales Representatives: {df['Sales_Rep'].unique()}")
print(f"Number of unique orders: {df['Order_ID'].nunique()}")
print("\n")

# -----------------------------------------------------------------
# 3. CHECK FOR DATA QUALITY ISSUES
# -----------------------------------------------------------------
print("3. CHECKING FOR DATA QUALITY ISSUES...", "-" * 40, sep="\n")

# 3.1 Missing values: per-column null count and its share of all rows.
print("3.1 Missing Values Check:")
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)
missing_report = missing_values.to_frame('Missing Values').assign(
    Percentage=missing_percentage
)
# Only columns that actually contain nulls are shown.
columns_with_gaps = missing_report.loc[missing_report['Missing Values'].gt(0)]
print(columns_with_gaps)
if columns_with_gaps.empty:
    print("✓ No missing values found!")
print("\n")

# 3.2 Duplicates: fully repeated rows and repeated order identifiers.
print("3.2 Duplicates Check:")
duplicate_rows = df.duplicated().sum()
duplicate_order_ids = df['Order_ID'].duplicated().sum()
print(f"Total duplicate rows: {duplicate_rows}")
print(f"Duplicate Order IDs: {duplicate_order_ids}")
if not (duplicate_rows or duplicate_order_ids):
    print("✓ No duplicates found!")
print("\n")

# 3.3 Data types as inferred when the file was loaded.
print("3.3 Data Types Check:", df.dtypes, "\n", sep="\n")

# 3.4 Check for potential data inconsistencies
print("3.4 Data Consistency Checks:")
print("Checking Revenue calculation consistency...")
# Expected revenue is Units_Sold * Unit_Price. The helper column is written
# onto `df`; the cleaning section drops it from its own working copy.
df['Calculated_Revenue'] = df['Units_Sold'] * df['Unit_Price']

# Compare with the stored Revenue column using a half-cent tolerance instead of
# exact inequality, so floating-point rounding in fractional prices is not
# reported as a discrepancy. Rows where either value is missing produce a NaN
# gap, fail the "within tolerance" test, and are therefore still flagged.
REVENUE_TOLERANCE = 0.005
revenue_gap = (df['Revenue'] - df['Calculated_Revenue']).abs()
revenue_discrepancies = df[~(revenue_gap <= REVENUE_TOLERANCE)]
print(f"Revenue discrepancies found: {len(revenue_discrepancies)}")
if revenue_discrepancies.empty:
    print("✓ All revenue calculations are consistent!")
else:
    print("\nRevenue discrepancies:")
    print(revenue_discrepancies[['Order_ID', 'Units_Sold', 'Unit_Price', 'Revenue', 'Calculated_Revenue']])
print("\n")

# -----------------------------------------------------------------
# 4. DATA CLEANING PROCESS
# -----------------------------------------------------------------
print("4. PERFORMING DATA CLEANING...", "-" * 40, sep="\n")

# All cleaning happens on an independent copy so `df` keeps the raw data.
df_clean = df.copy()

# 4.1 The consistency-check helper column is not part of the dataset proper.
df_clean = df_clean.drop(columns=['Calculated_Revenue'])
print("4.1 Removed temporary calculated column")
print("\n")

# 4.2 Parse Order_Date into a datetime column and report the covered range.
print("4.2 Converting Order_Date to datetime format...")
df_clean['Order_Date'] = pd.to_datetime(df_clean['Order_Date'])
order_dates = df_clean['Order_Date']
print(f"✓ Order_Date converted to datetime: {order_dates.dtype}")
print(f"Date range: {order_dates.min()} to {order_dates.max()}")
print("\n")

# 4.3 Check for and handle any missing values (if any were found)
# When the dataset has no nulls only the else-branch runs; the fill logic is
# kept so the step still works on data that does contain gaps.
print("4.3 Missing value handling (demonstration):")
if df_clean.isnull().sum().sum() > 0:
    print("Handling missing values...")
    # For numerical columns, fill with median
    # For categorical columns, fill with mode
    numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
    categorical_cols = df_clean.select_dtypes(include=['object']).columns

    # The filled Series is assigned back to the column. Calling
    # fillna(inplace=True) on `df_clean[col]` operates on a temporary object
    # and, under pandas Copy-on-Write, never reaches df_clean.
    for col in numerical_cols:
        if df_clean[col].isnull().any():
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    for col in categorical_cols:
        if df_clean[col].isnull().any():
            modes = df_clean[col].mode()
            if modes.empty:
                # An all-null column has no mode; indexing [0] would raise.
                print(f"  Column '{col}' has no non-null values; left unchanged")
                continue
            df_clean[col] = df_clean[col].fillna(modes.iloc[0])
    print("Missing values handled.")
else:
    print("✓ No missing values to handle!")
print("\n")

# 4.4 Drop fully duplicated rows and report how many were removed.
print("4.4 Duplicate handling (demonstration):")
initial_rows = df_clean.shape[0]
df_clean = df_clean.drop_duplicates()
final_rows = df_clean.shape[0]
duplicates_removed = initial_rows - final_rows

print(
    f"Removed {duplicates_removed} duplicate rows"
    if duplicates_removed > 0
    else "✓ No duplicates to remove!"
)
print("\n")

# 4.5 Show the column dtypes as they stand after the cleaning steps so far.
print("4.5 Final data type validation:", df_clean.dtypes, "\n", sep="\n")

# 4.6 Count values lying more than 1.5 * IQR outside the quartiles, per column.
print("4.6 Outlier detection (summary):")
numerical_cols = ['Units_Sold', 'Unit_Price', 'Revenue']
for col in numerical_cols:
    # Both quartiles are fetched in a single quantile call.
    Q1, Q3 = df_clean[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    below_fence = df_clean[col].lt(lower_bound)
    above_fence = df_clean[col].gt(upper_bound)
    outliers = df_clean[below_fence | above_fence]
    print(f"{col}: {len(outliers)} potential outliers detected")
print("\n")

# -----------------------------------------------------------------
# 5. FINAL CLEANED DATASET SUMMARY
# -----------------------------------------------------------------
print("5. CLEANED DATASET SUMMARY", "-" * 40, sep="\n")

# Shape, a three-row preview, and the dtypes of the cleaned frame.
print(f"Final shape: {df_clean.shape}")
print("\nFirst 3 rows of cleaned dataset:", df_clean.head(3), sep="\n")
print("\nData types after cleaning:", df_clean.dtypes, sep="\n")

# Closing banner: blank line, rule, message, rule, then two blank lines.
print("", "=" * 60, "DATA CLEANING COMPLETED SUCCESSFULLY!", "=" * 60, "", "", sep="\n")

# ============================================
# PYTHON ANALYST - KPI CALCULATIONS
# ============================================

print("=" * 60, "PYTHON ANALYST - KPI CALCULATIONS", "=" * 60, sep="\n")
print("\n")

# -----------------------------------------------------------------
# 1. SETUP FOR ANALYSIS
# -----------------------------------------------------------------
# The analysis frame is a copy of the cleaned data extended with the month
# number, year and month name derived from Order_Date.
order_dates = df_clean['Order_Date']
df_analysis = df_clean.assign(
    Order_Month=order_dates.dt.month,
    Order_Year=order_dates.dt.year,
    Month_Name=order_dates.dt.strftime('%B'),
)

print("Added time-based columns for analysis:")
print(df_analysis.loc[:, ['Order_Date', 'Order_Month', 'Month_Name', 'Order_Year']].head())
print("\n")

# -----------------------------------------------------------------
# 2. CALCULATE KEY PERFORMANCE INDICATORS (KPIs)
# -----------------------------------------------------------------
print("2. CALCULATING KEY PERFORMANCE INDICATORS", "-" * 40, "\n", sep="\n")

# KPI 1: sum of the Revenue column across every order.
print("KPI 1: TOTAL REVENUE", "-" * 20, sep="\n")
total_revenue = df_analysis['Revenue'].sum()
print(f"Total Revenue: R {total_revenue:,.2f}")
print("\n")

# KPI 2: mean of Units_Sold across all order rows.
print("KPI 2: AVERAGE UNITS SOLD PER ORDER", "-" * 20, sep="\n")
avg_units_sold = df_analysis['Units_Sold'].mean()
print(f"Average Units Sold per Order: {avg_units_sold:.2f}")
print("\n")

# KPI 3: revenue summed per region, largest first.
print("KPI 3: TOTAL REVENUE PER REGION", "-" * 20, sep="\n")
region_totals = df_analysis.groupby('Region')['Revenue'].sum()
revenue_by_region = region_totals.sort_values(ascending=False)
print("Revenue by Region (Descending Order):")
for region_name, region_total in revenue_by_region.items():
    print(f"  {region_name}: R {region_total:,.2f}")
print("\n")

# KPI 4: revenue summed per sales representative, largest first.
print("KPI 4: HIGHEST REVENUE-GENERATING SALES REPRESENTATIVE", "-" * 20, sep="\n")
rep_totals = df_analysis.groupby('Sales_Rep')['Revenue'].sum()
revenue_by_rep = rep_totals.sort_values(ascending=False)
print("Top 5 Sales Representatives by Revenue:")
for rank, (rep_name, rep_total) in enumerate(revenue_by_rep.head().items(), start=1):
    print(f"  {rank}. {rep_name}: R {rep_total:,.2f}")

# The series is sorted descending, so its first entry is the top earner.
top_rep = revenue_by_rep.index[0]
top_rep_revenue = revenue_by_rep.iloc[0]
print(f"\n Highest Revenue Generating Sales Rep: {top_rep}")
print(f"   Total Revenue Generated: R {top_rep_revenue:,.2f}")
print("\n")

# KPI 5: units summed per product; the three largest totals are reported.
print("KPI 5: TOP 3 PRODUCTS BY TOTAL UNITS SOLD", "-" * 20, sep="\n")
product_totals = df_analysis.groupby('Product')['Units_Sold'].sum()
units_by_product = product_totals.sort_values(ascending=False)
top_3_products = units_by_product.head(3)
print("Top 3 Products by Units Sold:")
for rank, (product_name, unit_total) in enumerate(top_3_products.items(), start=1):
    print(f"  {rank}. {product_name}: {unit_total:,} units")
print("\n")

# -----------------------------------------------------------------
# 3. EXPORT RESULTS
# -----------------------------------------------------------------
print("3. EXPORTING RESULTS", "-" * 40, sep="\n")

# Headline KPIs as a two-column table of labels and display-formatted values.
kpi_labels = ['Total Revenue', 'Average Units per Order', 'Top Sales Rep', 'Top Product']
kpi_values = [
    f"R {total_revenue:,.2f}",
    f"{avg_units_sold:.2f}",
    top_rep,
    top_3_products.index[0],
]
kpi_summary = pd.DataFrame({'KPI': kpi_labels, 'Value': kpi_values})

# Write the summary and the two grouped series as CSV files in the current
# working directory; the grouped series keep their index as the first column.
kpi_summary.to_csv('kpi_summary.csv', index=False)
revenue_by_region.to_frame().to_csv('revenue_by_region.csv')
units_by_product.to_frame().to_csv('units_by_product.csv')

print("✓ Files exported successfully!", "", "=" * 60, "ANALYSIS COMPLETED!", "=" * 60, sep="\n")

DATA ENGINEER - DATA CLEANING PROCESS


1. IMPORTING DATASET...
----------------------------------------
Dataset imported successfully!
Shape of dataset: (100, 8)
Number of rows: 100
Number of columns: 8

First 5 rows of the dataset:
  Order_ID              Product         Region  Units_Sold  Unit_Price  \
0   ORD001              Printer        Limpopo          45        2985   
1   ORD002           Headphones   Western Cape          16       15076   
2   ORD003               Laptop   Western Cape          45       14860   
3   ORD004  External Hard Drive  KwaZulu-Natal          21       16237   
4   ORD005           Smartphone   Western Cape          41        9420   

   Revenue Sales_Rep  Order_Date  
0   134325     Rep-2  2024-03-28  
1   241216    Rep-18  2024-04-11  
2   668700    Rep-16  2024-05-18  
3   340977     Rep-3  2024-05-16  
4   386220    Rep-17  2024-02-21  


2. INITIAL DATA EXPLORATION...
----------------------------------------
Column Information:
<class 'pandas.co