# In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------------------------------------
# Step 1: Import Libraries
# -------------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Never truncate the column list when a DataFrame is printed or displayed.
pd.options.display.max_columns = None
# -------------------------------------------------------------
# Step 2: Load Dataset
# -------------------------------------------------------------
# Location of the raw CSV; adjust this if the file lives elsewhere.
file_path = 'dirty_cafe_sales.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# -------------------------------------------------------------
# Step 3: Dataset Overview
# -------------------------------------------------------------
# `display` is only injected into the namespace by IPython/Jupyter. Fall back
# to `print` so this section also runs as a plain Python script instead of
# raising NameError.
try:
    display
except NameError:
    display = print

# Structural summary: size, column names, dtypes, and basic quality counts.
print("===== Dataset Overview =====")
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nData Types:\n", df.dtypes)
print("\nMissing Values:\n", df.isnull().sum())
print("\nDuplicate Rows:", df.duplicated().sum())
print("\nUnique Values per Column:\n", df.nunique())

# Peek at both ends of the table.
print("\n===== First 5 Rows =====")
display(df.head())
print("\n===== Last 5 Rows =====")
display(df.tail())

# -------------------------------------------------------------
# Step 4: Data Quality Checks
# -------------------------------------------------------------
print("\n===== Data Quality Checks =====")
# List the distinct raw values of every object-typed column so placeholder
# tokens such as 'UNKNOWN' or 'ERROR' can be spotted by eye.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for col in object_columns:
    print(f"\nUnique values in {col}:")
    print(df[col].unique())

# -------------------------------------------------------------
# Step 5: Data Cleaning
# -------------------------------------------------------------
# Turn textual placeholder tokens into real missing values so the numeric
# conversion and the imputation below both see them as NaN.
placeholders = ['UNKNOWN', 'ERROR', 'NaN', 'None', ' ']
df.replace(placeholders, np.nan, inplace=True)

# Convert numeric columns; values that cannot be parsed become NaN
# (errors='coerce') and are imputed in the next step.
for col in ['Quantity', 'Price Per Unit', 'Total Spent']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Handle missing values: median for numeric columns, mode for everything else.
# NOTE(review): the mode branch applies to every non-numeric column, including
# any identifier- or date-like ones the dataset may contain — confirm that is
# intended.
for col in df.columns:
    column = df[col]
    if not column.isna().any():
        continue  # nothing to impute in this column

    is_numeric = (
        pd.api.types.is_numeric_dtype(column)
        and not pd.api.types.is_bool_dtype(column)
    )
    if is_numeric:
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to impute with.
            continue
        fill_value = modes.iloc[0]

    # Assign the result back instead of calling fillna(inplace=True) on
    # df[col]: the chained in-place form operates on a temporary object and
    # leaves df unchanged under pandas Copy-on-Write.
    df[col] = column.fillna(fill_value)

# Remove exact duplicate rows, if any.
df.drop_duplicates(inplace=True)

# -------------------------------------------------------------
# Step 6: Descriptive Statistics
# -------------------------------------------------------------
print("\n===== Descriptive Statistics =====")
# include='all' summarises non-numeric columns alongside the numeric ones.
summary_table = df.describe(include='all')
print(summary_table)

# -------------------------------------------------------------
# Step 7: Outlier Detection
# -------------------------------------------------------------
plt.figure(figsize=(12, 5))
# The three numeric measures inspected for outliers, selected once and reused.
numeric_cols = ['Quantity', 'Price Per Unit', 'Total Spent']
numeric_view = df[numeric_cols]
sns.boxplot(data=numeric_view)
plt.title('Boxplot for Outlier Detection')
plt.show()

# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column.
Q1 = numeric_view.quantile(0.25)
Q3 = numeric_view.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
below_fence = numeric_view < lower_fence
above_fence = numeric_view > upper_fence
outliers = (below_fence | above_fence).sum()
print("\nOutliers detected per column:\n", outliers)

# -------------------------------------------------------------
# Step 8: Data Visualization
# -------------------------------------------------------------
# Univariate Analysis: one histogram (with KDE overlay) per numeric measure,
# laid out side by side in a single row.
plt.figure(figsize=(14, 5))
histogram_panels = [
    ('Quantity', 'Quantity Distribution'),
    ('Price Per Unit', 'Price Per Unit Distribution'),
    ('Total Spent', 'Total Spent Distribution'),
]
for position, (column_name, panel_title) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 3, position)
    sns.histplot(df[column_name], kde=True)
    plt.title(panel_title)
plt.show()

# Bivariate Analysis: unit price against total spent, coloured by item.
plt.figure(figsize=(6, 5))
scatter_ax = sns.scatterplot(
    data=df,
    x='Price Per Unit',
    y='Total Spent',
    hue='Item',
)
scatter_ax.set_title('Price vs Total Spent by Item')
plt.show()

# Correlation Matrix: pairwise correlations of the numeric columns only.
plt.figure(figsize=(8, 6))
correlations = df.corr(numeric_only=True)
heatmap_ax = sns.heatmap(correlations, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# -------------------------------------------------------------
# Step 9: Insights & Interpretation
# -------------------------------------------------------------
print("\n===== Insights & Interpretation =====")

# Five most frequently occurring items.
print("1️⃣ Most frequent items sold:")
item_counts = df['Item'].value_counts()
print(item_counts.head())

# Frequency of each payment method.
print("\n2️⃣ Popular payment methods:")
payment_counts = df['Payment Method'].value_counts()
print(payment_counts)

# Mean total spent per item, highest first.
print("\n3️⃣ Average sales performance:")
mean_spent_by_item = df.groupby('Item')['Total Spent'].mean()
print(mean_spent_by_item.sort_values(ascending=False))

print("\n✅ EDA Completed Successfully!")