In [None]:
#import libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')

# Read the cleaned sales export and convert order_date to a datetime column
# (later cells use the .dt accessor on it).
df = pd.read_csv('amazon_sales_cleaned.csv').assign(
    order_date=lambda frame: pd.to_datetime(frame['order_date'])
)


# 1. Descriptive Statistics
# ------------------------------
# Summary table as produced by DataFrame.describe() with default arguments.
print("Descriptive statistics for numeric columns:\n")
display(df.describe())

In [None]:
# 2. Temporal Trends 

# Daily total revenue: one row per distinct order_date value, holding the
# summed total_revenue for that date.
daily_revenue = (
    df.groupby('order_date')['total_revenue']
    .sum()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(daily_revenue['order_date'], daily_revenue['total_revenue'])
ax.set_title('Daily Total Revenue Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Revenue ($)')
ax.grid(True)
plt.show()

# Monthly aggregated sales by category
# Tag every row with its calendar month; this adds a 'year_month' column to df.
df['year_month'] = df['order_date'].dt.to_period('M')

# Sum revenue per (month, category), then pivot categories into columns.
# Month/category pairs with no rows are shown as 0 rather than NaN.
revenue_by_month_and_category = (
    df.groupby(['year_month', 'product_category'])['total_revenue'].sum()
)
monthly_category = revenue_by_month_and_category.unstack().fillna(0)

# DataFrame.plot creates its own figure and returns the axes it drew on.
ax = monthly_category.plot(figsize=(14, 6), marker='o')
ax.set_title('Monthly Revenue by Product Category')
ax.set_ylabel('Revenue')
ax.legend(title='Category')
plt.show()

# Day-of-week analysis: mean total_revenue per row for each weekday code.
#
# The tick labels below are assigned by position (0 -> 'Mon', ..., 6 -> 'Sun'),
# whereas a pandas bar chart positions bars by their rank in the index.
# Reindexing onto the full 0..6 range keeps position == weekday code, so a
# weekday with no rows becomes an empty slot (NaN) instead of shifting every
# later bar under the wrong label.
# NOTE(review): assumes day_of_week is coded 0=Monday ... 6=Sunday (the
# pandas dt.dayofweek convention) -- confirm against the cleaning step.
dow_revenue = (
    df.groupby('day_of_week')['total_revenue']
    .mean()
    .reindex(range(7))
    .rename_axis('day_of_week')  # keep the index name used for the x-axis label
)
plt.figure(figsize=(8,4))
dow_revenue.plot(kind='bar')
plt.xticks(ticks=range(7), labels=['Mon','Tue','Wed','Thu','Fri','Sat','Sun'])
plt.title('Average Revenue by Day of Week')
plt.ylabel('Avg Revenue ($)')
plt.show()



In [None]:
# 3. Regional Comparisons

# Per-region totals (revenue, quantity) and means (rating, discount), rounded
# to 2 decimals and ordered by total revenue, largest first.
region_stats = (
    df.groupby('customer_region')
    .agg(
        total_revenue=('total_revenue', 'sum'),
        quantity_sold=('quantity_sold', 'sum'),
        rating=('rating', 'mean'),
        discount_percent=('discount_percent', 'mean'),
    )
    .round(2)
    .sort_values('total_revenue', ascending=False)
)
print("\nRegional Sales Summary:\n", region_stats)

# Revenue share by region (unrounded sums, in groupby order)
revenue_by_region = df.groupby('customer_region')['total_revenue'].sum()
fig, ax = plt.subplots(figsize=(8, 8))
revenue_by_region.plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Revenue Share by Region')
ax.set_ylabel('')  # blank out the y-axis label on the pie
plt.show()


In [None]:
# 4. Correlation Analysis

# Columns that take part in the pairwise correlation matrix
numeric_cols = [
    'price',
    'discount_percent',
    'quantity_sold',
    'rating',
    'review_count',
    'discounted_price',
    'total_revenue',
    'year',
    'month',
    'day_of_week',
]
corr = df.loc[:, numeric_cols].corr()

# Annotated heatmap; colour scale pinned to [-1, 1]
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

# Observations: total_revenue strongly correlated with quantity_sold and price; rating weakly correlated with others.




In [None]:
# 5. Dimensionality Reduction (PCA)

# Standardise the numeric features before PCA
pca_features = ['price', 'discount_percent', 'quantity_sold', 'rating', 'review_count']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[pca_features])

# PCA() is called without n_components, so no component limit is requested
pca = PCA()
pca_result = pca.fit_transform(scaled_features)

# Explained variance: cumulative share against number of components
variance_ratio = pca.explained_variance_ratio_
component_counts = np.arange(1, len(variance_ratio) + 1)
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(component_counts, np.cumsum(variance_ratio), marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Explained Variance')
ax.grid(True)
plt.show()
print("Explained variance ratio per PC:", variance_ratio)

# First two PCs: stored on df as 'PC1' / 'PC2' and plotted, coloured by category
df['PC1'], df['PC2'] = pca_result[:, 0], pca_result[:, 1]
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='product_category', alpha=0.6, ax=ax)
ax.set_title('PCA Projection (First 2 Components)')
plt.show()


In [None]:
# 6. Clustering Analysis (K-Means)

# Use scaled features for clustering.
# Fit one model per candidate k (2 through 9) and record two diagnostics:
# inertia for the elbow plot and the silhouette score.
K_range = range(2,10)
inertia, silhouette_scores = [], []
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit(scaled_features).labels_
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(scaled_features, labels))

# Side-by-side diagnostics; each panel is (values, y-axis label, title)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [
    (inertia, 'Inertia', 'Elbow Method'),
    (silhouette_scores, 'Silhouette Score', 'Silhouette Analysis'),
]
for ax, (values, y_label, panel_title) in zip(axes, panels):
    ax.plot(K_range, values, marker='o')
    ax.set_xlabel('Number of clusters')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
plt.show()
# Choose k=3 (example) based on elbow and silhouette.
final_k = 3
kmeans = KMeans(n_clusters=final_k, random_state=42, n_init=10)
# Store each row's cluster label on df
df['cluster'] = kmeans.fit(scaled_features).labels_

# Profile clusters: per-cluster mean of the columns listed below
profile_columns = ['price', 'discount_percent', 'quantity_sold', 'rating', 'total_revenue']
cluster_profile = df.groupby('cluster')[profile_columns].mean()
print("\nCluster Profiles:\n", cluster_profile)

# Business interpretation: e.g., cluster 0 = high-price high-revenue, cluster 1 = low-price low-revenue, etc.

