# Fashion Transactions 2025 - Exploratory Data Analysis (EDA)
This notebook performs basic EDA on the combined dataset (55,000 rows).


In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

In [3]:
# Load the data (you can concatenate all 11 CSVs or load one for testing)
df = pd.read_csv('fashion_transactions_2025.csv')
df.head()

Unnamed: 0,order_id,order_date,customer_id,age,gender,city,category,subcategory,brand,price,quantity,discount,final_price,channel,return_flag,rating
0,ORD00001,20-04-2025,CUST2824,19,Other,Markland,Footwear,Sneakers,Adidas,1643,1,25,1232.25,Online,0,1
1,ORD00002,27-05-2025,CUST1488,23,Male,Port Erinmouth,Clothing,T-Shirts,Adidas,4964,4,5,18863.2,Retail,0,1
2,ORD00003,11-01-2025,CUST3615,45,Female,West Angelicaland,Footwear,Sneakers,Adidas,3257,1,0,3257.0,Retail,1,3
3,ORD00004,06-04-2025,CUST5333,20,Other,Lawrenceside,Footwear,Boots,H&M,3600,1,20,2880.0,Retail,0,5
4,ORD00005,20-04-2025,CUST6925,54,Male,Emilymouth,Accessories,Bags,Zara,2366,3,0,7098.0,Online,0,4


In [5]:
df.shape

(55000, 16)

In [None]:
# Check basic info and missing values
df.info()
df.isnull().sum()

In [None]:
# Summary statistics
df.describe(include='all')

In [None]:
# Distribution of numeric columns
df[['price', 'quantity', 'discount', 'final_price', 'rating']].hist(bins=20, figsize=(12, 8))
plt.tight_layout()

In [None]:
# Category distribution
sns.countplot(data=df, x='category')
plt.xticks(rotation=45)
plt.title('Category Distribution')

In [None]:
# Revenue by category
cat_revenue = df.groupby('category')['final_price'].sum().sort_values(ascending=False)
cat_revenue.plot(kind='bar', title='Revenue by Category')
plt.ylabel('Revenue')

In [None]:
# Average rating per brand
avg_rating = df.groupby('brand')['rating'].mean().sort_values(ascending=False)
avg_rating.plot(kind='barh', figsize=(8,10), title='Average Rating by Brand')
plt.xlabel('Average Rating')

In [None]:
# Return rate by category
return_rate = df.groupby('category')['return_flag'].mean().sort_values(ascending=False) * 100
return_rate.plot(kind='bar', title='Return Rate by Category')
plt.ylabel('Return Rate (%)')