In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
from collections import defaultdict
from datetime import datetime

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot styling shared by all figures in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Passing None lifts pandas' display truncation for the given option.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

print("Libraries imported successfully!")
print("=" * 80)

Libraries imported successfully!


In [3]:
# Count the null entries in every column of the consolidated frame.
print("Missing Values:")
missing_per_column = df_consolidated.isnull().sum()
print(missing_per_column)
print("\n" + "=" * 80)

Missing Values:
product                                 0
time                                    0
avg.sku_price(₱)                  3497470
sold/day                          1826451
revenue/day(₱)                    1826451
sold/m                            1826451
product_sales_rate(%)             1826451
price(₱)                          3497470
sku                               3497470
sold                              6879663
sold/month(₱)                     1826451
revenue/month                     1826451
new_ratings                       1826451
ratings                           3497470
ratings_rate                      6879718
likes                             3497470
rating_star                       3497470
new_likes                         1826451
second-level_category                   0
third-level_category                    0
fourth-level_category                   0
fifth-level_category                    0
id                                      0
top-level_category

In [4]:
# Read the category/season table from disk.
seasons_path = '/Users/georcelle/ShopeeAnalysis_Amalgam/ShopeeAnalysis_Amalgam/HYBRID MODEL/hybrid_sarima_lstm_categories_unique_with_seasons.csv'
df_seasons = pd.read_csv(seasons_path)

# Keep one row per (top-level, second-level) pair whose season is 'Others'.
pair_columns = ['top_level_category', 'second_level_category']
is_others_season = df_seasons['season'] == 'Others'
others_categories = df_seasons.loc[is_others_season, pair_columns].drop_duplicates()
others_categories['category_pair'] = (
    others_categories['top_level_category'] + '|' + others_categories['second_level_category']
)

print(f"Total categories with 'Others' season: {len(others_categories)}")
print("\nCategories to reassign:")
print(others_categories[pair_columns].to_string())

# Plain list of (top_level, second_level) tuples for later filtering.
target_pairs = list(zip(
    others_categories['top_level_category'],
    others_categories['second_level_category'],
))
print(f"\n{len(target_pairs)} category pairs identified for seasonal reassignment")
print("=" * 80)

Total categories with 'Others' season: 73

Categories to reassign:
               top_level_category                    second_level_category
0                           Audio                      Amplifiers & Mixers
5                           Audio                            Media Players
6                           Audio                              Microphones
7                           Audio                                   Others
12                    Automobiles                   Automobile Spare Parts
13                    Automobiles                          Automotive Care
14                    Automobiles        Automotive Keychains & Key Covers
19            Baby & Kids Fashion                             Baby Clothes
20            Baby & Kids Fashion                  Baby Mittens & Footwear
21            Baby & Kids Fashion                              Boy Clothes
22            Baby & Kids Fashion                                Boy Shoes
23            Baby & Kids Fashion

In [5]:
# Make sure the frame has a datetime 'date' column, whatever the source column is called.
existing_columns = df_consolidated.columns
if 'date' in existing_columns:
    df_consolidated['date'] = pd.to_datetime(df_consolidated['date'])
    print("Date column found and converted to datetime")
elif 'Date' in existing_columns:
    parsed_dates = pd.to_datetime(df_consolidated['Date'])
    df_consolidated['Date'] = parsed_dates
    df_consolidated['date'] = parsed_dates
    print("Date column found (uppercase) and converted")
else:
    # Fall back to the first column whose name mentions 'date' or 'time'.
    date_cols = []
    for column_name in existing_columns:
        lowered_name = column_name.lower()
        if 'date' in lowered_name or 'time' in lowered_name:
            date_cols.append(column_name)
    print(f"Available date-related columns: {date_cols}")
    if date_cols:
        date_col = date_cols[0]
        df_consolidated['date'] = pd.to_datetime(df_consolidated[date_col])
        print(f"Using {date_col} as date column")

# Derive the calendar fields used by the monthly analysis.
if 'date' not in df_consolidated.columns:
    print("WARNING: No date column found in the dataset")
else:
    date_values = df_consolidated['date']
    df_consolidated['month'] = date_values.dt.month
    df_consolidated['year'] = date_values.dt.year
    df_consolidated['month_name'] = date_values.dt.strftime('%B')
    print(f"Date range: {date_values.min()} to {date_values.max()}")
    print(f"Years in dataset: {sorted(df_consolidated['year'].unique())}")

print("=" * 80)

Available date-related columns: ['time', 'listing_time']
Using time as date column
Using time as date column
Date range: 2022-03-01 00:00:00 to 2025-11-01 00:00:00
Years in dataset: [np.int32(2022), np.int32(2023), np.int32(2024), np.int32(2025)]
Date range: 2022-03-01 00:00:00 to 2025-11-01 00:00:00
Years in dataset: [np.int32(2022), np.int32(2023), np.int32(2024), np.int32(2025)]


In [7]:
# Columns whose names look like a sales measure. They are printed for
# information and used only as a last-resort fallback below.
sales_keywords = ('sales', 'qty', 'quantity', 'amount')
sales_columns = [
    col for col in df_consolidated.columns
    if any(keyword in col.lower() for keyword in sales_keywords)
]
print(f"Available sales metrics: {sales_columns}")

# Pick the first preferred metric that exists in the frame; otherwise fall back
# to the first keyword match, and finally to the literal name 'sold'.
preferred_metrics = ['sold/month(₱)', 'revenue/month', 'sold/day', 'sold']
sales_metric = next(
    (metric for metric in preferred_metrics if metric in df_consolidated.columns),
    sales_columns[0] if sales_columns else 'sold',
)
print(f"Using '{sales_metric}' as sales metric")

# The consolidated file spells these columns with a hyphen; add underscore
# aliases so they line up with the names used in target_pairs.
df_consolidated['top_level_category'] = df_consolidated['top-level_category']
df_consolidated['second_level_category'] = df_consolidated['second-level_category']

# Monthly sales per target category, keyed by "top_level|second_level".
# Each value is a Series of summed sales indexed by month number.
monthly_sales_data = {}

missing_columns = {'month', sales_metric} - set(df_consolidated.columns)
if missing_columns:
    # Without both columns nothing can be aggregated; say so instead of
    # silently producing an empty result.
    print(f"WARNING: cannot aggregate monthly sales; missing columns: {sorted(missing_columns)}")
else:
    # NOTE(review): sales are summed per calendar month across all years. If the
    # data does not span whole years (the printed range is 2022-03 to 2025-11),
    # some months are represented in more years than others, which inflates
    # their share. Confirm whether a per-year average is intended.
    pair_levels = ['top_level_category', 'second_level_category']

    # One pass over the frame instead of one boolean filter + copy per pair.
    pair_month_totals = (
        df_consolidated
        .groupby(pair_levels + ['month'])[sales_metric]
        .sum()
    )
    totals_by_pair = {
        pair: totals.droplevel(pair_levels)
        for pair, totals in pair_month_totals.groupby(level=pair_levels)
    }

    # Walk target_pairs so the dict keeps the same order as before; pairs with
    # no rows in the consolidated data are skipped.
    for top_level, second_level in target_pairs:
        monthly_data = totals_by_pair.get((top_level, second_level))
        if monthly_data is None:
            continue
        category_key = f"{top_level}|{second_level}"
        monthly_sales_data[category_key] = monthly_data.sort_index()

print(f"Calculated monthly data for {len(monthly_sales_data)} categories")
print("=" * 80)

Available sales metrics: ['product_sales_rate(%)']
Using 'sold/month(₱)' as sales metric
Calculated monthly data for 73 categories
Calculated monthly data for 73 categories


In [8]:
# Month numbers that make up each season. Insertion order is kept as-is
# because the season matcher iterates this dict in order.
seasons_definition = {
    'Christmas Season': [11, 12],                # Nov, Dec
    'Halloween / Undas Season': [10, 11],        # Oct, Nov
    'Rainy Season': [6, 7, 8, 9, 10, 11],        # Jun-Nov
    'Summer Season': [3, 4, 5],                  # Mar-May
    'Back-to-School Season': [6, 7, 8, 9],       # Jun-Sep
    'Holy Week / Lent Season': [3, 4],           # Mar, Apr
    "Valentine's Season": [2],                   # Feb
}

# Month number -> English month name.
month_names = dict(enumerate(
    [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ],
    start=1,
))

# For every category, express each month as a share of total sales and
# record the three months with the largest share.
category_peak_analysis = {}

for category_key, monthly_data in monthly_sales_data.items():
    total_sales = monthly_data.sum()

    # Skip categories with no months or no sales at all (an empty Series sums to 0).
    if len(monthly_data) == 0 or total_sales == 0:
        continue

    percentage = (monthly_data / total_sales * 100).round(2)
    peak_month_numbers = percentage.nlargest(3).index.tolist()

    category_peak_analysis[category_key] = {
        'peak_months': peak_month_numbers,
        'peak_month_names': [month_names[number] for number in peak_month_numbers],
        'percentages': percentage.to_dict(),
        'monthly_sales': monthly_data.to_dict(),
        'total_sales': total_sales,
    }

print(f"Peak analysis completed for {len(category_peak_analysis)} categories")
print("\nSample peak months analysis:")
sample_entries = list(category_peak_analysis.items())[:5]
for cat, data in sample_entries:
    print(f"\n{cat}:")
    print(f"  Peak months: {data['peak_month_names']} (months: {data['peak_months']})")
    print(f"  Percentages: {data['percentages']}")
print("=" * 80)

Peak analysis completed for 73 categories

Sample peak months analysis:

Audio|Amplifiers & Mixers:
  Peak months: ['November', 'October', 'September'] (months: [11, 10, 9])
  Percentages: {1: 5.27, 2: 6.22, 3: 7.12, 4: 7.11, 5: 7.11, 6: 7.11, 7: 7.68, 8: 8.6, 9: 11.22, 10: 13.27, 11: 14.02, 12: 5.27}

Audio|Media Players:
  Peak months: ['November', 'October', 'September'] (months: [11, 10, 9])
  Percentages: {1: 6.89, 2: 6.92, 3: 7.23, 4: 8.13, 5: 8.49, 6: 8.51, 7: 8.57, 8: 8.92, 9: 9.39, 10: 9.77, 11: 10.36, 12: 6.83}

Audio|Microphones:
  Peak months: ['November', 'October', 'September'] (months: [11, 10, 9])
  Percentages: {1: 5.89, 2: 5.91, 3: 6.49, 4: 6.63, 5: 8.01, 6: 8.49, 7: 8.46, 8: 9.55, 9: 10.64, 10: 11.67, 11: 12.37, 12: 5.89}

Audio|Others:
  Peak months: ['November', 'October', 'September'] (months: [11, 10, 9])
  Percentages: {1: 7.11, 2: 7.11, 3: 7.59, 4: 7.6, 5: 7.92, 6: 8.5, 7: 8.57, 8: 8.88, 9: 9.48, 10: 9.94, 11: 10.19, 12: 7.1}

Automobiles|Automobile Spare Parts

In [None]:
def find_best_season_match(peak_months, seasons_definition):
    """
    Pick the season that best matches a category's peak months.

    Seasons are ranked by how many peak months they contain. When several
    seasons contain the same number of peak months, the one whose own months
    are covered most completely wins (overlap / season length), so a short,
    specific season beats a long one that merely happens to include the same
    months. Any remaining tie goes to the season listed first in
    ``seasons_definition``.

    Args:
        peak_months: Iterable of month numbers (1-12) with the highest sales.
        seasons_definition: Mapping of season name -> iterable of month numbers.

    Returns:
        Tuple ``(season_name, overlap)`` where ``overlap`` is the number of
        peak months inside the chosen season, or ``(None, 0)`` when no season
        contains any peak month (including when ``peak_months`` is empty).
    """
    peak_set = set(peak_months)

    best_season = None
    best_overlap = 0
    best_coverage = 0.0

    for season_name, season_months in seasons_definition.items():
        season_set = set(season_months)
        overlap = len(peak_set & season_set)
        if overlap == 0:
            continue

        # Fraction of the season's own months that are peak months. Unlike
        # overlap / len(peak_months), this differs between seasons and can
        # therefore separate candidates with equal overlap.
        coverage = overlap / len(season_set)

        # Strict comparison keeps the earlier season on an exact tie.
        if (overlap, coverage) > (best_overlap, best_coverage):
            best_season = season_name
            best_overlap = overlap
            best_coverage = coverage

    return best_season, best_overlap

# Assign a season to every analysed category and keep the supporting details.
season_assignment_mapping = {}
assignment_details = {}

for category_key, peak_data in category_peak_analysis.items():
    peak_months = peak_data['peak_months']
    peak_names = peak_data['peak_month_names']

    matched_season, overlap = find_best_season_match(peak_months, seasons_definition)

    # When no season contains any peak month, fall back to a fixed default.
    assigned_season = matched_season if matched_season is not None else 'Rainy Season'

    season_assignment_mapping[category_key] = assigned_season
    assignment_details[category_key] = dict(
        peak_months=peak_months,
        peak_month_names=peak_names,
        assigned_season=assigned_season,
        monthly_percentages=peak_data['percentages'],
        total_sales=peak_data['total_sales'],
    )

print(f"Assigned {len(season_assignment_mapping)} categories to seasons\n")
print("Sample assignments:")
sample_assignments = list(season_assignment_mapping.items())[:10]
for cat, season in sample_assignments:
    details = assignment_details[cat]
    print(f"\n{cat}")
    print(f"  Peak Months: {details['peak_month_names']}")
    print(f"  Assigned Season: {season}")
    print(f"  Monthly Sales Distribution:")
    # Only the first three months in dict order are shown, not the top three.
    first_three_months = list(details['monthly_percentages'].items())[:3]
    for month_num, percentage in first_three_months:
        print(f"    {month_names[month_num]}: {percentage}%")
print("\n" + "=" * 80)

In [None]:
# Alphabetically ordered copy of the category -> season mapping.
final_season_mapping = {
    category: season_assignment_mapping[category]
    for category in sorted(season_assignment_mapping)
}

print("FINAL SEASON ASSIGNMENT MAPPING")
print("=" * 100)
print("\nFormat: 'Top Level Category|Second Level Category': 'Assigned Season'\n")

# Print one dictionary-literal style line per category.
for category, season in final_season_mapping.items():
    print(f'"{category}": "{season}",')

print("\n" + "=" * 100)
print(f"\nTotal categories reassigned: {len(final_season_mapping)}")

# Tally how many categories ended up in each season.
season_tally = defaultdict(int)
for season in final_season_mapping.values():
    season_tally[season] += 1
season_counts = dict(season_tally)

print("\nDistribution by Assigned Season:")
# Largest count first; equal counts keep their first-seen order.
for season, count in sorted(season_counts.items(), key=lambda item: -item[1]):
    print(f"  {season}: {count} categories")

In [None]:
# Build one report record per category from the final mapping and the
# per-category assignment details.
detailed_report = []

for category, season in final_season_mapping.items():
    details = assignment_details[category]
    # Keys have the form "top_level|second_level". partition() splits on the
    # first '|' only, so a '|' inside the second-level name cannot break the
    # two-way unpacking the way split('|') would.
    top_level, _, second_level = category.partition('|')

    detailed_report.append({
        'top_level_category': top_level,
        'second_level_category': second_level,
        'category_pair': category,
        'assigned_season': season,
        'peak_months': details['peak_month_names'],
        'peak_month_numbers': details['peak_months'],
        'monthly_percentages': {month_names[m]: pct for m, pct in details['monthly_percentages'].items()},
        # Cast to a plain float so json.dump can serialise it.
        'total_sales': float(details['total_sales'])
    })

# Tabular view of the same records.
report_df = pd.DataFrame(detailed_report)

print("Detailed Assignment Report (First 10 entries):")
print(report_df[['category_pair', 'assigned_season', 'peak_months']].head(10).to_string())

# All three exports go to the same directory; define it once.
output_dir = '/Users/georcelle/ShopeeAnalysis_Amalgam/ShopeeAnalysis_Amalgam'

# Export the plain category -> season mapping as JSON.
json_output_path = f'{output_dir}/season_assignment_mapping.json'
with open(json_output_path, 'w', encoding='utf-8') as f:
    json.dump(final_season_mapping, f, indent=2)
print(f"\n✓ Exported season assignment mapping to: {json_output_path}")

# Export the detailed per-category records as JSON.
detailed_report_path = f'{output_dir}/season_assignment_detailed_report.json'
with open(detailed_report_path, 'w', encoding='utf-8') as f:
    json.dump(detailed_report, f, indent=2)
print(f"✓ Exported detailed report to: {detailed_report_path}")

# Export the tabular report as CSV.
csv_output_path = f'{output_dir}/season_assignment_report.csv'
report_df.to_csv(csv_output_path, index=False)
print(f"✓ Exported report to CSV: {csv_output_path}")

print("\n" + "=" * 100)

In [None]:
# Headline numbers for the whole reassignment run.
category_total = len(final_season_mapping)
overall_sales = sum(assignment_details[cat]['total_sales'] for cat in final_season_mapping)

print("COMPREHENSIVE SUMMARY")
print("=" * 100)
print(f"\nTotal Categories Analyzed: {category_total}")
print(f"Total Sales Metric (units): {overall_sales:.0f}")

print("\n\nSEASON DISTRIBUTION:")
print("-" * 100)
# Every defined season is listed, including those with no categories.
assigned_seasons = list(final_season_mapping.values())
for season in sorted(seasons_definition.keys()):
    count = assigned_seasons.count(season)
    if category_total > 0:
        pct = count / category_total * 100
    else:
        pct = 0
    print(f"{season:40} : {count:3d} categories ({pct:5.1f}%)")

print("\n\nTOP CATEGORIES BY TOTAL SALES:")
print("-" * 100)
# Ten categories with the highest total sales, largest first.
ranked_categories = sorted(
    final_season_mapping,
    key=lambda cat: assignment_details[cat]['total_sales'],
    reverse=True,
)
top_categories = [
    (cat, assignment_details[cat]['total_sales']) for cat in ranked_categories[:10]
]

for i, (cat, sales) in enumerate(top_categories, 1):
    season = final_season_mapping[cat]
    print(f"{i:2d}. {cat:60} | {season:30} | Sales: {sales:12,.0f}")

print("\n" + "=" * 100)

## Section 8: Export Results and Create Detailed Report

## Section 7: Generate and Validate Season Assignment Dictionary

## Section 6: Map Categories to Seasons Based on Demand Patterns

## Section 5: Identify Peak Demand Months

## Section 4: Calculate Monthly Sales by Category Pair

## Section 3: Extract Date Information and Prepare for Monthly Analysis

## Section 2: Load Categories with 'Others' Season

In [2]:
# Read the consolidated product dataset from disk.
consolidated_path = '/Users/georcelle/ShopeeAnalysis_Amalgam/ShopeeAnalysis_Amalgam/HYBRID MODEL/consolidated_file_cleaned_v2.csv'
df_consolidated = pd.read_csv(consolidated_path)

# Quick overview: dimensions first, then per-column dtypes.
frame_shape = df_consolidated.shape
column_types = df_consolidated.dtypes

print("Consolidated Dataset Information:")
print(f"Shape: {frame_shape}")
print("\nColumn Names and Types:")
print(column_types)
print("\nFirst few rows:")
# Left as the last expression so the notebook renders the preview table.
df_consolidated.head()

Consolidated Dataset Information:
Shape: (7554662, 28)

Column Names and Types:
product                            object
time                               object
avg.sku_price(₱)                  float64
sold/day                          float64
revenue/day(₱)                    float64
sold/m                            float64
product_sales_rate(%)             float64
price(₱)                          float64
sku                               float64
sold                              float64
sold/month(₱)                     float64
revenue/month                     float64
new_ratings                       float64
ratings                           float64
ratings_rate                      float64
likes                             float64
rating_star                       float64
new_likes                         float64
second-level_category              object
third-level_category               object
fourth-level_category              object
fifth-level_category               obj

Unnamed: 0,product,time,avg.sku_price(₱),sold/day,revenue/day(₱),sold/m,product_sales_rate(%),price(₱),sku,sold,sold/month(₱),revenue/month,new_ratings,ratings,ratings_rate,likes,rating_star,new_likes,second-level_category,third-level_category,fourth-level_category,fifth-level_category,id,top-level_category,seller_from,listing_time,active_months,suitable_for_seasonal_analysis
0,Cute Different Designs button accessories ...,2022-03-01,,,,,,,,,,,,,,,,,Additional Accessories,"Charms, Pendants & Ornaments",-,-,17287885303,Fashion Accessories,Overseas,2022-10-18,37,True
1,Cute Different Designs button accessories ...,2022-04-01,,,,,,,,,,,,,,,,,Additional Accessories,"Charms, Pendants & Ornaments",-,-,17287885303,Fashion Accessories,Overseas,2022-10-18,37,True
2,Cute Different Designs button accessories ...,2022-05-01,,,,,,,,,,,,,,,,,Additional Accessories,"Charms, Pendants & Ornaments",-,-,17287885303,Fashion Accessories,Overseas,2022-10-18,37,True
3,Cute Different Designs button accessories ...,2022-06-01,,,,,,,,,,,,,,,,,Additional Accessories,"Charms, Pendants & Ornaments",-,-,17287885303,Fashion Accessories,Overseas,2022-10-18,37,True
4,Cute Different Designs button accessories ...,2022-07-01,,,,,,,,,,,,,,,,,Additional Accessories,"Charms, Pendants & Ornaments",-,-,17287885303,Fashion Accessories,Overseas,2022-10-18,37,True


## Section 1: Load and Explore Consolidated Dataset

# Seasonal Demand Analysis for Shopee Categories
This notebook analyzes `consolidated_file_cleaned_v2.csv` to determine seasonal demand patterns, and reassigns each category currently labeled with the 'Others' season to the most appropriate season based on its monthly sales peaks.