In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
import time
import os
from pathlib import Path

# Dataset locations. Forward slashes are used throughout so both paths are
# written the same way.
_DATA_ROOT = "E:/sunday/data_2025/2025"
TRAIN_PATH = f"{_DATA_ROOT}/train"
TEST_PATH = f"{_DATA_ROOT}/test"

# Start-up banner: project name, then the model families compared below.
print(
    "Starting Etsy Product Classification Project",
    "Using Random Forest, SGD Classifier, and Naive Bayes models",
    sep="\n",
)

##############################################
# 1. DATA LOADING WITH MEMORY OPTIMIZATION
##############################################
def load_parquet_files(directory, sample_size=None):
    """
    Load every ``.parquet`` file in *directory* into a single DataFrame.

    Args:
        directory: Folder whose direct entries are scanned for names ending
            in ``.parquet``.
        sample_size: If truthy, load only this many files (or all of them
            when fewer exist), picked with a fixed seed so repeated runs
            select the same files. ``None`` or ``0`` loads everything.

    Returns:
        The row-wise concatenation of all files that could be read, with a
        fresh index. An empty DataFrame when nothing could be read.

    Files that fail to load are reported and skipped rather than aborting
    the whole load.
    """
    # os.listdir gives no ordering guarantee; sorting first is what makes the
    # seeded sample below actually reproducible.
    all_files = sorted(f for f in os.listdir(directory) if f.endswith('.parquet'))

    if sample_size:
        # For development: randomly select a subset of files. A private
        # Random instance keeps the process-wide RNG state untouched.
        import random
        rng = random.Random(42)
        selected_files = rng.sample(all_files, min(sample_size, len(all_files)))
    else:
        selected_files = all_files

    print(f"Loading {len(selected_files)} files from {directory}...")

    # Read the files one at a time and concatenate once at the end
    dataframes = []
    for i, file in enumerate(selected_files, start=1):
        file_path = os.path.join(directory, file)
        try:
            dataframes.append(pd.read_parquet(file_path))
        except Exception as e:
            # Best effort: one unreadable file should not lose the others
            print(f"Error loading {file}: {e}")
            continue
        # Print progress every 5 files
        if i % 5 == 0:
            print(f"Loaded {i}/{len(selected_files)} files")

    if not dataframes:
        return pd.DataFrame()
    return pd.concat(dataframes, ignore_index=True)

# Load datasets with option to use a smaller sample for development
use_sample = False  # Set to True for faster development with a sample
sample_size = 5 if use_sample else None

try:
    train_df = load_parquet_files(TRAIN_PATH, sample_size)
    test_df = load_parquet_files(TEST_PATH, sample_size)
    # load_parquet_files returns an empty frame instead of raising when no
    # file could be read; treat that as a load failure too, otherwise the
    # pipeline only crashes later on a missing column.
    if train_df.empty or test_df.empty:
        raise ValueError("no rows could be loaded from the parquet directories")
except Exception as e:
    print(f"Error loading data: {e}")
    print("\nCreating synthetic data for demonstration purposes...")

    # Create synthetic data for demonstration if files can't be loaded
    np.random.seed(42)

    # Create dummy train data
    n_samples = 5000
    # Draw the ids first so the matching *_text columns can be derived from
    # them instead of being sampled independently.
    top_ids = np.random.choice([0, 5, 6, 8, 13], n_samples, p=[0.1, 0.25, 0.25, 0.3, 0.1])
    bottom_ids = np.random.randint(1, 100, n_samples)
    color_names = ['red', 'blue', 'green', 'yellow', 'black']
    train_df = pd.DataFrame({
        'product_id': [f"prod_{i}" for i in range(n_samples)],
        'title': [f"Product title {i}" for i in range(n_samples)],
        'description': [f"Description for product {i} with details" for i in range(n_samples)],
        'tags': [f"tag1, tag2, tag{i%10}" for i in range(n_samples)],
        'type': np.random.choice(['physical', 'download'], n_samples),
        'room': np.random.choice(['bedroom', 'living room', 'kitchen', 'bathroom'], n_samples),
        'top_category_id': top_ids,
        'top_category_text': [f"category_{i}" for i in top_ids],
        'bottom_category_id': bottom_ids,
        'bottom_category_text': [f"subcategory_{i}" for i in bottom_ids],
        'primary_color_id': np.random.randint(1, 10, n_samples),
        'primary_color_text': np.random.choice(color_names, n_samples),
        'secondary_color_id': np.random.randint(1, 10, n_samples),
        'secondary_color_text': np.random.choice(color_names, n_samples)
    })

    # Create dummy test data (same text/attribute columns, no targets)
    n_test = 1000
    test_df = pd.DataFrame({
        'product_id': [f"test_prod_{i}" for i in range(n_test)],
        'title': [f"Test Product title {i}" for i in range(n_test)],
        'description': [f"Test Description for product {i} with details" for i in range(n_test)],
        'tags': [f"tag1, tag2, tag{i%10}" for i in range(n_test)],
        'type': np.random.choice(['physical', 'download'], n_test),
        'room': np.random.choice(['bedroom', 'living room', 'kitchen', 'bathroom'], n_test)
    })

# Reported once here, whether the frames are real or synthetic
print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)

# Per-column count of null cells in the training frame; only the columns
# that actually contain nulls are printed.
missing_values = train_df.isna().sum()
print("\nMissing values in train data:")
print(missing_values.loc[missing_values.gt(0)])

# Text preprocessing function
def preprocess_text(df):
    """
    Fill missing text/attribute values and add a ``combined_text`` column.

    The frame is modified in place and also returned. NaNs in whichever of
    the known text and categorical columns are present are replaced with
    ``''``. ``combined_text`` is ``title + ' ' + description + ' ' + tags``;
    a text column that is absent from the frame contributes an empty string
    instead of raising ``KeyError``.
    """
    text_cols = ['title', 'description', 'tags']
    cat_cols = ['type', 'room', 'craft_type', 'recipient', 'material', 'occasion',
                'holiday', 'art_subject', 'style', 'shape', 'pattern',
                'primary_color_text', 'secondary_color_text']

    # Fill NaN values first so the text columns are cleaned exactly once
    for col in text_cols + cat_cols:
        if col in df.columns:
            df[col] = df[col].fillna('')

    # Stand-in for any text column the frame does not have
    blank = pd.Series('', index=df.index, dtype=object)
    title, description, tags = (
        df[col] if col in df.columns else blank for col in text_cols
    )
    df['combined_text'] = title + ' ' + description + ' ' + tags

    return df

# Run the text preprocessing over both frames (train first, then test)
print("\n=== TEXT PREPROCESSING ===")
print("Applying text preprocessing...")
train_df, test_df = (preprocess_text(frame) for frame in (train_df, test_df))
print("Text preprocessing completed")


def _report_distribution(frame, column, heading, noun):
    """Print a short summary of *column*'s value counts and return the counts."""
    print(f"\n{heading} category distribution:")
    counts = frame[column].value_counts()
    print(f"Number of unique {noun} categories: {len(counts)}")
    print("Top 5 most common categories: ")
    print(counts.head())
    return counts


# Explore target distributions; the counts are kept for later steps
top_cat_dist = _report_distribution(train_df, 'top_category_id', 'Top', 'top')
bottom_cat_dist = _report_distribution(train_df, 'bottom_category_id', 'Bottom', 'bottom')

# Output folder for every figure saved by this script
Path('visualizations').mkdir(exist_ok=True)

# VISUALIZATION 1: bar chart of the 15 most frequent top-level categories
top_cat_counts = (
    train_df['top_category_id']
    .value_counts()
    .head(15)
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=top_cat_counts.index, y=top_cat_counts.values, ax=ax)
ax.set_title('Distribution of Top Categories')
ax.set_ylabel('Count')
ax.set_xlabel('Top Category ID')
# Vertical tick labels so the category ids do not overlap
plt.setp(ax.get_xticklabels(), rotation=90)
fig.tight_layout()
fig.savefig('visualizations/top_category_distribution.png')
plt.close(fig)

# Feature Engineering
print("\n=== FEATURE ENGINEERING ===")
print("Encoding target variables...")
for level in ('top', 'bottom'):
    n_unique = len(train_df[f'{level}_category_id'].unique())
    print(f"Number of unique {level} categories: {n_unique}")

# Map the raw category ids onto consecutive integer labels; the fitted
# encoders are kept so predictions can be mapped back to ids later.
top_encoder = LabelEncoder().fit(train_df['top_category_id'])
train_df['top_category_encoded'] = top_encoder.transform(train_df['top_category_id'])
bottom_encoder = LabelEncoder().fit(train_df['bottom_category_id'])
train_df['bottom_category_encoded'] = bottom_encoder.transform(train_df['bottom_category_id'])

# Hash the combined text into a fixed-width sparse matrix. The width is kept
# small for speed; alternate_sign=False is set on the vectorizer.
n_features = 2048
print("Creating text features using hashing vectorizer...")
vectorizer = HashingVectorizer(n_features=n_features, alternate_sign=False)

X_train_text = vectorizer.transform(train_df['combined_text'])
print(f"Hashed features shape: {X_train_text.shape}")

# Hold out 10% of the rows for validation, splitting the features and both
# target arrays together so they stay aligned.
split_parts = train_test_split(
    X_train_text,
    train_df['top_category_encoded'].to_numpy(),
    train_df['bottom_category_encoded'].to_numpy(),
    test_size=0.1,
    random_state=42,
)
X_train, X_val, y_train_top, y_val_top, y_train_bottom, y_val_bottom = split_parts

print("\n=== MODEL TRAINING - TOP CATEGORY ===")
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

# VISUALIZATION 2: Dimensionality Reduction for Text Features
print("\nPerforming dimensionality reduction for visualization...")
# Use TruncatedSVD to reduce dimensions for visualization - on a subset of data
# Convert to numpy array first to avoid pandas indexing issues
y_train_top_np = np.array(y_train_top)

# Use only first 2000 samples for visualization for speed
# NOTE: this rebinds the module-level `sample_size` that was used for loading.
sample_size = 2000
X_sample = X_train[:sample_size]
y_sample = y_train_top_np[:sample_size]

svd = TruncatedSVD(n_components=2, random_state=42)
X_2d = svd.fit_transform(X_sample)

plt.figure(figsize=(10, 6))
# Plot only the most common categories for clarity. Rank the labels by how
# often they occur in the sample; slicing the sorted unique labels would pick
# the five smallest label values instead of the five most frequent.
unique_categories, category_counts = np.unique(y_sample, return_counts=True)
most_frequent_first = np.argsort(-category_counts, kind='stable')
top_categories = unique_categories[most_frequent_first[:5]]
for category in top_categories:
    mask = y_sample == category
    plt.scatter(X_2d[mask, 0], X_2d[mask, 1], label=f'Cat {category}', alpha=0.6, s=10)

plt.title('2D Projection of Text Features (Top 5 Categories)')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.legend()
plt.tight_layout()
plt.savefig('visualizations/text_features_2d.png')
plt.close()

# Train and evaluate models with optimized parameters for speed
def _build_models(is_bottom_category):
    """Return a name -> unfitted estimator mapping, configured for speed."""
    if is_bottom_category:
        # Smaller/shallower settings for the bottom-category task
        return {
            'Random Forest': RandomForestClassifier(n_estimators=30, max_depth=8,
                                                    min_samples_split=100, min_samples_leaf=20,
                                                    class_weight='balanced_subsample',
                                                    random_state=42, n_jobs=-1),
            'SGD Classifier': SGDClassifier(loss='log_loss', penalty='l2', alpha=1e-3,
                                            max_iter=3, tol=1e-2, early_stopping=True,
                                            n_iter_no_change=2,
                                            random_state=42, n_jobs=-1),
            'Multinomial Naive Bayes': MultinomialNB(alpha=0.1)
        }
    return {
        'Random Forest': RandomForestClassifier(n_estimators=40, max_depth=10,
                                                min_samples_split=50, min_samples_leaf=10,
                                                random_state=42, n_jobs=-1),
        'SGD Classifier': SGDClassifier(loss='log_loss', penalty='l2', alpha=1e-3,
                                        max_iter=5, early_stopping=True,
                                        random_state=42, n_jobs=-1),
        'Multinomial Naive Bayes': MultinomialNB(alpha=0.1)
    }


def train_and_evaluate_models(X_train, X_val, y_train, y_val, target_name,
                              is_bottom_category=False, time_limit=180):
    """
    Fit each candidate model and score it on the validation split.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        target_name: Label used in the progress messages.
        is_bottom_category: Selects the bottom-category model settings
            instead of the top-category ones.
        time_limit: Training duration in seconds above which a warning is
            printed. Training is never interrupted; this is only a report.

    Returns:
        dict mapping model name to a dict with ``'model'``, ``'f1_score'``
        (weighted F1 on the validation split) and ``'training_time'``
        (seconds). If a model raised, its entry has ``'f1_score'`` 0.0 and
        an extra ``'error'`` message.
    """
    results = {}

    for name, model in _build_models(is_bottom_category).items():
        print(f"\nTraining {name} for {target_name}...")
        # perf_counter is a monotonic clock meant for measuring durations
        start_time = time.perf_counter()

        try:
            model.fit(X_train, y_train)
            train_time = time.perf_counter() - start_time

            # If training takes too long, print warning
            if train_time > time_limit:
                print(f"Warning: {name} training took longer than expected ({train_time:.2f}s)")

            print(f"Predicting with {name}...")
            y_pred = model.predict(X_val)

            # zero_division=0 keeps the score the same as the default but
            # without a warning for classes that were never predicted.
            f1 = f1_score(y_val, y_pred, average='weighted', zero_division=0)

            results[name] = {
                'model': model,
                'f1_score': f1,
                'training_time': train_time
            }

            print(f"{name} F1 Score: {f1:.4f}")
            print(f"Training time: {train_time:.2f} seconds")

        except Exception as e:
            print(f"Error training {name}: {e}")
            # If an error occurs, still add the model to results with zero F1 score
            results[name] = {
                'model': model,
                'f1_score': 0.0,
                'training_time': time.perf_counter() - start_time,
                'error': str(e)
            }

    return results

# Train models for top category
print("\nTraining models for Top Category Classification...")
top_category_results = train_and_evaluate_models(X_train, X_val, y_train_top, y_val_top, "Top Category")

# Train models for bottom category, restricted to the frequent classes
print("\nTraining models for Bottom Category Classification...")
# Convert to pandas Series for value_counts
bottom_class_counts = pd.Series(y_train_bottom).value_counts()

# Use the strictest frequency threshold that still keeps at least one class.
# The last threshold (1) keeps every class that occurs at all, so the list
# is empty afterwards only when there are no training labels.
common_bottom_classes = []
for threshold in [50, 25, 10, 5, 2, 1]:
    common_bottom_classes = bottom_class_counts[bottom_class_counts >= threshold].index.tolist()
    if common_bottom_classes:
        break

# Limit to a reasonable number of classes
if len(common_bottom_classes) > 200:
    common_bottom_classes = bottom_class_counts.nlargest(200).index.tolist()

# Filter data to only include common classes
mask_train = np.isin(y_train_bottom, common_bottom_classes)
mask_val = np.isin(y_val_bottom, common_bottom_classes)

X_train_bottom = X_train[mask_train]
y_train_bottom_filtered = y_train_bottom[mask_train]
X_val_bottom = X_val[mask_val]
y_val_bottom_filtered = y_val_bottom[mask_val]

print(f"Using {len(common_bottom_classes)} common bottom categories")
print(f"Filtered training set shape: {X_train_bottom.shape}")
print(f"Filtered validation set shape: {X_val_bottom.shape}")
# Make the scope of the reported score explicit: rows whose class was
# filtered out are not part of the evaluation.
print(f"Kept {int(mask_train.sum())}/{mask_train.size} training rows and "
      f"{int(mask_val.sum())}/{mask_val.size} validation rows; "
      "bottom-category F1 is measured on the kept rows only")

# Make sure we have at least some data
if X_train_bottom.shape[0] > 0 and X_val_bottom.shape[0] > 0:
    bottom_category_results = train_and_evaluate_models(
        X_train_bottom, X_val_bottom,
        y_train_bottom_filtered, y_val_bottom_filtered,
        "Bottom Category", is_bottom_category=True
    )
else:
    print("ERROR: Insufficient data for bottom category after filtering")
    # Placeholder results with one zero-score entry per model name, so code
    # that compares the top and bottom results model-by-model still lines up.
    placeholder_names = list(top_category_results) or ['Multinomial Naive Bayes']
    bottom_category_results = {
        name: {
            'model': MultinomialNB(),
            'f1_score': 0.0,
            'training_time': 0.0
        }
        for name in placeholder_names
    }

# VISUALIZATION 3: Text Length Distribution
# Character counts for both text fields are stored on the frame; only the
# title lengths are plotted.
for field in ('title', 'description'):
    train_df[f'{field}_length'] = train_df[field].map(len)

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(train_df['title_length'], kde=True, label='Title Length', ax=ax)
ax.set_title('Distribution of Title Lengths')
ax.set_xlabel('Length')
ax.set_ylabel('Count')
ax.legend()
fig.tight_layout()
fig.savefig('visualizations/title_length_distribution.png')
plt.close(fig)

# VISUALIZATION 4: Model Performance Comparison
plt.figure(figsize=(10, 5))
model_names = list(top_category_results.keys())
f1_scores_top = [top_category_results[name]['f1_score'] for name in model_names]
# Look the bottom scores up by model name rather than by position. The two
# result dicts can differ (the bottom one may be a one-entry placeholder),
# and mismatched lengths would make plt.bar fail. A model with no bottom
# result is drawn as 0.
f1_scores_bottom = [
    bottom_category_results.get(name, {}).get('f1_score', 0.0)
    for name in model_names
]

x = np.arange(len(model_names))
width = 0.35

# Side-by-side bars per model: top category on the left, bottom on the right
plt.bar(x - width/2, f1_scores_top, width, label='Top Category')
plt.bar(x + width/2, f1_scores_bottom, width, label='Bottom Category')

plt.xlabel('Model')
plt.ylabel('F1 Score (Weighted)')
plt.title('Model Performance Comparison')
plt.xticks(x, model_names)
plt.legend()
plt.tight_layout()
plt.savefig('visualizations/model_performance_comparison.png')
plt.close()

# Prepare test data with the same vectorizer used for the training text
X_test_text = vectorizer.transform(test_df['combined_text'])

# Pick the best model per target once (highest validation F1). Each entry is
# a (name, result-dict) pair.
best_model_top = max(top_category_results.items(), key=lambda item: item[1]['f1_score'])
print(f"\nBest model for top category: {best_model_top[0]} (F1: {best_model_top[1]['f1_score']:.4f})")

best_model_bottom = max(bottom_category_results.items(), key=lambda item: item[1]['f1_score'])
print(f"Best model for bottom category: {best_model_bottom[0]} (F1: {best_model_bottom[1]['f1_score']:.4f})")

# Make predictions on test data
print("\nMaking predictions on test data...")

# Top category: use the best model, or fall back to a freshly fitted
# MultinomialNB if predicting with it fails.
try:
    y_pred_top = best_model_top[1]['model'].predict(X_test_text)
    test_df['predicted_top_category'] = top_encoder.inverse_transform(y_pred_top)
except Exception as e:
    print(f"Error predicting top categories: {e}")
    print("Using MultinomialNB fallback for top category")
    mnb = MultinomialNB().fit(X_train, y_train_top)
    y_pred_top = mnb.predict(X_test_text)
    test_df['predicted_top_category'] = top_encoder.inverse_transform(y_pred_top)

# Bottom category: start from the most frequent training category as the
# default, and overwrite it only if the best model predicts successfully.
most_common_bottom = bottom_cat_dist.index[0]
test_df['predicted_bottom_category'] = most_common_bottom
try:
    # A score of 0 marks a model that failed to train or a placeholder entry
    if not best_model_bottom[1]['f1_score'] > 0:
        raise ValueError("Bottom category model didn't train properly")
    y_pred_bottom = best_model_bottom[1]['model'].predict(X_test_text)
    # Map the encoded predictions back to the original category ids
    test_df['predicted_bottom_category'] = bottom_encoder.inverse_transform(y_pred_bottom)
except Exception as e:
    print(f"Error in bottom category prediction: {e}")
    print("Using fallback for bottom category prediction...")
    test_df['predicted_bottom_category'] = most_common_bottom

# Write the id column and both predicted categories to disk
output_columns = ['product_id', 'predicted_top_category', 'predicted_bottom_category']
predictions = test_df.loc[:, output_columns]
predictions.to_csv('predictions.csv', index=False)

print("Predictions saved to predictions.csv")
print("Visualizations saved to the 'visualizations' directory")

# Final recap: one line per model, for each of the two targets
print("\n=== SUMMARY OF RESULTS ===")
summary_sections = (
    ("Top Category Classification:", top_category_results),
    ("\nBottom Category Classification (filtered classes):", bottom_category_results),
)
for heading, section_results in summary_sections:
    print(heading)
    for name, results in section_results.items():
        print(f"  {name}: F1 Score = {results['f1_score']:.4f}, Training Time = {results['training_time']:.2f}s")

Starting Etsy Product Classification Project
Using Random Forest, SGD Classifier, and Naive Bayes models
Loading 362 files from E:/sunday/data_2025/2025/train...
Loaded 5/362 files
Loaded 10/362 files
Loaded 15/362 files
Loaded 20/362 files
Loaded 25/362 files
Loaded 30/362 files
Loaded 35/362 files
Loaded 40/362 files
Loaded 45/362 files
Loaded 50/362 files
Loaded 55/362 files
Loaded 60/362 files
Loaded 65/362 files
Loaded 70/362 files
Loaded 75/362 files
Loaded 80/362 files
Loaded 85/362 files
Loaded 90/362 files
Loaded 95/362 files
Loaded 100/362 files
Loaded 105/362 files
Loaded 110/362 files
Loaded 115/362 files
Loaded 120/362 files
Loaded 125/362 files
Loaded 130/362 files
Loaded 135/362 files
Loaded 140/362 files
Loaded 145/362 files
Loaded 150/362 files
Loaded 155/362 files
Loaded 160/362 files
Loaded 165/362 files
Loaded 170/362 files
Loaded 175/362 files
Loaded 180/362 files
Loaded 185/362 files
Loaded 190/362 files
Loaded 195/362 files
Loaded 200/362 files
Loaded 205/362 fil

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):



=== FEATURE ENGINEERING ===
Encoding target variables...
Number of unique top categories: 15
Number of unique bottom categories: 2609
Creating text features using hashing vectorizer...
Hashed features shape: (229624, 2048)

=== MODEL TRAINING - TOP CATEGORY ===
Training set shape: (206661, 2048)
Validation set shape: (22963, 2048)

Performing dimensionality reduction for visualization...

Training models for Top Category Classification...

Training Random Forest for Top Category...
Predicting with Random Forest...
Random Forest F1 Score: 0.3245
Training time: 48.81 seconds

Training SGD Classifier for Top Category...




Predicting with SGD Classifier...
SGD Classifier F1 Score: 0.3749
Training time: 8.52 seconds

Training Multinomial Naive Bayes for Top Category...
Predicting with Multinomial Naive Bayes...
Multinomial Naive Bayes F1 Score: 0.5652
Training time: 0.85 seconds

Training models for Bottom Category Classification...
Using 200 common bottom categories
Filtered training set shape: (17555, 2048)
Filtered validation set shape: (1178, 2048)

Training Random Forest for Bottom Category...
Predicting with Random Forest...
Random Forest F1 Score: 0.3135
Training time: 3.92 seconds

Training SGD Classifier for Bottom Category...




Predicting with SGD Classifier...
SGD Classifier F1 Score: 0.6573
Training time: 5.59 seconds

Training Multinomial Naive Bayes for Bottom Category...
Predicting with Multinomial Naive Bayes...
Multinomial Naive Bayes F1 Score: 0.8202
Training time: 0.81 seconds


  if pd.api.types.is_categorical_dtype(vector):
  with pd.option_context('mode.use_inf_as_na', True):



Best model for top category: Multinomial Naive Bayes (F1: 0.5652)
Best model for bottom category: Multinomial Naive Bayes (F1: 0.8202)

Making predictions on test data...

Best model for top category: Multinomial Naive Bayes (F1: 0.5652)
Best model for bottom category: Multinomial Naive Bayes (F1: 0.8202)
Predictions saved to predictions.csv
Visualizations saved to the 'visualizations' directory

=== SUMMARY OF RESULTS ===
Top Category Classification:
  Random Forest: F1 Score = 0.3245, Training Time = 48.81s
  SGD Classifier: F1 Score = 0.3749, Training Time = 8.52s
  Multinomial Naive Bayes: F1 Score = 0.5652, Training Time = 0.85s

Bottom Category Classification (filtered classes):
  Random Forest: F1 Score = 0.3135, Training Time = 3.92s
  SGD Classifier: F1 Score = 0.6573, Training Time = 5.59s
  Multinomial Naive Bayes: F1 Score = 0.8202, Training Time = 0.81s
