# 3-Class Transformer Sentiment Pipeline

**High-level summary:**  
An end-to-end PyTorch Transformer pipeline that builds an enhanced tokenizer & vocabulary, defines a pre-layer-norm multi-head attention Transformer, trains with OneCycleLR & label smoothing on imbalanced data, evaluates with detailed metrics, and provides an inference utility for new text.

In [16]:
# prompt: connect google drive

from google.colab import drive
drive.mount('/content/drive')

# prompt: load current directory

import os

os.chdir('/content/drive/My Drive/CS605-NLP-Project')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import pandas as pd
import numpy as np

class FinalScoreAnalyzer:
    def __init__(self, alpha=0.5):
        """
        Initialize analyzer with weight parameter
        alpha: weight for true_label (default 0.5 for equal weighting)
        """
        # Target categories to analyze
        self.target_categories = [
                    "ice cream stand",
                    "pop a nana",
                    "super hungry food stand",
                    "mel s drive in",
                    "pops popcorn delight",
                    "starbucks",
                    "kt s grill",
                    "loui s ny pizza parlor",
                    "me want cookie",
                    "frozen fuel",
                    "planet yen",
                    "starbot café",
                    "stardots",
                    "discovery food court",
                    "fossil fuels",
                    "jungle bites",
                    "friar s good food",
                    "goldilocks"
        ]

        self.alpha = alpha

    def calculate_final_scores(self, input_file, output_file=None, alpha=None):
        """
        Calculate final scores using formula: final_score = a * true_label + (1-a) * ensemble_prediction
        """
        if alpha is not None:
            self.alpha = alpha

        try:
            # Read the input file
            if input_file.lower().endswith('.csv'):
                df = pd.read_csv(input_file)
            else:
                df = pd.read_excel(input_file)

            print(f"Loaded {len(df)} records from {input_file}")
            print(f"Using alpha = {self.alpha} (weight for true_label)")

            # Check required columns
            required_cols = ['Matched_Categories', 'true_label', 'ensemble_prediction']
            missing_cols = [col for col in required_cols if col not in df.columns]

            if missing_cols:
                print(f"Error: Missing required columns: {missing_cols}")
                print(f"Available columns: {list(df.columns)}")
                return None, None

            # Initialize results dataframe
            results_df = df.copy()

            # Add category-specific columns
            for category in self.target_categories:
                clean_name = category.replace(' ', '_').replace("'", "").replace('-', '_')
                results_df[f'category_match_{clean_name}'] = 0
                results_df[f'final_score_{clean_name}'] = 0.0

            # Process each row
            for idx, row in results_df.iterrows():
                matched_categories = str(row['Matched_Categories'])
                true_label = row['true_label']
                ensemble_prediction = row['ensemble_prediction']

                # Validate values are in expected range (0, 1, 2)
                if true_label not in [0, 1, 2] or ensemble_prediction not in [0, 1, 2]:
                    print(f"Warning: Row {idx} has unexpected values - true_label: {true_label}, ensemble_prediction: {ensemble_prediction}")

                if matched_categories and matched_categories != '' and matched_categories != 'nan':
                    # Split multiple categories
                    categories = [cat.strip() for cat in matched_categories.split('|')]

                    # Check each target category
                    for target_category in self.target_categories:
                        clean_name = target_category.replace(' ', '_').replace("'", "").replace('-', '_')

                        # Check if this target category appears in matched categories
                        if target_category in categories:
                            results_df.at[idx, f'category_match_{clean_name}'] = 1

                            # Calculate final score using the formula
                            final_score = self.alpha * true_label + (1 - self.alpha) * ensemble_prediction
                            results_df.at[idx, f'final_score_{clean_name}'] = final_score

            # Save individual results if output file specified
            if output_file:
                if output_file.lower().endswith('.csv'):
                    results_df.to_csv(output_file, index=False)
                else:
                    results_df.to_excel(output_file, index=False)
                print(f"Individual results saved to {output_file}")

            # Create grouped analysis
            grouped_results = self.create_grouped_analysis(results_df, min_count=30)

            return results_df, grouped_results

        except Exception as e:
            print(f"Error: {str(e)}")
            return None, None

    def create_grouped_analysis(self, df, min_count=30):
        """
        Group by categories and calculate mean final scores, ranked by mean final score
        Only show categories with count >= min_count
        """
        print("\n" + "="*80)
        print("FINAL SCORE ANALYSIS - GROUPED BY CATEGORIES")
        print("="*80)

        grouped_data = []

        for category in self.target_categories:
            clean_name = category.replace(' ', '_').replace("'", "").replace('-', '_')

            category_match_col = f'category_match_{clean_name}'
            final_score_col = f'final_score_{clean_name}'

            if category_match_col in df.columns and final_score_col in df.columns:
                # Only calculate for rows where this category was matched
                category_rows = df[df[category_match_col] == 1]

                if len(category_rows) > 0:
                    total_occurrences = len(category_rows)
                    mean_final_score = category_rows[final_score_col].mean()
                    min_final_score = category_rows[final_score_col].min()
                    max_final_score = category_rows[final_score_col].max()
                    std_final_score = category_rows[final_score_col].std()

                    grouped_data.append({
                        'category': category,
                        'total_occurrences': int(total_occurrences),
                        'mean_final_score': mean_final_score,
                        'min_final_score': min_final_score,
                        'max_final_score': max_final_score,
                        'std_final_score': std_final_score if not pd.isna(std_final_score) else 0.0
                    })

        # Create grouped DataFrame
        grouped_df = pd.DataFrame(grouped_data)

        # Filter by minimum count
        filtered_df = grouped_df[grouped_df['total_occurrences'] >= min_count].copy()

        # Sort by mean final score (descending) - this is the ranking you requested
        filtered_df = filtered_df.sort_values('mean_final_score', ascending=False)
        filtered_df = filtered_df.reset_index(drop=True)
        filtered_df['rank'] = filtered_df.index + 1

        # Display results
        print(f"\nFINAL SCORE RANKINGS (Alpha = {self.alpha}, Min Count = {min_count}):")
        print("-" * 100)

        if len(filtered_df) > 0:
            print(f"{'Rank':<5} {'Category':<45} {'Count':<8} {'Mean Score':<12} {'Min':<8} {'Max':<8} {'Std':<8}")
            print("-" * 100)

            for _, row in filtered_df.iterrows():
                print(f"{row['rank']:<5} {row['category']:<45} {row['total_occurrences']:<8} "
                      f"{row['mean_final_score']:<12.4f} {row['min_final_score']:<8.2f} "
                      f"{row['max_final_score']:<8.2f} {row['std_final_score']:<8.4f}")
        else:
            print(f"No categories found with count >= {min_count}")

        # Additional insights
        print(f"\n" + "="*80)
        print("ANALYSIS SUMMARY:")
        print("="*80)

        total_records = len(df)
        categories_found = len(grouped_df)
        categories_displayed = len(filtered_df)

        print(f"📊 Total records analyzed: {total_records}")
        print(f"🎯 Categories found: {categories_found}/{len(self.target_categories)}")
        print(f"📈 Categories displayed (count >= {min_count}): {categories_displayed}/{categories_found}")
        print(f"⚖️  Alpha (true_label weight): {self.alpha}")
        print(f"⚖️  Beta (ensemble_prediction weight): {1-self.alpha}")

        if len(filtered_df) > 0:
            print(f"🏆 Top ranked category: {filtered_df.iloc[0]['category']} (score: {filtered_df.iloc[0]['mean_final_score']:.4f})")
            if len(filtered_df) > 1:
                print(f"🥈 Second ranked: {filtered_df.iloc[1]['category']} (score: {filtered_df.iloc[1]['mean_final_score']:.4f})")
            if len(filtered_df) > 2:
                print(f"🥉 Third ranked: {filtered_df.iloc[2]['category']} (score: {filtered_df.iloc[2]['mean_final_score']:.4f})")

        # Show categories that were filtered out (below min_count)
        below_threshold = grouped_df[grouped_df['total_occurrences'] < min_count]
        if len(below_threshold) > 0:
            print(f"\n📋 Categories below threshold (count < {min_count}):")
            for _, row in below_threshold.sort_values('total_occurrences', ascending=False).iterrows():
                print(f"   - {row['category']}: {row['total_occurrences']} occurrences")

        # Categories not found at all
        found_categories = set(grouped_df['category'].tolist())
        not_found = [cat for cat in self.target_categories if cat not in found_categories]
        if not_found:
            print(f"\n🔍 Categories not found ({len(not_found)}):")
            for cat in not_found:
                print(f"   - {cat}")

        print("="*80)

        return filtered_df

    def save_grouped_results(self, grouped_df, filename):
        """Save grouped results to file"""
        if filename.lower().endswith('.csv'):
            grouped_df.to_csv(filename, index=False)
        else:
            grouped_df.to_excel(filename, index=False)
        print(f"Grouped results saved to {filename}")

    def analyze_with_different_alphas(self, input_file, alphas=[0.3, 0.5, 0.7]):
        """
        Compare results with different alpha values
        """
        print("COMPARING DIFFERENT ALPHA VALUES:")
        print("="*50)

        all_results = {}

        for alpha in alphas:
            print(f"\nAnalyzing with alpha = {alpha}")
            print("-" * 30)

            _, grouped_df = self.calculate_final_scores(input_file, alpha=alpha)
            if grouped_df is not None and len(grouped_df) > 0:
                all_results[alpha] = grouped_df[['category', 'mean_final_score']].copy()
                all_results[alpha].columns = ['category', f'score_alpha_{alpha}']

                # Show top 5
                print(f"Top 5 categories (alpha={alpha}):")
                for i, row in grouped_df.head(5).iterrows():
                    print(f"  {i+1}. {row['category']}: {row['mean_final_score']:.4f}")

        return all_results

    def analyze_from_file(self, input_file, individual_output=None, grouped_output=None, alpha=0.5, min_count=30):
        """
        Complete analysis workflow
        """
        print("Starting Final Score Analysis...")
        print(f"Formula: final_score = {alpha} * true_label + {1-alpha} * ensemble_prediction")
        print(f"Target categories: {len(self.target_categories)}")
        print(f"Minimum count threshold: {min_count}")

        # Perform analysis
        individual_df, grouped_df = self.calculate_final_scores(input_file, individual_output, alpha)

        if grouped_df is not None and grouped_output:
            self.save_grouped_results(grouped_df, grouped_output)

        return individual_df, grouped_df


# Example usage and testing
if __name__ == "__main__":
    # Initialize analyzer
    analyzer = FinalScoreAnalyzer(alpha=0.5)

    print("Final Score Analyzer")
    print("="*50)
    print(f"Formula: final_score = a * true_label + (1-a) * ensemble_prediction")
    print(f"Expected values: true_label, ensemble_prediction ∈ {0, 1, 2}")
    print(f"Target categories: {len(analyzer.target_categories)}")

    print("\n" + "="*50)
    print("USAGE EXAMPLES:")
    print("="*50)

    print("\n1. Basic analysis with default alpha=0.5:")
    print("analyzer.analyze_from_file('your_data.csv')")

    print("\n2. Analysis with custom parameters:")
    print("analyzer.analyze_from_file(")
    print("    'your_data.csv',")
    print("    individual_output='detailed_scores.csv',")
    print("    grouped_output='category_rankings.csv',")
    print("    alpha=0.7,      # Give more weight to true_label")
    print("    min_count=50    # Only show categories with 50+ occurrences")
    print(")")

    print("\n3. Compare different alpha values:")
    print("results = analyzer.analyze_with_different_alphas(")
    print("    'your_data.csv',")
    print("    alphas=[0.3, 0.5, 0.7]")
    print(")")

    print("\nExpected input columns:")
    print("- Matched_Categories: categories found (separated by |)")
    print("- true_label: ground truth value (0, 1, or 2)")
    print("- ensemble_prediction: model prediction (0, 1, or 2)")

    print("\nOutput: Categories ranked by mean final score!")

Final Score Analyzer
Formula: final_score = a * true_label + (1-a) * ensemble_prediction
Expected values: true_label, ensemble_prediction ∈ (0, 1, 2)
Target categories: 18

USAGE EXAMPLES:

1. Basic analysis with default alpha=0.5:
analyzer.analyze_from_file('your_data.csv')

2. Analysis with custom parameters:
analyzer.analyze_from_file(
    'your_data.csv',
    individual_output='detailed_scores.csv',
    grouped_output='category_rankings.csv',
    alpha=0.7,      # Give more weight to true_label
    min_count=50    # Only show categories with 50+ occurrences
)

3. Compare different alpha values:
results = analyzer.analyze_with_different_alphas(
    'your_data.csv',
    alphas=[0.3, 0.5, 0.7]
)

Expected input columns:
- Matched_Categories: categories found (separated by |)
- true_label: ground truth value (0, 1, or 2)
- ensemble_prediction: model prediction (0, 1, or 2)

Output: Categories ranked by mean final score!


In [18]:

# Initialize with default alpha=0.5 (equal weighting)
#analyzer = FinalScoreAnalyzer(alpha=0.5)

analyzer.analyze_from_file(
    'datastore/google_reviews_adaptive_moe_results_classified.csv',
    individual_output='result/food_detailed_scores.csv',
    grouped_output='result/food_category_rankings.csv',
    alpha=0.5,      # Weight for true_label
    min_count=50    # Only show categories with 50+ occurrences
)

# Test different weightings
results = analyzer.analyze_with_different_alphas(
    'datastore/google_reviews_adaptive_moe_results_classified.csv',
    alphas=[0.1,0.3 , 0.5, 0.7, 0.9]
)

Starting Final Score Analysis...
Formula: final_score = 0.5 * true_label + 0.5 * ensemble_prediction
Target categories: 18
Minimum count threshold: 50
Loaded 29412 records from datastore/google_reviews_adaptive_moe_results_classified.csv
Using alpha = 0.5 (weight for true_label)
Individual results saved to result/food_detailed_scores.csv

FINAL SCORE ANALYSIS - GROUPED BY CATEGORIES

FINAL SCORE RANKINGS (Alpha = 0.5, Min Count = 30):
----------------------------------------------------------------------------------------------------
Rank  Category                                      Count    Mean Score   Min      Max      Std     
----------------------------------------------------------------------------------------------------
1     jungle bites                                  78       1.9744       1.50     2.00     0.1110  
2     planet yen                                    287      1.8240       0.00     2.00     0.4258  
3     discovery food court                          103 