### Student Information
Name: 朱晉輝 Chin-Hui Chu

Student ID: r12944041

GitHub ID: CHINHUICHU

Kaggle name: Chin Hui Chu

Kaggle private scoreboard snapshot:![kaggle](kaggle_ranking.png)



---

### Instructions

1. First: __This part is worth 30% of your grade.__ Do the **take home exercises** in the [DM2024-Lab2-master Repo](https://github.com/didiersalazar/DM2024-Lab2-Master). You may need to copy some cells from the Lab notebook to this notebook. 


2. Second: __This part is worth 30% of your grade.__ Participate in the in-class [Kaggle Competition](https://www.kaggle.com/competitions/dm-2024-isa-5810-lab-2-homework) regarding Emotion Recognition on Twitter by this link: https://www.kaggle.com/competitions/dm-2024-isa-5810-lab-2-homework. The scoring will be given according to your place in the Private Leaderboard ranking: 
    - **Bottom 40%**: Get 20% of the 30% available for this section.

    - **Top 41% - 100%**: Get (0.6N + 1 - x) / (0.6N) * 10 + 20 points, where N is the total number of participants, and x is your rank. (e.g., if there are 100 participants and you rank 3rd, your score will be (0.6 * 100 + 1 - 3) / (0.6 * 100) * 10 + 20 = 29.67% out of 30%.)   
    Submit your last submission **BEFORE the deadline (Nov. 26th, 11:59 pm, Tuesday)**. Make sure to take a screenshot of your position at the end of the competition and store it as '''pic0.png''' under the **img** folder of this repository and rerun the cell **Student Information**.
    

3. Third: __This part is worth 30% of your grade.__ A report of your work developing the model for the competition (you can use code and comment on it). This report should include your preprocessing steps, your feature engineering steps, and an explanation of your model. You can also mention different things you tried and insights you gained. 


4. Fourth: __This part is worth 10% of your grade.__ It's hard for us to follow if your code is messy :'(, so please **tidy up your notebook**.


Upload your files to your repository then submit the link to it on the corresponding e-learn assignment.

Make sure to commit and save your changes to your repository __BEFORE the deadline (Nov. 26th, 11:59 pm, Tuesday)__. 

In [2]:
### Begin Assignment Here
class Config:
    """Hyperparameters, output location and label encoding for the emotion classifier."""

    # Pretrained checkpoint and size of the classification head
    MODEL_NAME = "roberta-base"
    NUM_LABELS = 8

    # Tokenization and optimisation settings
    MAX_LENGTH = 128
    BATCH_SIZE = 256
    NUM_EPOCHS = 1
    LEARNING_RATE = 2e-5
    SEED = 42

    # Directory where checkpoints and prediction files are written
    OUTPUT_DIR = "/content/drive/MyDrive/dm-2024/roberta_emotion_results"

    # Emotion name -> integer class id (position in the tuple is the id)
    EMOTION_MAPPING = dict(zip(
        (
            'anger',
            'anticipation',
            'disgust',
            'fear',
            'sadness',
            'surprise',
            'trust',
            'joy',
        ),
        range(8),
    ))

In [None]:
import pandas as pd
import json
import re
import emoji
from typing import Tuple
from datasets import Dataset

class DataProcessor:
    """Static helpers that load the raw tweet files and build model-ready text."""

    # Literal ``\uXXXX`` sequences that are still present after decoding
    _LEFTOVER_ESCAPE_RE = re.compile(r'\\u[0-9a-fA-F]{4}')

    @staticmethod
    def read_csv_file(file_path: str, **kwargs) -> pd.DataFrame:
        """Load a CSV file with ``pd.read_csv``.

        Args:
            file_path: Path of the CSV file.
            **kwargs: Forwarded unchanged to ``pd.read_csv``.

        Returns:
            The loaded DataFrame, or ``None`` when the file is missing or
            cannot be read (the error is printed).
        """
        try:
            return pd.read_csv(file_path, **kwargs)
        except FileNotFoundError:
            print(f"Error: File '{file_path}' not found")
        except Exception as e:
            print(f"Error reading CSV file: {str(e)}")
        return None

    @staticmethod
    def read_json_file(file_path: str) -> list:
        """Read a JSON-lines tweet dump into a list of flat dictionaries.

        Blank lines are ignored. A line that is not valid JSON, or that is
        valid JSON but not an object, is reported and skipped so that one bad
        record does not discard the whole file.

        Args:
            file_path: Path of the file holding one JSON object per line.

        Returns:
            A list of dicts with the keys ``score``, ``index``, ``crawl_date``,
            ``type``, ``tweet_id``, ``text`` and ``hashtags``; or ``None`` when
            the file itself cannot be opened or decoded (the error is printed).
        """
        try:
            data = []
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        tweet_data = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"Error parsing JSON object: {str(e)}")
                        continue
                    if not isinstance(tweet_data, dict):
                        print("Error parsing JSON object: expected one JSON object per line")
                        continue

                    # Tolerate a missing/null '_source' or 'tweet' level.
                    source = tweet_data.get('_source')
                    tweet = source.get('tweet') if isinstance(source, dict) else None
                    if not isinstance(tweet, dict):
                        tweet = {}

                    data.append({
                        'score': tweet_data.get('_score'),
                        'index': tweet_data.get('_index'),
                        'crawl_date': tweet_data.get('_crawldate'),
                        'type': tweet_data.get('_type'),
                        'tweet_id': tweet.get('tweet_id'),
                        'text': tweet.get('text'),
                        'hashtags': tweet.get('hashtags', []),
                    })
            return data
        except (OSError, UnicodeError) as e:
            print(f"Error reading JSON file: {str(e)}")
        return None

    @staticmethod
    def _decode_escapes(text: str) -> str:
        """Decode backslash escapes in *text* without damaging real Unicode.

        Raises:
            UnicodeDecodeError: If *text* contains a malformed escape sequence
                (for example a trailing backslash).
        """
        # Encoding as UTF-8 before 'unicode-escape' would reinterpret every
        # non-ASCII character byte-by-byte as Latin-1 (mojibake). Latin-1 with
        # 'backslashreplace' instead maps code points < 256 to single bytes and
        # everything else to \uXXXX / \UXXXXXXXX escapes, all of which
        # 'unicode-escape' restores exactly.
        # NOTE: a literal backslash placed directly before a non-Latin-1
        # character is still ambiguous and will not round-trip.
        decoded = text.encode('latin-1', 'backslashreplace').decode('unicode-escape')
        # Escaped surrogate pairs (e.g. "\ud83d\ude02") decode to two lone
        # surrogates; a UTF-16 round trip joins each valid pair into one code
        # point while 'surrogatepass' leaves unpaired surrogates untouched.
        return decoded.encode('utf-16-le', 'surrogatepass').decode('utf-16-le', 'surrogatepass')

    @staticmethod
    def clean_text(text: str) -> str:
        """Decode escapes, replace emoji by their text names and strip the result.

        Args:
            text: Raw tweet text. Non-string input yields an empty string.

        Returns:
            The cleaned text. When the escapes cannot be decoded the raw text
            is kept; emoji are still demojized and any leftover literal
            ``\\uXXXX`` sequences are removed.
        """
        if not isinstance(text, str):
            return ''

        try:
            decoded_text = DataProcessor._decode_escapes(text)
        except UnicodeError:
            # Malformed escape somewhere in the tweet: fall back to the raw text.
            decoded_text = text

        try:
            decoded_text = emoji.demojize(decoded_text)
        except (UnicodeError, AttributeError):
            pass

        decoded_text = DataProcessor._LEFTOVER_ESCAPE_RE.sub('', decoded_text)
        return decoded_text.strip()

    @staticmethod
    def merge_text_hashtags(row) -> str:
        """Return the cleaned tweet text with any missing hashtags appended.

        Args:
            row: Mapping/Series with a ``text`` entry (str) and a ``hashtags``
                entry (list); entries of another type are treated as empty.

        Returns:
            The cleaned text followed by ``#tag`` for every hashtag that does
            not already occur in it as a whole tag.
        """
        text = row['text'] if isinstance(row['text'], str) else ''
        hashtags = row['hashtags'] if isinstance(row['hashtags'], list) else []

        cleaned_text = DataProcessor.clean_text(text)

        for hashtag in hashtags:
            hashtag_text = f"#{hashtag}"
            # Require a tag boundary so "#love" is not considered present just
            # because the text contains "#lovely".
            if not re.search(re.escape(hashtag_text) + r'(?!\w)', cleaned_text):
                cleaned_text += f" {hashtag_text}"

        return cleaned_text.strip()

    @classmethod
    def prepare_dataset(cls, df: pd.DataFrame, is_training: bool = True) -> pd.DataFrame:
        """Build the final frame used for training or inference.

        Args:
            df: Frame with ``tweet_id``, ``text`` and ``hashtags`` columns, plus
                ``emotion`` when ``is_training`` is true.
            is_training: Whether to keep the ``emotion`` label column.

        Returns:
            A copy of *df* whose ``text`` column holds the merged, cleaned text,
            reduced to ``tweet_id``, ``text`` and (for training) ``emotion``.
        """
        processed_df = df.copy()
        processed_df['text'] = processed_df.apply(cls.merge_text_hashtags, axis=1)

        if is_training:
            return processed_df[['tweet_id', 'text', 'emotion']]
        return processed_df[['tweet_id', 'text']]

In [None]:
import torch
from transformers import (
    AutoConfig,
    RobertaTokenizer,
    RobertaForSequenceClassification,
    DataCollatorWithPadding
)
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

class ModelUtils:
    """Metric, plotting and tokenization helpers shared by training and inference."""

    @staticmethod
    def calculate_metrics(predictions, labels):
        """Compute per-class and macro-averaged F1.

        Args:
            predictions: Predicted class ids.
            labels: Reference class ids.

        Returns:
            Dict with ``mean_f1`` (macro F1) and ``f1_per_class`` (one score
            per class, as returned by ``f1_score(..., average=None)``).
        """
        f1_per_class = f1_score(labels, predictions, average=None)
        mean_f1 = f1_score(labels, predictions, average='macro')
        return {
            'mean_f1': mean_f1,
            'f1_per_class': f1_per_class
        }

    @staticmethod
    def plot_metrics(train_losses, mean_f1_scores, output_dir):
        """Save a two-panel figure (loss per step, mean F1 per epoch).

        Args:
            train_losses: Training loss values, one per optimisation step.
            mean_f1_scores: Mean F1 values, one per epoch.
            output_dir: Directory that receives ``training_metrics.png``;
                created if it does not exist.
        """
        # Imported here because this cell does not import ``os`` itself.
        import os

        os.makedirs(output_dir, exist_ok=True)

        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.plot(train_losses, label='Training Loss')
        plt.title('Training Loss over Steps')
        plt.xlabel('Step')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(mean_f1_scores, label='Mean F1 Score')
        plt.title('Mean F1 Score over Epochs')
        plt.xlabel('Epoch')
        plt.ylabel('Score')
        plt.legend()

        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'training_metrics.png'))
        plt.close()

    @staticmethod
    def _tokenize_dataset(dataset, tokenizer, max_length, desc):
        """Tokenize the ``text`` column and drop the raw ``text``/``tweet_id`` columns."""
        def tokenize_function(examples):
            return tokenizer(
                examples['text'],
                padding='max_length',
                truncation=True,
                max_length=max_length
            )

        return dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=['text', 'tweet_id'],
            desc=desc
        )

    @staticmethod
    def prepare_data_for_model(train_df, test_df, tokenizer, max_length=128):
        """Convert the train/test frames into tokenized ``Dataset`` objects.

        Args:
            train_df: Frame with ``text``, ``emotion`` and ``tweet_id`` columns.
                May be ``None`` or an empty frame when only the test split is
                needed (inference); no training dataset is built in that case.
            test_df: Frame with ``text`` and ``tweet_id`` columns; an optional
                ``emotion`` column is encoded into ``labels``.
            tokenizer: Callable tokenizer applied to batches of text.
            max_length: Length every sequence is padded/truncated to.

        Returns:
            ``(train_dataset, test_dataset)``; ``train_dataset`` is ``None``
            when no training frame was supplied.
        """
        # Inference callers pass an empty frame, which has no 'text' column
        # to index, so the training split is skipped instead of raising.
        has_train = train_df is not None and not train_df.empty

        train_dataset = None
        if has_train:
            train_dict = {
                'text': train_df['text'].tolist(),
                'labels': train_df['emotion'].map(Config.EMOTION_MAPPING).tolist(),
                'tweet_id': train_df['tweet_id'].tolist()
            }
            train_dataset = ModelUtils._tokenize_dataset(
                Dataset.from_dict(train_dict),
                tokenizer,
                max_length,
                "Tokenizing train dataset"
            )

        test_dict = {
            'text': test_df['text'].tolist(),
            'tweet_id': test_df['tweet_id'].tolist()
        }
        if 'emotion' in test_df.columns:
            test_dict['labels'] = test_df['emotion'].map(Config.EMOTION_MAPPING).tolist()

        test_dataset = ModelUtils._tokenize_dataset(
            Dataset.from_dict(test_dict),
            tokenizer,
            max_length,
            "Tokenizing test dataset"
        )

        return train_dataset, test_dataset

In [None]:
import os
import logging
import torch
import numpy as np
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from transformers import (
    AutoConfig,
    RobertaTokenizer,
    RobertaForSequenceClassification,
    DataCollatorWithPadding,
    get_scheduler
)
from sklearn.metrics import f1_score
import pandas as pd
import matplotlib.pyplot as plt

# trainer.py
class EmotionClassificationTrainer:
    """Fine-tune a RoBERTa sequence classifier and run inference through Accelerate."""

    def __init__(self, config, accelerator):
        """Store the settings and load the tokenizer and model.

        Args:
            config: Object exposing the ``Config`` attributes (``MODEL_NAME``,
                ``NUM_LABELS``, ``MAX_LENGTH``, ``BATCH_SIZE``, ...).
            accelerator: ``accelerate.Accelerator`` used for device placement,
                backward passes and cross-process gathering.
        """
        self.config = config
        self.accelerator = accelerator
        self.logger = get_logger(__name__)
        self.setup_model_and_tokenizer()

    def setup_model_and_tokenizer(self):
        """Load the model config, tokenizer and classification model."""
        self.config_model = AutoConfig.from_pretrained(
            self.config.MODEL_NAME,
            num_labels=self.config.NUM_LABELS,
            finetuning_task="emotion_classification"
        )

        self.tokenizer = RobertaTokenizer.from_pretrained(self.config.MODEL_NAME)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.config_model.pad_token_id = self.tokenizer.pad_token_id

        self.model = RobertaForSequenceClassification.from_pretrained(
            self.config.MODEL_NAME,
            config=self.config_model,
            ignore_mismatched_sizes=True
        )

    def _setup_optimizer_and_scheduler(self, num_training_steps):
        """Create AdamW (no decay on bias/LayerNorm weights) and a linear LR schedule.

        Args:
            num_training_steps: Total number of optimisation steps.

        Returns:
            ``(optimizer, scheduler)``.
        """
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters()
                          if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {
                "params": [p for n, p in self.model.named_parameters()
                          if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                    lr=self.config.LEARNING_RATE)

        scheduler = get_scheduler(
            name="linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps,
        )

        return optimizer, scheduler

    def _make_predictions(self, eval_dataloader):
        """Run the model over a prepared dataloader.

        Args:
            eval_dataloader: Dataloader already passed through
                ``accelerator.prepare``.

        Returns:
            ``(all_predictions, all_probs)``: a list of predicted class ids and
            a list of per-class probability arrays, one entry per sample.
        """
        self.model.eval()
        all_predictions = []
        all_probs = []

        for batch in tqdm(eval_dataloader, desc="Making predictions"):
            with torch.no_grad():
                outputs = self.model(**batch)

            predictions = outputs.logits.argmax(dim=-1)
            probabilities = torch.softmax(outputs.logits, dim=-1)

            # gather_for_metrics drops the samples Accelerate duplicates to
            # even out the last batch across processes, so the output length
            # matches the dataset length.
            predictions, probabilities = self.accelerator.gather_for_metrics(
                (predictions, probabilities))
            all_predictions.extend(predictions.cpu().numpy())
            all_probs.extend(probabilities.cpu().numpy())

        return all_predictions, all_probs

    def _save_checkpoint(self, epoch):
        """Save model and tokenizer for *epoch* (main process only)."""
        if self.accelerator.is_main_process:
            checkpoint_dir = os.path.join(self.config.OUTPUT_DIR, f"checkpoint-{epoch}-50%")
            unwrapped_model = self.accelerator.unwrap_model(self.model)
            unwrapped_model.save_pretrained(
                checkpoint_dir,
                save_function=self.accelerator.save
            )
            self.tokenizer.save_pretrained(checkpoint_dir)

    def _save_predictions(self, test_df, predictions, probabilities):
        """Write predictions and class probabilities to CSV (main process only).

        Args:
            test_df: Frame whose ``tweet_id`` column is in the same order as
                *predictions*.
            predictions: Predicted class id per sample.
            probabilities: Per-class probabilities per sample.

        Returns:
            The predictions DataFrame on the main process, otherwise ``None``.
        """
        if self.accelerator.is_main_process:
            test_predictions_df = pd.DataFrame({
                'tweet_id': test_df['tweet_id'],
                'predicted_emotion': predictions,
                'confidence': [max(probs) for probs in probabilities]
            })
            predictions_path = os.path.join(self.config.OUTPUT_DIR,
                                          'test_predictions.csv')
            test_predictions_df.to_csv(predictions_path, index=False)

            probs_df = pd.DataFrame(
                probabilities,
                columns=[f'prob_class_{i}' for i in range(self.config.NUM_LABELS)]
            )
            # probs_df has a fresh RangeIndex while test_df usually keeps the
            # index of the frame it was filtered from; assign positionally so
            # pandas does not align on index and produce NaN/misplaced ids.
            probs_df['tweet_id'] = test_df['tweet_id'].to_numpy()
            probs_path = os.path.join(self.config.OUTPUT_DIR,
                                    'test_probabilities.csv')
            probs_df.to_csv(probs_path, index=False)

            self.logger.info(f"Predictions saved to {predictions_path}")
            self.logger.info(f"Probabilities saved to {probs_path}")

            return test_predictions_df

    def prepare_datasets(self, train_df):
        """Split *train_df* 80/20 and build the train/validation dataloaders.

        Args:
            train_df: Frame with ``tweet_id``, ``text`` and ``emotion`` columns.

        Returns:
            ``(train_dataloader, val_dataloader)``.
        """
        from sklearn.model_selection import train_test_split

        train_df, val_df = train_test_split(
            train_df,
            test_size=0.2,  # 20% for validation
            random_state=self.config.SEED
        )

        train_dataset, val_dataset = ModelUtils.prepare_data_for_model(
            train_df, val_df, self.tokenizer, self.config.MAX_LENGTH
        )

        data_collator = DataCollatorWithPadding(self.tokenizer)
        train_dataloader = DataLoader(
            train_dataset,
            shuffle=True,
            collate_fn=data_collator,
            batch_size=self.config.BATCH_SIZE
        )
        val_dataloader = DataLoader(
            val_dataset,
            collate_fn=data_collator,
            batch_size=self.config.BATCH_SIZE
        )

        return train_dataloader, val_dataloader

    def train(self, train_dataloader, eval_dataloader):
        """Train for ``config.NUM_EPOCHS`` epochs, evaluating after each one.

        Args:
            train_dataloader: Dataloader yielding labelled training batches.
            eval_dataloader: Dataloader yielding labelled validation batches.

        Returns:
            ``(model, tokenizer, metrics)`` where ``metrics`` holds
            ``training_losses`` (per step) and ``mean_f1_scores`` (per epoch).
        """
        num_training_steps = self.config.NUM_EPOCHS * len(train_dataloader)
        optimizer, lr_scheduler = self._setup_optimizer_and_scheduler(num_training_steps)

        self.model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = \
            self.accelerator.prepare(
                self.model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
            )

        training_losses = []
        mean_f1_scores = []
        progress_bar = tqdm(range(num_training_steps))

        for epoch in range(self.config.NUM_EPOCHS):
            self.model.train()
            epoch_loss = 0

            for step, batch in enumerate(train_dataloader):
                outputs = self.model(**batch)
                loss = outputs.loss
                self.accelerator.backward(loss)

                epoch_loss += loss.detach().float()
                training_losses.append(loss.detach().float().item())

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)

            self._save_checkpoint(epoch)

            # Evaluation on the validation split
            self.model.eval()
            eval_predictions = []
            eval_labels = []

            for batch in eval_dataloader:
                with torch.no_grad():
                    outputs = self.model(**batch)
                predictions = outputs.logits.argmax(dim=-1)
                # Drop samples duplicated for multi-process padding so they
                # are not counted twice in the F1 scores.
                predictions, references = self.accelerator.gather_for_metrics(
                    (predictions, batch["labels"])
                )
                eval_predictions.extend(predictions.cpu().numpy())
                eval_labels.extend(references.cpu().numpy())

            metrics = ModelUtils.calculate_metrics(eval_predictions, eval_labels)
            mean_f1_scores.append(metrics['mean_f1'])

            avg_loss = epoch_loss / len(train_dataloader)
            print(f"Epoch {epoch+1}:")
            print(f"  Average loss = {avg_loss:.4f}")
            print(f"  Mean F1 Score = {metrics['mean_f1']:.4f}")
            print("  F1 Score per class:")
            for i, f1 in enumerate(metrics['f1_per_class']):
                print(f"    Class {i}: {f1:.4f}")

        metrics = {
            'training_losses': training_losses,
            'mean_f1_scores': mean_f1_scores,
        }

        return self.model, self.tokenizer, metrics

    def _build_test_dataloader(self, test_df):
        """Tokenize ``test_df['text']`` and wrap it in an unshuffled dataloader."""
        def tokenize_function(examples):
            return self.tokenizer(
                examples['text'],
                padding='max_length',
                truncation=True,
                max_length=self.config.MAX_LENGTH
            )

        # Built directly rather than via ModelUtils.prepare_data_for_model,
        # which also expects a training frame with text/emotion columns.
        test_dataset = Dataset.from_dict({'text': test_df['text'].tolist()})
        test_dataset = test_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=['text'],
            desc="Tokenizing test dataset"
        )

        return DataLoader(
            test_dataset,
            batch_size=self.config.BATCH_SIZE,
            collate_fn=DataCollatorWithPadding(self.tokenizer)
        )

    def predict(self, test_df: pd.DataFrame) -> pd.DataFrame:
        """Predict an emotion for every row of *test_df*.

        Args:
            test_df: Frame with ``tweet_id`` and ``text`` columns.

        Returns:
            DataFrame with ``tweet_id``, ``predicted_emotion`` (label name),
            ``confidence`` (highest class probability) and one
            ``prob_<emotion>`` column per class.
        """
        test_dataloader = self.accelerator.prepare(
            self._build_test_dataloader(test_df)
        )
        all_predictions, all_probabilities = self._make_predictions(test_dataloader)

        # Convert numerical predictions back to emotion labels
        inverse_emotion_mapping = {v: k for k, v in self.config.EMOTION_MAPPING.items()}
        emotion_predictions = [inverse_emotion_mapping[pred] for pred in all_predictions]

        predictions_df = pd.DataFrame({
            'tweet_id': test_df['tweet_id'],
            'predicted_emotion': emotion_predictions,
            'confidence': [max(probs) for probs in all_probabilities]
        })

        # Add probability columns for each emotion
        for emotion, idx in self.config.EMOTION_MAPPING.items():
            predictions_df[f'prob_{emotion}'] = [probs[idx] for probs in all_probabilities]

        return predictions_df

In [None]:
import os
import logging
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed

def main(data_dir: str = '/content/drive/MyDrive/dm-2024'):
    """Run the full pipeline: load data, train, predict and save predictions.

    Args:
        data_dir: Directory containing ``tweets_DM.json``, ``emotion.csv`` and
            ``data_identification.csv``.

    Raises:
        RuntimeError: If any of the three input files could not be loaded.
    """
    # Initialize configuration and accelerator
    config = Config()
    accelerator = Accelerator()
    set_seed(config.SEED)
    os.makedirs(config.OUTPUT_DIR, exist_ok=True)

    # Read raw data
    tweets = DataProcessor.read_json_file(os.path.join(data_dir, 'tweets_DM.json'))
    emotions = DataProcessor.read_csv_file(os.path.join(data_dir, 'emotion.csv'))
    identification = DataProcessor.read_csv_file(
        os.path.join(data_dir, 'data_identification.csv')
    )

    # The readers report failures by returning None; stop here with a clear
    # message instead of failing later inside an unrelated merge.
    loaded = {
        'tweets_DM.json': tweets,
        'emotion.csv': emotions,
        'data_identification.csv': identification,
    }
    missing = [name for name, content in loaded.items() if content is None]
    if missing:
        raise RuntimeError(
            f"Could not load required input file(s) from '{data_dir}': {', '.join(missing)}"
        )

    # Attach the train/test identification to every tweet
    tweets_df = pd.DataFrame(tweets)
    merged_df = pd.merge(tweets_df, identification, on='tweet_id', how='inner')

    # Split data
    train_df = merged_df[merged_df['identification'] == 'train']
    test_df = merged_df[merged_df['identification'] == 'test']

    # Only training tweets have emotion labels
    train_df = pd.merge(train_df, emotions, on='tweet_id', how='inner')

    # Prepare final datasets
    final_train = DataProcessor.prepare_dataset(train_df, is_training=True)
    final_test = DataProcessor.prepare_dataset(test_df, is_training=False)

    # Train model
    trainer = EmotionClassificationTrainer(config, accelerator)
    train_dataloader, eval_dataloader = trainer.prepare_datasets(final_train)
    trainer.train(train_dataloader, eval_dataloader)

    # Make predictions
    predictions_df = trainer.predict(final_test)
    predictions_df.to_csv(os.path.join(config.OUTPUT_DIR, 'predictions.csv'), index=False)

if __name__ == "__main__":
    main()