Example code for Task 5: Model Iterations.
This script demonstrates how to:
1. Load a CSV file containing NLP features (from Task 4).
2. Parse columns such as TF-IDF, sentiment scores, custom embeddings, etc.
3. Encode the target emotion labels.
4. Train and evaluate:
   - Logistic Regression
   - Naive Bayes
   - LSTM (Keras/TensorFlow)
   - RNN (Keras/TensorFlow)
   - Two Transformer models (via Hugging Face)

You should adapt paths, filenames, column names, and any language-specific libraries
to match your actual data and environment.

Ensure you record each model's details (features used, hyperparameters, metrics)
in your model iteration file (required by Task 5).

---
## INSTALL REQUIRED LIBRARIES

In [1]:
# !pip install scikit-learn numpy pandas tensorflow torch transformers sentencepiece gensim datasets

import numpy as np
import pandas as pd

# Sklearn for classic models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.callbacks import EarlyStopping
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from gensim.models import Word2Vec
import nltk
from tqdm import tqdm

# TensorFlow/Keras for LSTM, RNN
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Dropout, Input, Add, Bidirectional

# Hugging Face Transformers
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
from transformers import DataCollatorWithPadding
import transformers
import keras

NUM_TEST_SAMPLES = 1044
# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

2025-04-09 13:02:32.785649: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-09 13:02:32.824282: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-09 13:02:32.824315: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-09 13:02:32.825796: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-09 13:02:32.832447: I tensorflow/core/platform/cpu_feature_guar

In [5]:
from transformers import Trainer, TrainingArguments, RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import Dataset, DatasetDict
from transformers import EarlyStoppingCallback
import evaluate

df = pd.read_parquet('group_combined_no_features.parquet')
print(f"DataFrame shape after dropping NaN values: {df.shape}")
print(f"DataFrame columns: {df.columns.tolist()}")

label_encoder = LabelEncoder()
df["emotion_encoded"] = label_encoder.fit_transform(df["general_emotion"])
classes_ = label_encoder.classes_
num_classes = len(classes_)
print("Emotion classes:", classes_)

dataframe_4_bert = df[['text', 'emotion_encoded']]

test_df = dataframe_4_bert[-NUM_TEST_SAMPLES:].reset_index(drop=True)

print("Test shape:", test_df.shape)

# Convert DataFrame to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_df)

# Combine into DatasetDict
dataset = DatasetDict({
    'test': test_dataset
})

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("distilbert/distilroberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize dataset with progress bar (using tqdm)
tokenized_datasets = dataset.map(tokenize_function, batched=True, desc="Tokenizing")

# Convert dataset format
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask"])

DataFrame shape after dropping NaN values: (87267, 3)
DataFrame columns: ['ru_text', 'text', 'general_emotion']
Emotion classes: ['anger' 'disgust' 'fear' 'happiness' 'neutral' 'sadness' 'surprise']
Test shape: (1044, 2)


Tokenizing:   0%|          | 0/1044 [00:00<?, ? examples/s]

In [6]:
len(tokenized_datasets['test'][12]['input_ids'])

512

In [7]:
# Load your fine-tuned model
model = RobertaForSequenceClassification.from_pretrained("./distilroberta_finetuned_v2")

# Use the Trainer for prediction
trainer = Trainer(model=model)
trainer.args.report_to = "none"  # Disable logging to avoid unnecessary output

# Predict
predictions = trainer.predict(tokenized_datasets["test"])

In [8]:
# Classification report
from sklearn.metrics import classification_report
import numpy as np

# Get predictions
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids
print(classification_report(true_labels, preds, target_names=classes_))

# Confsion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = confusion_matrix(true_labels, preds)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=classes_, yticklabels=classes_)

ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [emotion_encoded, text]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

## STEP 12: COLLECT AND PRINT FINAL RESULTS

In [None]:
print("\n=== FINAL SUMMARY ===")
print(f"Logistic Regression F1: {f1_lr:.4f}")
print(f"Naive Bayes F1: {f1_nb:.4f}")
print(f"LSTM F1: {f1_lstm:.4f}")
print(f"RNN F1: {f1_rnn:.4f}")
print(f"BERT F1: {eval_results_bert['eval_f1_macro']:.4f}")
print(f"DistilBERT F1: {eval_results_distil['eval_f1_macro']:.4f}")

"""
NOTE: 
1. For each model/iteration, record details (features used, hyperparameters, F1-score, comments) 
   in your model iteration file.
2. Feel free to adjust hyperparameters (batch size, number of epochs, etc.) to improve results.
3. You may also incorporate more features (POS tags, Pretrained_Embeddings, etc.) 
   by modifying the make_feature_vector() function or building advanced architectures.
4. This example focuses on demonstrating the core steps; you must tailor it to your specific project setup.
"""