In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from torch.optim.lr_scheduler import StepLR


In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig
import torch.nn as nn
model_name = "michellejieli/emotion_text_classifier"
config = AutoConfig.from_pretrained(model_name)
config.num_labels = 6

tokenizer = AutoTokenizer.from_pretrained("michellejieli/emotion_text_classifier")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True
)


In [168]:
# from transformers import BertForSequenceClassification

# # Rebuild model with new head for 6 outputs
# from transformers import BertConfig

# new_config = BertConfig.from_pretrained("michellejieli/emotion_text_classifier")
# new_config.num_labels = 6  # new number of emotion classes

# model = BertForSequenceClassification.from_pretrained(
#     "michellejieli/emotion_text_classifier",
#     config=new_config,
#     ignore_mismatched_sizes=True 
# )

In [169]:
import torch
import torchvision.models as models

print(model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [170]:
#Test how many outputs in the last layer- last layer was modified, changed from 7 to 6 outputs
import torch
# Test with a dummy input
dummy_text = "This is a test sentence."
inputs = tokenizer(dummy_text, return_tensors="pt", padding=True, truncation=True)

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

print("Logits shape:", outputs.logits.shape)  # Should be (1, 6) for a single input
print(outputs.logits)  # Logits for the 6 classes


Logits shape: torch.Size([1, 6])
tensor([[-0.4276, -0.2936, -0.5681,  0.1881, -0.3308,  0.2326]])


In [171]:
# #Cleaning text of dataset
# # Load dataset
# df_combined = pd.read_csv(
#     "final_df_Disgust.csv",
#     encoding="utf-8",
#     encoding_errors='ignore',
#     low_memory=False,
#     dtype={"label": int}
# )

# df = df_combined[['text', 'label']]
# df['text'] = df['text'].str.replace(r"[^/w/s']", "", regex=True)

In [172]:
# print(df_combined["label"].unique())

In [173]:
# from sklearn.utils import resample

# #df = pd.read_csv("final_df_Disgust.csv", encoding='latin1')  # Adjust encoding if needed

# # Define emotion mapping (ensure labels match your dataset)
# emotionMapping = {
#     0: 'anger',
#     1: 'fear',
#     2: 'happy',
#     3: 'sad',
#     4: 'Disgust',
#     5: 'neutral'
# }

# # Check current class distribution
# print("Original class distribution:")
# print(df['label'].value_counts().sort_index().rename(emotionMapping))

# # --- Step 1: Separate Classes ---
# happy = df[df['label'] == 2]  # 'happy'
# sad = df[df['label'] == 3]    # 'sad'
# anger = df[df['label'] == 0]  # 'anger'
# fear = df[df['label'] == 1]   # 'fear'
# disgust = df[df['label'] == 4]  # 'Disgust'
# neutral = df[df['label'] == 5]  # 'neutral'

# # --- Step 2: Hybrid Balancing ---

# happy_downsampled = resample(
#     happy,
#     replace=False,          # No replacement (undersampling)
#     n_samples=16971,      # Target size
#     random_state=42         # Reproducibility
# )

# sad_downsampled = resample(
#     sad,
#     replace=False,
#     n_samples=16971,
#     random_state=42
# )

# disgust_oversampled = resample(
#     disgust,
#     replace=True,           # Allow replacement (oversampling)
#     n_samples=16971,
#     random_state=42
# )

# neutral_oversampled = resample(
#     neutral,
#     replace=True,
#     n_samples=16971,
#     random_state=42
# )

# # --- Step 3: Combine Balanced Classes --
# balanced_df = pd.concat([
#     happy_downsampled,
#     sad_downsampled,
#     anger,
#     fear,
#     disgust_oversampled,
#     neutral_oversampled
# ])

# # Shuffle the dataset to avoid order bias
# balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# # --- Step 4: Verify New Distribution ---
# print("\nBalanced class distribution:")
# print(balanced_df['label'].value_counts().sort_index().rename(emotionMapping))

# # --- Step 5: Save Balanced Dataset (Optional) ---
# balanced_df.to_csv("balanced_dataset.csv", index=False)
# print("\nBalanced dataset saved to 'balanced_dataset_final_with_Disgust.csv'")

In [None]:
import pandas as pd
df = pd.read_csv("ConcData_Labelled.csv")
df = df[df['label'] != 6]
print(df['label'].value_counts())
df.shape

label
3    175621
5    121187
0     57317
2     47712
4     29359
1     28841
Name: count, dtype: int64


(460037, 2)

In [199]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

Using device: cuda


In [200]:
print(torch.cuda.get_device_name(0))
print(torch.cuda.memory_allocated())

NVIDIA GeForce RTX 3090
423734272


### Freezing unrequired layers

In [201]:
for param in model.base_model.parameters():
    param.requires_grad = False


In [None]:
def train_sentiment_analysis_model(df_combined, model_1, tokenizer_1, epochs=10):
    df_combined = df_combined.dropna(subset=["text"])
    texts = df_combined["text"].astype(str).tolist()
    encoded_data = tokenizer_1(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    labels = torch.tensor(df_combined["label"].values, dtype=torch.long)
    dataset = TensorDataset(encoded_data["input_ids"], encoded_data["attention_mask"], labels)

    train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=16, shuffle=False)
    val_loader = DataLoader(val_data, batch_size=16, shuffle=False)

    model = model_1
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0003) #changed from 5e-5 to 1e-4, and from AdamW to Adam

    # Training loop
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        train_preds, train_labels = [], []

        for batch in train_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())  # Predicted labels
            train_labels.extend(labels.cpu().numpy())  # Actual labels

        train_accuracy = accuracy_score(train_labels, train_preds)
        train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(train_labels, train_preds, average='weighted')

        # Validation
        model.eval()
        val_loss = 0
        val_preds, val_labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = batch
                input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
                val_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        val_accuracy = accuracy_score(val_labels, val_preds)
        val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='weighted')

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"   Train Loss: {train_loss / len(train_loader):.4f}, Train Acc: {train_accuracy:.4f}, Train F1: {train_f1:.4f}")
        print(f"   Val Loss: {val_loss / len(val_loader):.4f}, Val Acc: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")
        print(f"   Train Precision: {train_precision:.4f}, Train Recall: {train_recall:.4f}")
        print(f"   Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}")
        print("-" * 60)



In [203]:
# Train model with improved metrics
train_sentiment_analysis_model(df, model, tokenizer, epochs=12)

Epoch 1/12:
   Train Loss: 0.2499, Train Acc: 0.9100, Train F1: 0.9099
   Val Loss: 0.1984, Val Acc: 0.9263, Val F1: 0.9268
   Train Precision: 0.9099, Train Recall: 0.9100
   Val Precision: 0.9284, Val Recall: 0.9263
------------------------------------------------------------
Epoch 2/12:
   Train Loss: 0.2420, Train Acc: 0.9123, Train F1: 0.9123
   Val Loss: 0.1882, Val Acc: 0.9305, Val F1: 0.9307
   Train Precision: 0.9122, Train Recall: 0.9123
   Val Precision: 0.9317, Val Recall: 0.9305
------------------------------------------------------------
Epoch 3/12:
   Train Loss: 0.2381, Train Acc: 0.9131, Train F1: 0.9130
   Val Loss: 0.1849, Val Acc: 0.9307, Val F1: 0.9311
   Train Precision: 0.9129, Train Recall: 0.9131
   Val Precision: 0.9322, Val Recall: 0.9307
------------------------------------------------------------
Epoch 4/12:
   Train Loss: 0.2357, Train Acc: 0.9142, Train F1: 0.9141
   Val Loss: 0.1796, Val Acc: 0.9323, Val F1: 0.9327
   Train Precision: 0.9141, Train Recal

In [None]:
# # Save the trained model to a file
# model_save_path = "Classifier_withNoSurprise_FineTuned.pth"
# torch.save(model.state_dict(), model_save_path)
# print(f"Newest Model with correct mapping saved to {model_save_path}")

Newest Model with correct mapping saved to Classifier_withNoSurprise_FineTuned.pth


In [396]:
#To load model
model_name="michellejieli/emotion_text_classifier"
config = AutoConfig.from_pretrained(model_name)
config.num_labels = 6
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True
)
model.load_state_dict(torch.load("Classifier_withNoSurprise_FineTuned.pth"))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at michellejieli/emotion_text_classifier and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("Classifier_withNoSurprise_FineTuned.pth"))


<All keys matched successfully>

In [397]:
emotion_labels = {0:"anger", 1:"Disgust",2:"fear",3: "happy",4: "Neutral", 5: "sadness"}

def predict_emotion(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Get the corresponding emotion label
    # predicted_emotion = emotion_labels[predicted_class]
   
    return predicted_class, logits

In [407]:
TEXT = "i went to the store and bought some MiLk"

predClass, logits = predict_emotion(TEXT)

In [408]:
logits.numpy()

array([[  1.0230407 , -10.251837  ,  -0.86612785,   1.1279086 ,
          1.8718865 ,  -0.75101817]], dtype=float32)

In [None]:
emotion_labels[predClass]

'Neutral'

: 