<a href="https://colab.research.google.com/github/24prady02/Prady-repository/blob/main/Emotion_Cause_Pedict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m96.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
import nltk

# Data packages used by the preprocessing step: tokenizer models, the English
# stopword list and the WordNet corpus for lemmatization.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on newer NLTK
# releases; on older releases the extra download is simply unused.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from google.colab import files
import pandas as pd

# Step 1: Preprocess the text
# Lazily-built (stopword set, lemmatizer) pair shared by every preprocess_text call.
_PREPROCESS_RESOURCES = None


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    global _PREPROCESS_RESOURCES
    if _PREPROCESS_RESOURCES is None:
        _PREPROCESS_RESOURCES = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Normalize one text: lowercase, strip punctuation, drop stopwords, lemmatize.

    Args:
        text: The raw input string.

    Returns:
        A single string of the remaining lemmatized tokens joined by spaces.
    """
    # The stopword list and the lemmatizer do not depend on `text`, so they are
    # built once and reused instead of being recreated for every row.
    stop_words, lemmatizer = _get_preprocess_resources()

    text = text.lower()
    # Remove every character that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    lemmatized_tokens = [
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    ]
    return ' '.join(lemmatized_tokens)

# Step 2: Emotion-cause pair modeling
def train_emotion_cause_model(preprocessed_texts, emotions, epochs=3, batch_size=16, lr=1e-5):
    """Fine-tune ``bert-base-uncased`` as an emotion classifier and report accuracy.

    The data is split 80/20 (``random_state=42``) into training and validation
    sets; the model is trained on the former and scored on the latter.

    Args:
        preprocessed_texts: List of input strings, one per example.
        emotions: List of label strings aligned with ``preprocessed_texts``; each
            must be one of 'anger', 'fear', 'joy', 'sadness', 'sentiment'.
        epochs: Number of passes over the training split.
        batch_size: Batch size for both training and validation loaders.
        lr: Learning rate passed to the optimizer.

    Returns:
        ``(model, accuracy)``: the fine-tuned model (left in eval mode) and the
        fraction of validation examples classified correctly.

    Raises:
        ValueError: If ``emotions`` contains a label outside the known set.
    """
    # NOTE(review): 'sentiment' is not an emotion; it may exist only to absorb a
    # CSV header row read with header=None -- confirm against the dataset.
    emotion_labels = ['anger', 'fear', 'joy', 'sadness', 'sentiment']
    label_to_id = {label: idx for idx, label in enumerate(emotion_labels)}

    unknown = sorted({str(emotion) for emotion in emotions if emotion not in label_to_id})
    if unknown:
        raise ValueError(
            f"Unknown emotion label(s) {unknown}; expected one of {emotion_labels}"
        )
    all_labels = [label_to_id[emotion] for emotion in emotions]

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        preprocessed_texts, all_labels, test_size=0.2, random_state=42
    )

    # Use the GPU when one is present, otherwise fall back to the CPU instead of
    # failing on an unconditional .cuda() call.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def _build_loader(texts, labels, shuffle):
        """Tokenize `texts` and wrap ids, masks and labels in a DataLoader."""
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        # Tensors stay on the host; each batch is moved to `device` when it is used.
        dataset = torch.utils.data.TensorDataset(
            torch.tensor(encodings['input_ids']),
            torch.tensor(encodings['attention_mask']),
            torch.tensor(labels),
        )
        return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    train_loader = _build_loader(train_texts, train_labels, shuffle=True)
    val_loader = _build_loader(val_texts, val_labels, shuffle=False)

    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(emotion_labels)
    ).to(device)

    # AdamW is the name imported from transformers at the top of the file.
    optimizer = AdamW(model.parameters(), lr=lr)

    model.train()
    for _ in range(epochs):
        for input_ids, attention_mask, labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            # Passing `labels` makes the model compute the classification loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()

    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predicted = outputs.logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return model, accuracy

# Step 3: Extract emotion-cause pairs
def extract_emotion_cause_pairs(new_texts, model, tokenizer):
    """Predict an emotion label for each text using a trained classifier.

    Despite the name, only the predicted emotion labels are returned; no cause
    spans are extracted.

    Args:
        new_texts: A list of strings to classify. A single string is accepted
            and treated as a one-element list.
        model: The classifier; called as ``model(input_ids, attention_mask=...)``
            and expected to return an object with a ``logits`` attribute.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries for the given texts.

    Returns:
        A list with one label per input text, drawn from
        'anger', 'fear', 'joy', 'sadness', 'sentiment'. Empty input gives ``[]``.
    """
    emotion_labels = ['anger', 'fear', 'joy', 'sadness', 'sentiment']

    if isinstance(new_texts, str):
        new_texts = [new_texts]
    if len(new_texts) == 0:
        return []

    # Run on whatever device the model already lives on, rather than assuming a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=512)
    new_input_ids = torch.tensor(new_encodings['input_ids']).to(device)
    new_attention_mask = torch.tensor(new_encodings['attention_mask']).to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(new_input_ids, attention_mask=new_attention_mask)
        predicted = outputs.logits.argmax(dim=1)

    # .tolist() yields plain ints, so the label list is indexed without tensors.
    predicted_emotions = [emotion_labels[index] for index in predicted.tolist()]
    return predicted_emotions

# Step 4: Usage example
# Colab-only: prompts for a file upload; the first uploaded file is used as the dataset.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; a CSV dataset is required.")
filename = next(iter(uploaded))

# Columns are addressed by position: 1 holds the emotion label, 2 holds the text.
# NOTE(review): with header=None a header row, if the CSV has one, is read as a
# data row -- confirm whether the first row should be skipped.
data = pd.read_csv(filename, header=None)

# Rows missing a label or a text cannot be trained on (they would fail in the
# label lookup or in preprocess_text), so they are left out here.
labelled = data.dropna(subset=[1, 2])
emotions = labelled[1].tolist()
# Cast to str so that cells pandas parsed as numbers can still be preprocessed.
texts = labelled[2].astype(str).tolist()

preprocessed_texts = [preprocess_text(text) for text in texts]
model, accuracy = train_emotion_cause_model(preprocessed_texts, emotions)
print("Model Accuracy:", accuracy)

new_texts = ["I am unhappy because my dog died"]
# The training function does not return its tokenizer, so the same pretrained
# tokenizer is loaded again for inference.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
predicted_emotions = extract_emotion_cause_pairs(new_texts, model, tokenizer)
print("Predicted Emotions:", predicted_emotions)


Saving eng_dataset.csv to eng_dataset (5).csv


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Model Accuracy: 0.8543279380717804
Predicted Emotions: ['sadness']
