In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


# Importing Libraries

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

from transformers import BertModel
import pandas as pd


# Data Loading

In [3]:
# Read the ';'-separated text file, naming the two columns explicitly,
# then show the resulting frame.
df = pd.read_csv(
    'test_nltk.txt',
    sep=';',
    names=['Text', 'Label'],
)
df

Unnamed: 0,Text,Label
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness
...,...,...
1995,i just keep feeling like someone is being unki...,anger
1996,im feeling a little cranky negative after this...,anger
1997,i feel that i am useful to my people and that ...,joy
1998,im feeling more comfortable with derby i feel ...,joy


In [4]:
from sklearn.preprocessing import LabelEncoder

# Swap the string labels for integer class ids; the 'Label' column is
# overwritten with the encoded values.
encoder = LabelEncoder()
df['Label'] = encoder.fit_transform(df['Label'])

In [5]:
# Display the frame again: 'Label' now holds the integer ids written by the encoder.
df

Unnamed: 0,Text,Label
0,im feeling rather rotten so im not very ambiti...,4
1,im updating my blog because i feel shitty,4
2,i never make her separate from me because i do...,4
3,i left with my bouquet of red and yellow tulip...,2
4,i was feeling a little vain when i did this one,4
...,...,...
1995,i just keep feeling like someone is being unki...,0
1996,im feeling a little cranky negative after this...,0
1997,i feel that i am useful to my people and that ...,2
1998,im feeling more comfortable with derby i feel ...,2


In [6]:
import torch
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split

# Splitting Data

In [7]:
# Hold out 20% of the rows as a validation frame; the fixed random_state
# makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Preprocessing data

In [8]:
class preprocess(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Text' column and a 'Label' column whose values can be
        converted with ``int()``.
    tokenizer : callable
        Called as ``tokenizer(text, max_length=..., padding='max_length',
        truncation=True, return_tensors='pt')``; its result is indexed with
        'input_ids' and 'attention_mask'.
    max_length : int
        Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tokenized text and label tensor for positional row ``idx``.

        Returns a dict with keys 'input_ids', 'attention_mask' and 'label'.
        """
        # Fetch the row once instead of once per column.
        row = self.df.iloc[idx]
        text = str(row['Text'])
        label = int(row['Label'])

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Drop only the leading axis added by return_tensors='pt'. A bare
        # squeeze() would also collapse the sequence axis when max_length == 1,
        # yielding 0-d tensors that cannot be batched with the attention mask
        # semantics the model expects.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [9]:
# Number of target classes used when sizing the classification layer.
num_classes = 6

# Pretrained uncased BERT tokenizer, plus the bare BertModel encoder
# (no classification layer attached).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [15]:
train_dataset[1]['label']

tensor(4)

In [11]:
max_length = 50
train_dataset = preprocess(train_df, tokenizer, max_length)
label_dataset = preprocess(val_df, tokenizer, max_length)

# DataLoaders

In [36]:
# Mini-batch size shared by both loaders.
batch_size = 16

# Only the training loader shuffles; the second loader keeps dataset order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
label_dataloader = DataLoader(
    label_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [37]:
from torch import nn


# Training Function

In [49]:
def train(model, data_loader, optimizer, scheduler):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids=..., attention_mask=...)``; its output is
        passed to cross-entropy loss together with the batch labels.
    data_loader : iterable
        Yields dict batches with keys 'input_ids', 'attention_mask', 'label'.
    optimizer, scheduler
        Both are stepped once per batch, in that order.

    Returns
    -------
    float
        Mean loss over the batches seen, or NaN if ``data_loader`` was empty.
    """
    model.train()
    # Build the criterion once; it is stateless, so recreating it per batch
    # was loop-invariant work.
    criterion = nn.CrossEntropyLoss()

    running_loss = 0.0
    batch_count = 0
    for batch in data_loader:
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        loss = criterion(outputs, batch['label'])
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        batch_count += 1

    # Report the epoch's mean loss so callers can monitor training; callers
    # that ignore the return value are unaffected.
    return running_loss / batch_count if batch_count else float('nan')

In [50]:
# Training hyperparameters.
learning_rate = 2e-5   # optimizer step size
num_epochs = 4         # used to size the LR schedule
batch_size = 16        # examples per mini-batch
max_length = 50        # tokenizer padding/truncation length
num_classes = 6        # output size of the classification layer

In [40]:
from transformers import get_linear_schedule_with_warmup

In [51]:
# NOTE(review): read top-to-bottom, `model` here is still the bare BertModel
# loaded earlier, and this optimizer/scheduler pair is rebuilt after MyModel is
# created further down — this cell looks redundant; confirm before removing.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# train() steps the scheduler once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [52]:
class MyModel(nn.Module):
    """Pretrained BERT encoder followed by one linear classification layer."""

    def __init__(self, pretrained_model_name, num_classes):
        """Load the named pretrained encoder and size the linear layer to it."""
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the first position of the encoder output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Keep only sequence position 0 of the final hidden states.
        first_position = encoded.last_hidden_state[:, 0, :]
        return self.fc(first_position)


In [53]:
# Output size handed to MyModel in the next cell.
num_classes = 6

In [54]:
# Rebind `model` to the classifier (encoder + linear layer).
model = MyModel(
    pretrained_model_name='bert-base-uncased',
    num_classes=num_classes,
)

In [55]:
from transformers import get_linear_schedule_with_warmup

# Optimizer over the classifier's parameters.
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Schedule with no warmup, sized for every batch of every epoch.
total_steps = num_epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [56]:
# The scheduler was sized for len(train_dataloader) * num_epochs steps, so run
# that many passes; a single call to train() covers only one pass and would
# stop partway through the learning-rate schedule.
for epoch in range(num_epochs):
    print(f"epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler)

# Predict function

In [75]:
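# Superseded draft, kept for reference: it created a new, untrained nn.Linear
# classification layer on every call. Use predict_new below instead.
# def predict(text):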
#     encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
   
#     input_ids = encoding['input_ids']
#     attention_mask = encoding['attention_mask']

#     predict= model(input_ids, attention_mask)
#     last_hidden_states = predict.last_hidden_state
#     cls_embeddings = last_hidden_states[:,0 ,:]
#     classification_layer = torch.nn.Linear(model.config.hidden_size, num_classes)
#     logits = classification_layer(cls_embeddings)
#     probabilities = torch.nn.functional.softmax(logits, dim=-1)
#     predicted_class = torch.argmax(probabilities, dim=-1)
#     print(predicted_class)

In [66]:
def predict_new(text):
    """Return the predicted class index for ``text`` using the global ``model``.

    The text is encoded with the global ``tokenizer`` (padded/truncated to the
    global ``max_length``) and passed through ``model``; the index of the
    largest output along dim 1 is returned.

    Parameters
    ----------
    text
        Input accepted by ``tokenizer`` (a single string in this notebook).

    Returns
    -------
    torch.Tensor
        Index of the highest-scoring class for each encoded row.
    """
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)

    # train() leaves the model in training mode, where dropout is active and
    # makes predictions vary from call to call. Switch to eval mode for the
    # forward pass, then restore the previous mode so calling this function
    # does not change the model's state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(
                input_ids=encoding['input_ids'],
                attention_mask=encoding['attention_mask'],
            )
    finally:
        if was_training:
            model.train()

    return torch.argmax(logits, dim=1)
    

# Prediction

In [67]:
# Run the classifier on a single sentence and display the returned class-index tensor.
test_text = "im updating my blog because i feel shitty"
sentiment = predict_new(test_text)
sentiment

tensor([4])

In [74]:
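# Label names and their encoded ids (parallel lists):
#['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']
#array([4, 0, 3, 5, 1, 2])
# i.e. anger=0, fear=1, joy=2, love=3, sadness=4, surprise=5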