In [1]:
!pip install -U transformers accelerate --quiet


In [2]:
import os
from pathlib import Path

import pandas as pd

# Directory holding the two CSV files. Defaults to the original Colab Drive
# location; set the TWITTER_DATA_DIR environment variable to run elsewhere.
DATA_DIR = Path(os.environ.get('TWITTER_DATA_DIR', '/content/drive/MyDrive/archive'))
if not DATA_DIR.is_dir():
    # Fail early with an actionable message instead of a bare pandas error.
    raise FileNotFoundError(
        f"Dataset directory not found: {DATA_DIR}. "
        "Mount Google Drive or set TWITTER_DATA_DIR to the folder containing the CSV files."
    )

# Load datasets (header=None: the first row is read as data, so the columns
# come back numbered rather than named)
train_df = pd.read_csv(DATA_DIR / 'twitter_training.csv', header=None)
val_df = pd.read_csv(DATA_DIR / 'twitter_validation.csv', header=None)

# Display sample rows
print(train_df.head())
print(val_df.head())

      0            1         2  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                   3  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
      0          1           2  \
0  3364   Facebook  Irrelevant   
1   352     Amazon     Neutral   
2  8312  Microsoft    Negative   
3  4371      CS-GO    Negative   
4  4433     Google     Neutral   

                                                   3  
0  I mentioned on Facebook that I was struggling ...  
1  BBC News - Amazon boss Jeff Bezos rejects clai...  
2  @Microsoft Why do I pay for WORD when it funct...  
3  CSGO matchmaking is so full of closet

In [3]:
# Give the numbered columns of both splits the same descriptive names.
column_names = ['id', 'entity', 'sentiment', 'text']
for frame in (train_df, val_df):
    frame.columns = column_names

In [4]:
pip install nltk



In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download the NLTK resources used by the text-cleaning step.
for resource in ('stopwords', 'punkt_tab', 'wordnet'):
    nltk.download(resource)


# Initialize tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw tweet into a space-joined string of lemmatized tokens.

    Non-string input (for example a missing value) yields an empty string.
    Otherwise URLs and @mentions are removed, every character that is not an
    ASCII letter or whitespace is dropped, the text is lower-cased and
    tokenized, English stop words are discarded and the remaining tokens are
    lemmatized.
    """
    if not isinstance(text, str):
        return ""

    # Order matters: URLs and mentions go first, while their punctuation is
    # still present for the patterns to match.
    stripped = text
    for pattern in (r"http\S+", r"@\w+", r"[^A-Za-z\s]"):
        stripped = re.sub(pattern, "", stripped)
    normalized = stripped.lower().strip()

    kept = []
    for token in nltk.word_tokenize(normalized):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)

# Build the cleaned text column for both splits.
# NOTE(review): clean_text returns "" for non-string input and those rows are
# kept, so rows with an empty clean_text can reach the model; consider
# filtering them out.
train_df['clean_text'] = train_df['text'].apply(clean_text)
val_df['clean_text'] = val_df['text'].apply(clean_text)

# Normalize sentiment values
train_df['sentiment'] = train_df['sentiment'].str.lower().str.strip()
val_df['sentiment'] = val_df['sentiment'].str.lower().str.strip()

# Map to numeric labels (sentiments missing from the map become NaN)
label_map = {'positive': 0, 'negative': 1, 'neutral': 2, 'irrelevant': 3}
train_df['label'] = train_df['sentiment'].map(label_map)
val_df['label'] = val_df['sentiment'].map(label_map)

# Drop invalid rows, then restore an integer dtype: if any value was unmapped,
# the NaN forces the whole column to float and dropna alone does not undo that.
# Class ids must stay integers for the classifier's loss.
train_df = train_df.dropna(subset=['label']).astype({'label': 'int64'})
val_df = val_df.dropna(subset=['label']).astype({'label': 'int64'})


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
from transformers import BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Each split is tokenized in one call, so padding=True pads every row of that
# split to the split's longest sequence; truncation=True caps the length at the
# model maximum.
train_encodings = tokenizer(list(train_df['clean_text']), truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(list(val_df['clean_text']), truncation=True, padding=True, return_tensors='pt')

# Force an integer (long) dtype instead of inheriting whatever dtype the pandas
# column has: BertForSequenceClassification only selects the single-label
# cross-entropy loss when the labels are integer tensors.
train_labels = torch.tensor(train_df['label'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['label'].values, dtype=torch.long)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
from torch.utils.data import Dataset

class TwitterDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Indexing returns a dict holding the ``idx``-th entry of every field in
    ``encodings`` plus the matching label under the key ``'labels'``.
    """

    def __init__(self, encodings, labels):
        # Both objects are stored as given; nothing is copied or converted.
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The dataset has one sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = dict((field, values[idx]) for field, values in self.encodings.items())
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a dataset object.
train_dataset = TwitterDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TwitterDataset(encodings=val_encodings, labels=val_labels)

In [9]:
import transformers
# Record which transformers release this notebook ran against; TrainingArguments
# argument names (e.g. eval_strategy vs. evaluation_strategy) differ between releases.
print(transformers.__version__)


4.54.1


In [11]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Pretrained BERT encoder with a newly initialized 4-way classification head
# (one output per entry in label_map).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # Changed from 'evaluation_strategy'
    save_strategy="epoch",  # same cadence as evaluation, as load_best_model_at_end requires
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    # Disable third-party experiment trackers: the default reports to every
    # installed integration, which makes wandb prompt for an interactive login
    # and stops the notebook from running unattended.
    report_to="none",
)

# No compute_metrics is supplied, so the per-epoch evaluation reports the
# validation loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marjunmenon21102003[0m ([33marjunmenon21102003-go[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.7761,0.281535


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.7761,0.281535
2,0.2806,0.152226


TrainOutput(global_step=9336, training_loss=0.5790159923200844, metrics={'train_runtime': 4461.4985, 'train_samples_per_second': 33.478, 'train_steps_per_second': 2.093, 'total_flos': 1.2741805103107392e+16, 'train_loss': 0.5790159923200844, 'epoch': 2.0})

In [12]:
from sklearn.metrics import classification_report

# Run the model over the validation set and take the highest-scoring class per row.
preds_output = trainer.predict(val_dataset)
preds = torch.argmax(torch.tensor(preds_output.predictions), dim=1)

# Pair every class id with its name explicitly instead of relying on the
# insertion order of label_map, and pass labels= so the report still works
# (and stays aligned) when a class is absent from the labels or predictions.
class_ids = sorted(label_map.values())
id_to_name = {class_id: name for name, class_id in label_map.items()}

print(classification_report(
    val_labels.numpy(),
    preds.numpy(),
    labels=class_ids,
    target_names=[id_to_name[class_id] for class_id in class_ids],
))


  return forward_call(*args, **kwargs)


              precision    recall  f1-score   support

    positive       0.95      0.96      0.96       277
    negative       0.97      0.98      0.97       266
     neutral       0.96      0.95      0.96       285
  irrelevant       0.95      0.94      0.95       172

    accuracy                           0.96      1000
   macro avg       0.96      0.96      0.96      1000
weighted avg       0.96      0.96      0.96      1000



In [20]:
def predict_sentiment(text):
    """Predict the sentiment label name for a single piece of raw text.

    The text goes through the same clean_text preprocessing as the training
    data, is tokenized, moved to the model's device and classified.

    Args:
        text: Raw input text.

    Returns:
        The key of label_map whose id equals the predicted class index.

    Side effects:
        Switches the global model to evaluation mode.
    """
    text = clean_text(text)
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    tokens = {k: v.to(model.device) for k, v in tokens.items()}

    # Make sure dropout is disabled; otherwise predictions are stochastic
    # whenever the model was last left in training mode.
    model.eval()
    with torch.no_grad():
        output = model(**tokens)
    pred = torch.argmax(output.logits, dim=1).item()

    # Look the name up by class id rather than by position in the dict.
    id_to_label = {class_id: name for name, class_id in label_map.items()}
    return id_to_label[pred]

# Example: classify a handful of hand-written sentences, one prediction per line.
sample_texts = [
    "I absolutely love this product!",  # ➜ positive
    "This is the worst ever.",          # ➜ negative
    "I am coming to the borders and I will kill you all",
    "i am sad",
    "i am angry",
    "nice shirt",
    "bbsusuwu",
]
for sample in sample_texts:
    print(predict_sentiment(sample))


positive
negative
positive
negative
negative
positive
irrelevant
