In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Twitter Tweets Sentiment Analysis for Natural Language Processing

In [None]:
# import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Download required nltk data (run once)
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# import the data
tweet = pd.read_csv('/content/drive/MyDrive/Tweets.csv', encoding = 'latin')
tweet.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


# Data Exploration, Feature Extraction & Splitting

In [None]:
# remove duplicates and missing rows
tweet.dropna(inplace = True)

In [None]:
def clean_text(text):
    stop_words = set(stopwords.words('english'))
    text_lower = str(text).lower()  # Convert to lowercase
    # tokens = word_tokenize(text_lower)  # Tokenize
    # filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]  # Remove stopwords
    return text_lower

def process_text_column(df, column_name):
    df['cleaned_text'] = df[column_name].apply(clean_text)
    return df

In [None]:
tweet = process_text_column(tweet, 'selected_text')
tweet.head()

Unnamed: 0,textID,text,selected_text,sentiment,cleaned_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"i`d have responded, if i were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"sons of ****,"


In [None]:
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(tweet['cleaned_text'].astype(str))
y = tweet['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

# Build & Train a Sentiment Analysis Model

In [None]:
# logistic regression
lr = LogisticRegression()

lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [None]:
lr_pred

array(['positive', 'neutral', 'neutral', ..., 'neutral', 'neutral',
       'neutral'], dtype=object)

In [None]:
classification_report(y_test, lr_pred)

'              precision    recall  f1-score   support\n\n    negative       0.80      0.79      0.80      1572\n     neutral       0.79      0.87      0.83      2236\n    positive       0.91      0.80      0.85      1688\n\n    accuracy                           0.83      5496\n   macro avg       0.84      0.82      0.83      5496\nweighted avg       0.83      0.83      0.83      5496\n'

In [None]:
# support vector machine
svm = SVC()

svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)

In [None]:
svm_pred

array(['positive', 'neutral', 'neutral', ..., 'neutral', 'negative',
       'neutral'], dtype=object)

In [None]:
classification_report(y_test, svm_pred)

'              precision    recall  f1-score   support\n\n    negative       0.84      0.78      0.81      1572\n     neutral       0.78      0.92      0.85      2236\n    positive       0.95      0.79      0.86      1688\n\n    accuracy                           0.84      5496\n   macro avg       0.86      0.83      0.84      5496\nweighted avg       0.85      0.84      0.84      5496\n'

In [None]:
accuracy_score(y_test, svm_pred)

0.8400655021834061

# Model Training with Transformer Based Model (BERT & RoBERTa)

In [None]:
# install transformers library
!pip install transformers



In [None]:
from transformers import BertTokenizer
import torch
from datasets import Dataset, load_dataset
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Data Preprocessing

In [None]:
# create a dictionary and map sentiments
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
tweet['label'] = tweet['sentiment'].map(label_mapping)

In [None]:
# split the dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tweet['selected_text'], tweet['label'], test_size=0.2, stratify=tweet['label'], random_state=42)

In [None]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)  # 3 labels: negative, neutral, positive

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the dataset
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} | {'labels': torch.tensor(self.labels[idx])}

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, list(train_labels))
val_dataset = SentimentDataset(val_encodings, list(val_labels))

In [None]:
training_args = TrainingArguments(
    output_dir='./results-twitter',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

In [None]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33majanakubenedict28[0m ([33majanakubenedict28-data-science-nigeria[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.2573,0.346712
2,0.2157,0.344812
3,0.1419,0.423383


TrainOutput(global_step=4122, training_loss=0.24224114484546372, metrics={'train_runtime': 2065.2711, 'train_samples_per_second': 31.934, 'train_steps_per_second': 1.996, 'total_flos': 3728152683319680.0, 'train_loss': 0.24224114484546372, 'epoch': 3.0})

In [None]:
save_path = '/content/drive/MyDrive/Tweets Sentiment Analysis/' # save directory path
model.save_pretrained(save_path) # save model and tokenizer
tokenizer.save_pretrained(save_path)

('/content/drive/MyDrive/Tweets Sentiment Analysis/tokenizer_config.json',
 '/content/drive/MyDrive/Tweets Sentiment Analysis/special_tokens_map.json',
 '/content/drive/MyDrive/Tweets Sentiment Analysis/vocab.txt',
 '/content/drive/MyDrive/Tweets Sentiment Analysis/added_tokens.json')

In [None]:
evaluate = trainer.evaluate() # evaluate the model
print(evaluate)

{'eval_loss': 0.42338311672210693, 'eval_runtime': 25.2834, 'eval_samples_per_second': 217.376, 'eval_steps_per_second': 13.606, 'epoch': 3.0}
