In [3]:
# Import libraries
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from bertopic import BERTopic
from scipy.special import softmax
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoTokenizer, AutoModel

ModuleNotFoundError: No module named 'nltk'

In [None]:
# Filter the dataset to include only the topics of interest

# Load your dataset
data = pd.read_csv('tweet_with_topics_sentiment.csv')

# Define the topics of interest
topics_of_interest = [1, 2, 3, 4, 5, 6, 8, 9]

# Create a mapping from original topics to contiguous class ids (0..7)
topic_mapping = {topic: i for i, topic in enumerate(sorted(topics_of_interest))}

# Filter the dataset; .copy() gives an independent frame so the assignment
# below does not write into a slice of `data` (SettingWithCopyWarning)
filtered_data = data[data['topic'].isin(topics_of_interest)].copy()

# Apply the mapping to the 'topic' column. `.map` performs one dictionary
# lookup per value, so the overlap between old ids (keys) and new ids (values)
# can never cause a value to be remapped twice.
filtered_data['topic'] = filtered_data['topic'].map(topic_mapping)

# Save the filtered dataset to a new CSV file
filtered_data.to_csv('filtered_tweet_with_topics_sentiment.csv', index=False)


In [None]:
# Topic classification: fine-tune a RoBERTa-based classifier on the filtered tweets

In [None]:
# Reload the filtered tweets written by the previous step
data = pd.read_csv("filtered_tweet_with_topics_sentiment.csv")

# Hold out 20% of all rows as the test set
train_val_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Carve the validation set out of the remaining 80% (25% of it), which yields
# a 60/20/20 train/validation/test partition overall
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

In [None]:
# Custom dataset class for handling tweet data
class TweetDataset(Dataset):
    """Torch dataset that tokenizes the ``clean_tweet`` column on access.

    Every item is a dict holding ``input_ids`` and ``attention_mask`` tensors,
    plus ``labels`` when the source DataFrame has a ``topic`` column.

    The ``_preprocess`` helper family can clean raw tweet text, but it is not
    applied by ``__getitem__``; items are built from ``clean_tweet`` as stored.
    """

    # Lazily filled cache of NLTK's English stop words, shared by all instances
    _stop_words = None

    def __init__(self, dataframe, tokenizer, max_length=150):
        """Store the texts, optional labels and tokenizer settings.

        Args:
            dataframe: DataFrame with a ``clean_tweet`` column and, optionally,
                a ``topic`` column of integer class ids.
            tokenizer: Callable used in ``__getitem__`` to encode one text.
            max_length: Length every sequence is padded/truncated to.
        """
        self.tokenizer = tokenizer
        # Missing tweets (e.g. empty cells read back from CSV as NaN) would be
        # floats, which a tokenizer cannot encode; normalise everything to str.
        self.texts = dataframe['clean_tweet'].fillna('').astype(str).tolist()
        # Labels are optional so the dataset can also wrap unlabeled data
        self.labels = dataframe['topic'].values if 'topic' in dataframe.columns else None
        self.max_length = max_length

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the tweet at ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch dimension
            squeezed away) and, when labels exist, a ``labels`` long tensor.
        """
        text = self.texts[idx]

        # Tokenize the text, padding/truncating to a fixed length
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack items into (batch, max_length)
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

        # Include labels if available
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

    def _print_random_samples(self, texts):
        """Print five entries of ``texts`` chosen with a fixed seed (42).

        A private ``RandomState`` is used so the notebook's global NumPy seed
        is left untouched; the sampled indices match ``np.random.seed(42)``.
        """
        if len(texts) == 0:
            return  # nothing to sample from

        rng = np.random.RandomState(42)
        random_entries = rng.randint(0, len(texts), 5)

        for i in random_entries:
            print(f"Entry {i}: {texts[i]}")

    def _preprocess(self, text):
        """Run the full cleaning pipeline on one raw tweet and return the result."""
        text = self._remove_amp(text)
        text = self._remove_links(text)
        text = self._remove_hashes(text)
        text = self._remove_retweets(text)
        text = self._remove_mentions(text)
        text = self._remove_multiple_spaces(text)

        #text = self._lowercase(text)
        text = self._remove_punctuation(text)
        #text = self._remove_numbers(text)

        text_tokens = self._tokenize(text)
        text_tokens = self._stopword_filtering(text_tokens)
        #text_tokens = self._stemming(text_tokens)
        text = self._stitch_text_tokens_together(text_tokens)

        return text.strip()

    # Helper methods for preprocessing: remove special characters, links, hashtags, etc.
    def _remove_amp(self, text):
        """Replace the HTML entity ``&amp;`` with a space."""
        return text.replace("&amp;", " ")

    def _remove_mentions(self, text):
        """Replace each ``@mention`` (and the whitespace after it) with a space.

        The ``$`` alternative also catches a mention that ends the text, which
        has no trailing whitespace to anchor on.
        """
        return re.sub(r'@\S*(?:\s|$)', ' ', text)

    def _remove_multiple_spaces(self, text):
        """Collapse every run of whitespace into a single space."""
        return re.sub(r'\s+', ' ', text)

    def _remove_retweets(self, text):
        """Replace a leading ``RT`` marker (plus following whitespace) with a space."""
        return re.sub(r'^RT[\s]+', ' ', text)

    def _remove_links(self, text):
        """Replace http/https URLs with a space."""
        return re.sub(r'https?:\/\/[^\s\n\r]+', ' ', text)

    def _remove_hashes(self, text):
        """Replace ``#`` characters with a space, keeping the hashtag word."""
        return re.sub(r'#', ' ', text)

    def _stitch_text_tokens_together(self, text_tokens):
        """Join tokens back into one space-separated string."""
        return " ".join(text_tokens)

    def _tokenize(self, text):
        """Split ``text`` into word tokens with NLTK's English tokenizer."""
        return nltk.word_tokenize(text, language="english")

    def _stopword_filtering(self, text_tokens):
        """Drop tokens that are NLTK English stop words (case-sensitive match)."""
        cls = type(self)
        if cls._stop_words is None:
            # Load once and keep as a set: O(1) membership instead of scanning
            # a freshly loaded list for every token
            cls._stop_words = frozenset(nltk.corpus.stopwords.words('english'))

        return [token for token in text_tokens if token not in cls._stop_words]

    def _stemming(self, text_tokens):
        """Reduce each token to its Porter stem."""
        porter = nltk.stem.porter.PorterStemmer()
        return [porter.stem(token) for token in text_tokens]

    def _remove_numbers(self, text):
        """Replace every run of digits with a space."""
        return re.sub(r'\d+', ' ', text)

    def _lowercase(self, text):
        """Return ``text`` lower-cased."""
        return text.lower()

    def _remove_punctuation(self, text):
        """Delete all ASCII punctuation characters (``string.punctuation``)."""
        return text.translate(str.maketrans('', '', string.punctuation))


In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class TweetClassifier(nn.Module):
    """Encoder followed by a two-layer classification head.

    The head reads the encoder output at sequence position 0 and maps it
    through ``fc1 -> ReLU -> fc2`` to one raw logit per class.

    Args:
        base_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments, and element ``[0]`` of its
            output is indexed as ``[:, 0]``.
        num_classes: Number of output logits (default 8, the original setting).
        hidden_dim: Width of the intermediate layer (default 32).

    The attribute names ``bert``, ``fc1`` and ``fc2`` are kept as-is so that
    previously saved state dicts still load.
    """

    def __init__(self, base_model, num_classes=8, hidden_dim=32):
        super().__init__()
        self.bert = base_model

        # Use the encoder's declared width when it exposes one; otherwise keep
        # the original hard-coded 768
        encoder_config = getattr(base_model, "config", None)
        encoder_dim = getattr(encoder_config, "hidden_size", 768)

        self.fc1 = nn.Linear(encoder_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of encoded inputs."""
        # [0] selects the first encoder output; [:, 0] keeps position 0 of
        # every sequence in the batch
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0][:, 0]
        x = self.fc1(bert_out)
        x = self.relu(x)
        x = self.fc2(x)
        return x


In [None]:
# Ask CUDA to run kernels synchronously so errors are easier to trace while debugging
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

In [None]:
# Set up training: initialize device, loss function, optimizer, and set initial tracking metrics for early stopping.

def train(model, train_dataloader, val_dataloader, learning_rate, epochs,
          patience=3, checkpoint_path="best_model_topic.pt"):
    """Fine-tune ``model`` with Adam and cross-entropy loss, with early stopping.

    After each epoch the model is evaluated on ``val_dataloader``. Whenever the
    summed validation loss improves, the model's ``state_dict`` is written to
    ``checkpoint_path``. Training stops early after ``patience`` consecutive
    epochs without improvement.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        train_dataloader: Loader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        val_dataloader: Loader of the same form, used for validation.
        learning_rate: Adam learning rate.
        epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping (default 3).
        checkpoint_path: Where the best weights are saved
            (default ``"best_model_topic.pt"``).

    Returns:
        None. Progress is printed and the best weights are saved to disk.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    best_val_loss = float('inf')
    early_stopping_threshold_count = 0

    # Denominators for the epoch summary; max(..., 1) avoids a
    # ZeroDivisionError when a loader happens to be empty
    n_train_batches = max(len(train_dataloader), 1)
    n_train_samples = max(len(train_dataloader.dataset), 1)
    n_val_batches = max(len(val_dataloader), 1)
    n_val_samples = max(len(val_dataloader.dataset), 1)

    for epoch in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_input in tqdm(train_dataloader):
            input_ids = train_input['input_ids'].to(device)
            attention_mask = train_input['attention_mask'].to(device)
            train_label = train_input['labels'].to(device)

            optimizer.zero_grad()

            output = model(input_ids, attention_mask)
            loss = criterion(output, train_label)

            total_loss_train += loss.item()

            # The index of the largest logit is the predicted class
            predicted = torch.argmax(output, dim=1)
            total_acc_train += (predicted == train_label).sum().item()

            loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        model.eval()

        # No gradients are needed for validation
        with torch.no_grad():
            for val_input in tqdm(val_dataloader):
                input_ids = val_input['input_ids'].to(device)
                attention_mask = val_input['attention_mask'].to(device)
                val_label = val_input['labels'].to(device)

                output = model(input_ids, attention_mask)
                loss = criterion(output, val_label)

                total_loss_val += loss.item()

                predicted = torch.argmax(output, dim=1)
                total_acc_val += (predicted == val_label).sum().item()

        print(f'Epochs: {epoch + 1} '
              f'| Train Loss: {total_loss_train / n_train_batches: .3f} '
              f'| Train Accuracy: {total_acc_train / n_train_samples: .3f} '
              f'| Val Loss: {total_loss_val / n_val_batches: .3f} '
              f'| Val Accuracy: {total_acc_val / n_val_samples: .3f}')

        if best_val_loss > total_loss_val:
            best_val_loss = total_loss_val
            torch.save(model.state_dict(), checkpoint_path)  # Save only the model's parameters
            print("Saved model")
            early_stopping_threshold_count = 0
        else:
            early_stopping_threshold_count += 1

        if early_stopping_threshold_count >= patience:  # Early stopping threshold
            print("Early stopping")
            break


In [None]:
# Fetch the NLTK data used by the text-cleaning helpers, then pin the RNG seeds
nltk.download('punkt')
nltk.download('stopwords')
torch.manual_seed(0)
np.random.seed(0)

# Pretrained checkpoint shared by the tokenizer and the encoder
BERT_MODEL = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
base_model_topic = AutoModel.from_pretrained(BERT_MODEL)

# Batches of 8 tweets; only the training split is shuffled
train_dataloader = DataLoader(
    TweetDataset(train_df, tokenizer),
    batch_size=8,
    shuffle=True,
    num_workers=0,
)
val_dataloader = DataLoader(
    TweetDataset(val_df, tokenizer),
    batch_size=8,
    num_workers=0,
)

# Sanity check: look at the first training batch only, then stop iterating
for batch in train_dataloader:
    input_ids, attention_mask, labels = (
        batch['input_ids'],
        batch['attention_mask'],
        batch['labels'],
    )
    print("Input IDs shape:", input_ids.shape)
    print("Attention Mask shape:", attention_mask.shape)
    print("Labels shape:", labels.shape)
    print("Unique labels in batch:", torch.unique(labels))
    break

# Wrap the encoder in the classification head and fine-tune it
model = TweetClassifier(base_model_topic)

learning_rate, epochs = 1e-5, 5
train(model, train_dataloader, val_dataloader, learning_rate=learning_rate, epochs=epochs)

In [None]:
# Loading the Model
# Rebuild the classifier, then overwrite its weights with the best checkpoint
model = TweetClassifier(base_model_topic)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU also loads on a CPU-only machine
model.load_state_dict(torch.load("best_model_topic.pt", map_location=device))
model.to(device)
model.eval()

In [None]:
# Performing Predictions

def get_text_predictions(model, loader):
    """Run ``model`` over ``loader`` and collect predicted and true class ids.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of logits per example.
        loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        A ``(predictions, actual_labels)`` pair of plain Python lists, in the
        order the loader yields the examples.
    """
    model.eval()

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-global `device`; fall back to CPU for a parameter-free model
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    results_predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)

            outputs = model(input_ids, attention_mask)
            predicted_classes = torch.argmax(outputs, dim=1)

            results_predictions.extend(predicted_classes.tolist())
            # Labels are only turned into a list, so they never need to leave the host
            actual_labels.extend(batch['labels'].tolist())

    return results_predictions, actual_labels

# Wrap the held-out split in an unshuffled loader so predictions stay aligned with test_df
test_dataloader = DataLoader(
    dataset=TweetDataset(dataframe=test_df, tokenizer=tokenizer),
    batch_size=8,
    shuffle=False,
    num_workers=0,
)

# Score the test set: predicted class ids alongside the ground-truth ids
predictions, true_labels = get_text_predictions(model=model, loader=test_dataloader)


In [None]:
# Score the test-set predictions: overall accuracy plus support-weighted
# precision, recall and F1 across the topic classes
accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=true_labels,
    y_pred=predictions,
    average='weighted',
)

# One metric per line
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)