* Use the training dataset uploaded to Google Drive to **train** the **BERT** model using **AdamW Optimizer**

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig # Hugging Face transformers library
from torch.utils.data import Dataset, DataLoader # PyTorch for handling datasets and loading batches
from tqdm import tqdm # Progress bars
import torch # PyTorch functionalities
from sklearn.metrics import accuracy_score # Scikit-learn for evaluating classification accuracy
from torch.optim import AdamW as AdamW_Torch # AdamW Optimizer
from google.colab import drive # Use Google classes


drive.mount('/content/gdrive') # Mount Google Drive using Google Colab to access files stored in Google Drive
df = pd.read_csv("/content/gdrive/My Drive/CryptoSentiment/Datasets/train_set.csv") # Load the test dataset from GDrive

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Instantiate a BERT tokenizer from the Hugging Face transformers library
# Tokenize the text data using the BERT tokenizer
tokenized = tokenizer(list(df['text']), # Convert the column of the DataFrame into a Python list
                      padding=True, # Equal length sequences
                      truncation=True, # Truncate long reviews to fit the maximum allowed sequence length
                      return_tensors='pt') # Tokenizer output in PyTorch format  (PyTorch tensor - multi-dimensional array)

# Number of classes for sentiment_numerical classification are the number of unique labels in the 'sentiment_numerical' column
num_labels = len(df['sentiment_numerical'].unique())

class CustomDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = [label for label in labels]  # Labels are already starting from 0, else use 'labels': torch.tensor(self.labels[idx] - 1)}

    def __len__(self):
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs['input_ids'][idx],
            'attention_mask': self.inputs['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


def evaluate_model(model, dataloader, device): # Evaluate model's accuracy on a given dataloader
    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: value.to(device) for key, value in batch.items()}
            labels = inputs["labels"]
            outputs = model(**inputs)
            logits = outputs.logits

            _, predicted = torch.max(logits, 1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_predictions)
    return accuracy

# Training dataset
train_dataset = CustomDataset(tokenized, # the output of BERT tokenizer
                              list(df['sentiment_numerical'])) # Instantiate CustomDataset class
train_dataloader = DataLoader(train_dataset, # The DataLoader will iterate over CustomDataset
                              batch_size=8, # Number of samples in each mini-batch for memory-efficiency
                              shuffle=True # Randomly shuffles the data at the beginning of each epoch, preventing model from learning the order of the data (useful for generalization)
                              ) # Create a PyTorch DataLoader for the entire dataset

# Validation dataset
validation_df = pd.read_csv("/content/gdrive/My Drive/CryptoSentiment/Datasets/validation_set.csv") # Load the validation dataset
tokenized_validation = tokenizer(list(validation_df['text']),
                                  padding=True,
                                  truncation=True,
                                  return_tensors='pt') # Tokenize the text data

validation_dataset = CustomDataset(tokenized_validation, list(validation_df['sentiment_numerical']))
validation_dataloader = DataLoader(validation_dataset, batch_size=8, shuffle=False)

# Load pre-trained BERT model with the correct num_labels
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

# Fine-tuning
optimizer = AdamW_Torch(model.parameters(), lr=2e-5) # AdamW optimizer from PyTorch with a learning rate of 2e-5

num_epochs = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use GPU if it is available
model.to(device)

# Loop over Epochs and Perform training
for epoch in range(num_epochs):
    model.train()
    train_losses = []

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'): # Batch Loop with Progress Bar
        inputs = {key: value.to(device) for key, value in batch.items()} # Move input data to the selected device
        outputs = model(**inputs)
        loss = outputs.loss # Loss value from the model's output
        train_losses.append(loss.item()) # Append loss values to a list

        optimizer.zero_grad() # Clear the gradients of all optimized parameters
        loss.backward() # Compute gradients of the loss during backpropagation
        optimizer.step() # Update model's parameters based on the computed gradients using the chosen AdamW optimization algorithm

    # Validation loop
    model.eval()
    validation_losses = []

    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc=f'Validation - Epoch {epoch + 1}/{num_epochs}'):
            inputs = {key: value.to(device) for key, value in batch.items()}
            outputs = model(**inputs)
            validation_losses.append(outputs.loss.item())

    # Assess the accuracy of the trained model on the validation set
    accuracy = evaluate_model(model, validation_dataloader, device)

    print(f'Epoch {epoch + 1}/{num_epochs} - Training Loss: {sum(train_losses) / len(train_losses):.4f} - Validation Loss: {sum(validation_losses) / len(validation_losses):.4f} - Validation Accuracy: {accuracy:.4f}')

# Save the fine-tuned model and tokenizer to a directory
model.save_pretrained('/content/gdrive/My Drive/CryptoSentiment/BERT/adamw')
tokenizer.save_pretrained('/content/gdrive/My Drive/CryptoSentiment/BERT/adamw')

Mounted at /content/gdrive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 400/400 [00:24<00:00, 16.43it/s]
Validation - Epoch 1/3: 100%|██████████| 100/100 [00:01<00:00, 80.42it/s]


Epoch 1/3 - Training Loss: 1.0235 - Validation Loss: 0.8284 - Validation Accuracy: 0.6488


Epoch 2/3: 100%|██████████| 400/400 [00:22<00:00, 17.41it/s]
Validation - Epoch 2/3: 100%|██████████| 100/100 [00:01<00:00, 80.22it/s]


Epoch 2/3 - Training Loss: 0.5510 - Validation Loss: 0.5133 - Validation Accuracy: 0.8137


Epoch 3/3: 100%|██████████| 400/400 [00:24<00:00, 16.24it/s]
Validation - Epoch 3/3: 100%|██████████| 100/100 [00:01<00:00, 79.31it/s]


Epoch 3/3 - Training Loss: 0.2430 - Validation Loss: 0.5751 - Validation Accuracy: 0.8225


('/content/gdrive/My Drive/CryptoSentiment/BERT/adamw/tokenizer_config.json',
 '/content/gdrive/My Drive/CryptoSentiment/BERT/adamw/special_tokens_map.json',
 '/content/gdrive/My Drive/CryptoSentiment/BERT/adamw/vocab.txt',
 '/content/gdrive/My Drive/CryptoSentiment/BERT/adamw/added_tokens.json')

* Use the trained **BERT** model (**AdamW Optimizer**) to **predict** ratings for the **test** dataset.

In [None]:
import pandas as pd
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm  # tqdm for the progress bar
from google.colab import drive
drive.mount('/content/gdrive')

# Load the test dataset from GDrive
test_df = pd.read_csv("/content/gdrive/My Drive/CryptoSentiment/Datasets/test_set.csv")

# Backup the original file by renaming it
os.rename("/content/gdrive/My Drive/CryptoSentiment/Datasets/test_set.csv", "/content/gdrive/My Drive/CryptoSentiment/Datasets/test_set_original1.csv")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use GPU if available

def load_fine_tuned_bert_model(model_path):
    # Load the fine-tuned model and tokenizer
    model = BertForSequenceClassification.from_pretrained(model_path) # Load a pre-trained BERT model for sequence classification
    tokenizer = BertTokenizer.from_pretrained(model_path) # Load the tokenizer

    model.to(device) # Move model to GPU

    return model, tokenizer

def predict_star_rating(model, tokenizer, new_text, max_seq_length=512):
  # Tokenize and encode the new text
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(new_text)))

    # Truncate or split the sequence if it exceeds the maximum allowed length
    if len(tokens) > max_seq_length - 2:
        tokens = tokens[:max_seq_length - 2]

    input_ids = tokenizer.encode(tokens, return_tensors="pt").to(device)  # Tokenize and encode the input text using the BERT tokenizer

    # Make predictions
    with torch.no_grad():
        model.eval()  # Set the BERT model to evaluation mode
        logits = model(input_ids)[0]  # Pass the encoded input through the BERT model to obtain logits (raw predictions)
        predictions = torch.argmax(logits, dim=1).item()  # Get the predicted class by finding the index with the maximum value in the logits tensor

    # Return the predicted star rating
    return predictions

def process_review_and_predict_rating(review_content):
  model_path = '/content/gdrive/My Drive/CryptoSentiment/BERT/adamw'
  model, tokenizer = load_fine_tuned_bert_model(model_path)

  predicted_rating = predict_star_rating(model, tokenizer, review_content) # Load the fine-tuned BERT model and tokenizer
  return predicted_rating

# Create a progress bar for the loop
tqdm.pandas()

# Iterate through each row in the DataFrame
for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
    review_content = row['text']

    # Process the review and predict the star rating
    predicted_rating = process_review_and_predict_rating(review_content)

    # Update the 'bert_adamw_ft_prediction' column
    test_df.at[index, 'bert_adamw_ft_prediction'] = predicted_rating

# Save the updated DataFrame back to the CSV file
test_df.to_csv("/content/gdrive/My Drive/CryptoSentiment/Datasets/test_set.csv", index=False)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


100%|██████████| 1000/1000 [17:28<00:00,  1.05s/it]


* Use the training dataset uploaded to Google Drive to **train** the **BERT** model using **Adam Optimizer**

In [7]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig # Hugging Face transformers library
from torch.utils.data import Dataset, DataLoader # PyTorch for handling datasets and loading batches
from tqdm import tqdm # Progress bars
import torch # PyTorch functionalities
from sklearn.metrics import accuracy_score # Scikit-learn for evaluating classification accuracy
from torch.optim import Adam # Adam Optimizer
from google.colab import drive # Use Google classes


drive.mount('/content/gdrive') # Mount Google Drive using Google Colab to access files stored in Google Drive
df = pd.read_csv("/content/gdrive/My Drive/CryptoSentiment/Datasets/train_set.csv") # Load the test dataset from GDrive

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Instantiate a BERT tokenizer from the Hugging Face transformers library
# Tokenize the text data using the BERT tokenizer
tokenized = tokenizer(list(df['text']), # Convert the column of the DataFrame into a Python list
                      padding=True, # Equal length sequences
                      truncation=True, # Truncate long reviews to fit the maximum allowed sequence length
                      return_tensors='pt') # Tokenizer output in PyTorch format  (PyTorch tensor - multi-dimensional array)

# Number of classes for sentiment_numerical classification are the number of unique labels in the 'sentiment_numerical' column
num_labels = len(df['sentiment_numerical'].unique())

class CustomDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = [label for label in labels]  # Labels are already starting from 0, else use 'labels': torch.tensor(self.labels[idx] - 1)}

    def __len__(self):
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs['input_ids'][idx],
            'attention_mask': self.inputs['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


def evaluate_model(model, dataloader, device): # Evaluate model's accuracy on a given dataloader
    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: value.to(device) for key, value in batch.items()}
            labels = inputs["labels"]
            outputs = model(**inputs)
            logits = outputs.logits

            _, predicted = torch.max(logits, 1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_predictions)
    return accuracy

# Training dataset
train_dataset = CustomDataset(tokenized, # the output of BERT tokenizer
                              list(df['sentiment_numerical'])) # Instantiate CustomDataset class
train_dataloader = DataLoader(train_dataset, # The DataLoader will iterate over CustomDataset
                              batch_size=8, # Number of samples in each mini-batch for memory-efficiency
                              shuffle=True # Randomly shuffles the data at the beginning of each epoch, preventing model from learning the order of the data (useful for generalization)
                              ) # Create a PyTorch DataLoader for the entire dataset

# Validation dataset
validation_df = pd.read_csv("/content/gdrive/My Drive/CryptoSentiment/Datasets/validation_set.csv") # Load the validation dataset
tokenized_validation = tokenizer(list(validation_df['text']),
                                  padding=True,
                                  truncation=True,
                                  return_tensors='pt') # Tokenize the text data

validation_dataset = CustomDataset(tokenized_validation, list(validation_df['sentiment_numerical']))
validation_dataloader = DataLoader(validation_dataset, batch_size=8, shuffle=False)

# Load pre-trained BERT model with the correct num_labels
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

# Fine-tuning
optimizer = Adam(model.parameters(), lr=2e-5) # Adam optimizer from PyTorch with a learning rate of 2e-5

num_epochs = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use GPU if it is available
model.to(device)

# Loop over Epochs and Perform training
for epoch in range(num_epochs):
    model.train()
    train_losses = []

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'): # Batch Loop with Progress Bar
        inputs = {key: value.to(device) for key, value in batch.items()} # Move input data to the selected device
        outputs = model(**inputs)
        loss = outputs.loss # Loss value from the model's output
        train_losses.append(loss.item()) # Append loss values to a list

        optimizer.zero_grad() # Clear the gradients of all optimized parameters
        loss.backward() # Compute gradients of the loss during backpropagation
        optimizer.step() # Update model's parameters based on the computed gradients using the chosen AdamW optimization algorithm

    # Validation loop
    model.eval()
    validation_losses = []

    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc=f'Validation - Epoch {epoch + 1}/{num_epochs}'):
            inputs = {key: value.to(device) for key, value in batch.items()}
            outputs = model(**inputs)
            validation_losses.append(outputs.loss.item())

    # Assess the accuracy of the trained model on the validation set
    accuracy = evaluate_model(model, validation_dataloader, device)

    print(f'Epoch {epoch + 1}/{num_epochs} - Training Loss: {sum(train_losses) / len(train_losses):.4f} - Validation Loss: {sum(validation_losses) / len(validation_losses):.4f} - Validation Accuracy: {accuracy:.4f}')

# Save the fine-tuned model and tokenizer to a directory
model.save_pretrained('/content/gdrive/My Drive/CryptoSentiment/BERT/adam')
tokenizer.save_pretrained('/content/gdrive/My Drive/CryptoSentiment/BERT/adam')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 400/400 [00:24<00:00, 16.55it/s]
Validation - Epoch 1/3: 100%|██████████| 100/100 [00:01<00:00, 82.24it/s]


Epoch 1/3 - Training Loss: 1.0460 - Validation Loss: 0.9564 - Validation Accuracy: 0.5813


Epoch 2/3: 100%|██████████| 400/400 [00:22<00:00, 18.04it/s]
Validation - Epoch 2/3: 100%|██████████| 100/100 [00:01<00:00, 62.28it/s]


Epoch 2/3 - Training Loss: 0.7062 - Validation Loss: 0.5919 - Validation Accuracy: 0.7750


Epoch 3/3: 100%|██████████| 400/400 [00:23<00:00, 17.17it/s]
Validation - Epoch 3/3: 100%|██████████| 100/100 [00:02<00:00, 37.58it/s]


Epoch 3/3 - Training Loss: 0.3374 - Validation Loss: 0.4947 - Validation Accuracy: 0.8237


('/content/gdrive/My Drive/CryptoSentiment/BERT/adam/tokenizer_config.json',
 '/content/gdrive/My Drive/CryptoSentiment/BERT/adam/special_tokens_map.json',
 '/content/gdrive/My Drive/CryptoSentiment/BERT/adam/vocab.txt',
 '/content/gdrive/My Drive/CryptoSentiment/BERT/adam/added_tokens.json')

* Use the trained **BERT** model (**Adam Optimizer**) to **predict** ratings for the **test** dataset.

In [8]:
import pandas as pd
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm  # tqdm for the progress bar
from google.colab import drive
drive.mount('/content/gdrive')

# Load the test dataset from GDrive
test_df = pd.read_csv("/content/gdrive/My Drive/CryptoSentiment/Datasets/test_set.csv")

# Backup the original file by renaming it
os.rename("/content/gdrive/My Drive/CryptoSentiment/Datasets/test_set.csv", "/content/gdrive/My Drive/CryptoSentiment/Datasets/test_set_original2.csv")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use GPU if available

def load_fine_tuned_bert_model(model_path):
    # Load the fine-tuned model and tokenizer
    model = BertForSequenceClassification.from_pretrained(model_path) # Load a pre-trained BERT model for sequence classification
    tokenizer = BertTokenizer.from_pretrained(model_path) # Load the tokenizer

    model.to(device) # Move model to GPU

    return model, tokenizer

def predict_star_rating(model, tokenizer, new_text, max_seq_length=512):
  # Tokenize and encode the new text
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(new_text)))

    # Truncate or split the sequence if it exceeds the maximum allowed length
    if len(tokens) > max_seq_length - 2:
        tokens = tokens[:max_seq_length - 2]

    input_ids = tokenizer.encode(tokens, return_tensors="pt").to(device)  # Tokenize and encode the input text using the BERT tokenizer

    # Make predictions
    with torch.no_grad():
        model.eval()  # Set the BERT model to evaluation mode
        logits = model(input_ids)[0]  # Pass the encoded input through the BERT model to obtain logits (raw predictions)
        predictions = torch.argmax(logits, dim=1).item()  # Get the predicted class by finding the index with the maximum value in the logits tensor

    # Return the predicted star rating
    return predictions

def process_review_and_predict_rating(review_content):
  model_path = '/content/gdrive/My Drive/CryptoSentiment/BERT/adam'
  model, tokenizer = load_fine_tuned_bert_model(model_path)

  predicted_rating = predict_star_rating(model, tokenizer, review_content) # Load the fine-tuned BERT model and tokenizer
  return predicted_rating

# Create a progress bar for the loop
tqdm.pandas()

# Iterate through each row in the DataFrame
for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
    # review_id = row['review_id']
    review_content = row['text']

    # Process the review and predict the star rating
    predicted_rating = process_review_and_predict_rating(review_content)

    # Update the 'bert_adam_ft_prediction' column
    test_df.at[index, 'bert_adam_ft_prediction'] = predicted_rating

# Save the updated DataFrame back to the CSV file
test_df.to_csv("/content/gdrive/My Drive/CryptoSentiment/Datasets/test_set.csv", index=False)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


100%|██████████| 1000/1000 [15:37<00:00,  1.07it/s]
