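* Upload files (optional): the commented-out cell below uploads local files to the Colab session with `files.upload()`; files kept in Google Drive are accessed by mounting the drive instead.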



In [None]:
# from google.colab import files
# uploaded = files.upload()
# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))

*   To read files stored in Google Drive, Colab must first be granted permission to mount the drive.

In [None]:
# import pandas as pd
# # Give permissions to colab handling files from gdrive
# from google.colab import drive
# drive.mount('/content/gdrive')
# # Check if the file exists
# import os
# file_path = '/content/gdrive/My Drive/SentimentBERT/dataset-test.csv'
# if os.path.exists(file_path):
#     print("File exists.")
# else:
#     print("File does not exist.")

* Use the training dataset stored in Google Drive to **fine-tune** the **BERT** model with the **AdamW** optimizer, validating on the validation dataset after every epoch.

In [None]:
import joblib
import pandas as pd
import torch
from google.colab import drive # Use Google classes
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Mount Google Drive so the CSV files stored there are reachable from the Colab runtime
drive.mount('/content/gdrive')

# Locations of the training and validation splits on the mounted drive
train_path = "/content/gdrive/My Drive/ElectionsComments/Datasets/train_dataset.csv"
val_path = '/content/gdrive/My Drive/ElectionsComments/Datasets/validation_dataset.csv'

# Read both splits into DataFrames (training first, then validation)
train_data, val_data = (pd.read_csv(csv_path) for csv_path in (train_path, val_path))

# Turn the sentiment labels into integer class ids. The mapping is learned from
# the training split only; the validation split reuses the same mapping.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['sentiment'])
train_data['sentiment'] = label_encoder.transform(train_data['sentiment'])
val_data['sentiment'] = label_encoder.transform(val_data['sentiment'])

# Define a custom dataset class
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one comment per item for sequence classification.

    Args:
        dataframe: pandas DataFrame providing a ``comment`` column (converted
            to ``str`` before tokenization) and a ``sentiment`` column whose
            values are converted to ``torch.long`` tensors.
        tokenizer: callable tokenizer; it is invoked with the comment text and
            must return a mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` tensors.
        max_len: length every encoded comment is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized comment and its label for the row at position ``index``.

        Returns:
            dict with the raw ``comment`` string, 1-D ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` tensors, and the
            ``sentiment`` label as a ``torch.long`` scalar tensor.
        """
        # Positional lookup: the sampler hands out positions 0..len-1, which only
        # coincide with index labels when the DataFrame has a default RangeIndex.
        comment = str(self.data['comment'].iloc[index])
        sentiment = self.data['sentiment'].iloc[index]

        inputs = self.tokenizer(
            comment,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            # Without explicit truncation, comments longer than max_len keep
            # their full length and items can no longer be stacked into a batch.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'comment': comment,
            # The tokenizer returns (1, max_len) tensors; flatten drops the batch axis.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'sentiment': torch.tensor(sentiment, dtype=torch.long)
        }

# Set up the BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# One output unit per class known to the label encoder
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create datasets and dataloaders
max_len = 128
batch_size = 8
train_dataset = CustomDataset(train_data, tokenizer, max_len)
val_dataset = CustomDataset(val_data, tokenizer, max_len)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Set up optimizer and scheduler
epochs = 3
# NOTE(review): AdamW imported from transformers is deprecated in recent
# releases; torch.optim.AdamW is the suggested replacement but has different
# defaults (bias correction, weight decay), so it is not swapped here.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch, so its horizon is batches * epochs
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


def batch_to_model_inputs(batch):
    """Move the tensors of one dataloader batch to ``device`` as model keyword arguments."""
    return {
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device),
        'token_type_ids': batch['token_type_ids'].to(device),
        'labels': batch['sentiment'].to(device)
    }


# Training loop with validation
for epoch in range(epochs):
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs} (Training)'):
        inputs = batch_to_model_inputs(batch)

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Accumulate here: `loss` is reassigned by the validation loop below,
        # so it cannot be used for the training figure in the epoch summary.
        train_loss += loss.item()

    model.eval()
    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f'Epoch {epoch + 1}/{epochs} (Validation)'):
            inputs = batch_to_model_inputs(batch)

            outputs = model(**inputs)
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()

            predictions = torch.argmax(logits, dim=1)
            val_correct += (predictions == inputs['labels']).sum().item()

    # max(..., 1) keeps the summary printable when a split is empty
    avg_train_loss = train_loss / max(len(train_dataloader), 1)
    val_accuracy = val_correct / max(len(val_dataset), 1)
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    print(f'Epoch {epoch + 1}/{epochs} - Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

# Save the fine-tuned model and tokenizer
output_dir = '/content/gdrive/My Drive/ElectionsComments/Datasets/BERT/adamw'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Save the label encoder so predictions can be mapped back to the original labels
label_encoder_path = '/content/gdrive/My Drive/ElectionsComments/Datasets/BERT/label_encoder.pkl'
joblib.dump(label_encoder, label_encoder_path)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3 (Training): 100%|██████████| 175/175 [00:12<00:00, 14.27it/s]
Epoch 1/3 (Validation): 100%|██████████| 38/38 [00:01<00:00, 31.81it/s]


Epoch 1/3 - Training Loss: 0.4379, Validation Loss: 0.8982, Validation Accuracy: 0.5933


Epoch 2/3 (Training): 100%|██████████| 175/175 [00:12<00:00, 14.34it/s]
Epoch 2/3 (Validation): 100%|██████████| 38/38 [00:01<00:00, 32.23it/s]


Epoch 2/3 - Training Loss: 0.4033, Validation Loss: 0.8923, Validation Accuracy: 0.6233


Epoch 3/3 (Training): 100%|██████████| 175/175 [00:12<00:00, 14.37it/s]
Epoch 3/3 (Validation): 100%|██████████| 38/38 [00:01<00:00, 33.89it/s]


Epoch 3/3 - Training Loss: 0.3177, Validation Loss: 0.8628, Validation Accuracy: 0.6300


['/content/gdrive/My Drive/ElectionsComments/Datasets/BERT/label_encoder.pkl']

* Use the fine-tuned **BERT** model (**AdamW** optimizer) to **predict** sentiment labels for the **test** dataset.

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from tqdm import tqdm
import joblib
from google.colab import drive

# Mount Google Drive to reach the saved model, label encoder and test data
drive.mount('/content/gdrive')

# Load the fine-tuned model and tokenizer
model_dir = '/content/gdrive/My Drive/ElectionsComments/Datasets/BERT/adamw'
model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(model_dir)

# Run inference on the GPU when one is available (falls back to CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the label encoder
# NOTE: joblib.load unpickles the file; only load encoders written by the training cell.
label_encoder_path = '/content/gdrive/My Drive/ElectionsComments/Datasets/BERT/label_encoder.pkl'
label_encoder = joblib.load(label_encoder_path)

# Load data to make predictions on (replace with your dataset)
data_path = "/content/gdrive/My Drive/ElectionsComments/Datasets/test_dataset.csv"
data = pd.read_csv(data_path)

# Tokenize and prepare inputs for the model
# Comments are coerced to str (as in training) so missing values do not break
# the tokenizer; truncation keeps every row at exactly max_length tokens.
comments = data['comment'].astype(str).tolist()
tokenized_inputs = tokenizer(
    comments,
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# Make predictions
batch_size = 8
model.eval()
predicted_sentiments = []

with torch.no_grad():
    for i in tqdm(range(0, len(data), batch_size), desc='Making Predictions'):
        # Slice one batch out of every tokenizer output and move it next to the model
        batch_inputs = {key: val[i:i + batch_size].to(device) for key, val in tokenized_inputs.items()}

        outputs = model(**batch_inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)

        # Decode predicted labels using label encoder
        batch_predictions = label_encoder.inverse_transform(predictions.cpu().numpy())
        predicted_sentiments.extend(batch_predictions)

# Add predictions to the original DataFrame
data['bert_predictions'] = predicted_sentiments

# Save the updated DataFrame to a new CSV file
output_combined_file = '/content/gdrive/My Drive/ElectionsComments/Datasets/BERT/combined_test_predictions.csv'
data.to_csv(output_combined_file, index=False)

# Display predictions
print(data[['comment', 'bert_predictions']])

Mounted at /content/gdrive


Making Predictions: 100%|██████████| 38/38 [02:16<00:00,  3.58s/it]

                                               comment  bert_predictions
0      over 1000 doctors appeal modi enforce ecigs ban                 0
1    what saying you are with modi you are chutiya ...                 1
2    blog mantri backmodi jumlebaazrahul not modi b...                 1
3    why dont fight from varanasi modi left his hom...                 0
4    dear hes not just like you varun dhawan others...                 1
..                                                 ...               ...
295                        modi phir seso vote for bjp                 0
296                        india needs modi government                 0
297  our beloved prime minister modi will the prime...                 1
298  but election dates get nearer bjp support dwin...                 1
299  wife sri modi the first family definition corr...                 1

[300 rows x 2 columns]



