# v7 - BERT used as feature extractor and tokenizer 

But not directly as a model.

In [7]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, BertweetTokenizer
import torch 

from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [8]:
def load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt'):
    # Load data, txt as csv
    #data_path = 'data/twitter-datasets/'
    df_train_pos = pd.read_csv(path_pos, sep = '\t', names = ['tweet'])
    df_train_pos['label'] = 1
    df_train_neg = pd.read_csv(path_neg, sep = '\t', names = ['tweet'], on_bad_lines='skip')
    df_train_neg['label'] = -1
    df_train = pd.concat([df_train_pos, df_train_neg], ignore_index=True)
    print('Train set: ', df_train.shape)
    print('Train set positives: ', df_train_pos.shape)
    print('Train set negatives: ', df_train_neg.shape)
    return df_train 

In [15]:
# Read in the dataset
df = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')

# Preprocessing: Tokenize the tweets using BERT
def preprocess(text, tokenizer):
    # Tokenize the text using BERT
    input_ids = tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
    # Convert the input ids to a tensor
    input_ids = torch.tensor(input_ids).unsqueeze(0)
    # Return the input ids
    return input_ids

# Load the BERT tokenizer
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = BertweetTokenizer.from_pretrained('vinai/bertweet-base')

# Tokenize the tweets
df['tweet'] = df['tweet'].apply(lambda x: preprocess(x, tokenizer))

# Split the data into a training set and a test set
X_train, X_eval, y_train, y_eval = train_test_split(df['tweet'], df['label'], test_size=0.2)

# Convert the training and test sets to tensors
device = torch.device('mps')
train_set = TensorDataset(X_train, y_train)
test_set = TensorDataset(X_eval, y_eval)
# Create dataloaders for the training and test sets
batch_size = 8
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

# Train a BERT model
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = BertForSequenceClassification.from_pretrained('vinai/bertweet-base', num_labels=2)
model.train()

import torch.nn as nn
import torch.optim as optim
# Train the model on the training set
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

for epoch in range(5):
    running_loss = 0.0
    for inputs, labels in train_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch {epoch+1}: Loss = {running_loss/len(train_dataloader):.4f}')

# Evaluate the model on the test set
y_pred = []
for inputs, labels in test_dataloader:
    inputs = inputs.to(device)
    labels = labels.to(device)

    # Forward pass
    outputs = model(inputs)
    _, predicted = torch.max(outputs.data, 1)
    y_pred.extend(predicted)


Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


TypeError: 'int' object is not callable