In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Trainer (has labels)
trainer_df = pd.read_csv('datasets/cleaned_SOLIDtest6K_trainer.tsv', sep="\t")
learner_tweets_df = pd.read_csv('datasets/cleaned_SOLID9M_learner.tsv', sep="\t")

In [78]:
trainer_labels = trainer_df['label'].values
trainer_tweets = trainer_df['tweet'].values
learner_tweets = learner_tweets_df['text'].values

In [82]:
# train_texts, val_texts, train_labels, val_labels = train_test_split(trainer_tweets, trainer_labels, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the training and validation texts
# trainer_encodings = tokenizer(trainer_tweets.tolist(), truncation=True, padding=True)
# learner_encodings = tokenizer(learner_tweets.tolist(), truncation=True, padding=True)

In [83]:
import torch
from torch.utils.data import DataLoader, Dataset


class TweetDataset(Dataset):
    def __init__(self, texts, labels=None, tokenizer=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        if self.labels is not None:
            label = self.labels[idx]
            return {
                'text': text,
                'label': label
            }
        else:
            return {
                'text': text
            }
        
trainer_dataset = TweetDataset(trainer_tweets, trainer_labels, tokenizer=tokenizer)
learner_dataset = TweetDataset(learner_tweets, tokenizer=tokenizer)

trainer_loader = DataLoader(trainer_dataset, batch_size=12, shuffle=True)
learner_loader = DataLoader(learner_dataset, batch_size=12, shuffle=False)

In [84]:
import torch
from transformers import BertForSequenceClassification
from torch import optim
from tqdm import tqdm

num_epochs = 3
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [85]:
model.train()
for epoch in range(num_epochs):
    trainer_dataloader_iterator = iter(trainer_loader)
    unlabeled_dataloader_iterator = iter(learner_loader)
    num_batches = min(len(trainer_loader), len(learner_loader))
    for _ in tqdm(range(num_batches)):
        # Train on labeled data
        labeled_batch = next(trainer_dataloader_iterator, None)
        if labeled_batch is not None:
            inputs = tokenizer(labeled_batch['text'], padding=True, truncation=True, return_tensors="pt").to(device)
            labels = torch.tensor(labeled_batch['label']).to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Train on unlabeled data with pseudo-labeling
        unlabeled_batch = next(unlabeled_dataloader_iterator, None)
        if unlabeled_batch is not None:
            inputs = tokenizer(unlabeled_batch['text'], padding=True, truncation=True, return_tensors="pt").to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            pseudo_labels = torch.argmax(logits, dim=1)  # Assuming binary classification
            # Concatenate pseudo-labeled data with original labeled data
            combined_inputs = tokenizer(labeled_batch['text'] + unlabeled_batch['text'], padding=True, truncation=True, return_tensors="pt").to(device)
            combined_labels = torch.cat([labels, pseudo_labels]).to(device)
            outputs = model(**combined_inputs)
            combined_logits = outputs.logits
            loss = criterion(combined_logits, combined_labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()


  0%|          | 0/500 [00:00<?, ?it/s]

  labels = torch.tensor(labeled_batch['label']).to(device)
100%|██████████| 500/500 [01:32<00:00,  5.43it/s]
100%|██████████| 500/500 [01:30<00:00,  5.54it/s]
100%|██████████| 500/500 [01:36<00:00,  5.17it/s]


In [14]:
model.save_pretrained('models/SOLID_BERT_3_cleaned')