In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Trainer (has labels)
trainer_df = pd.read_csv('datasets/cleaned_SOLIDtest6K_trainer.tsv', sep="\t")
learner_tweets_df = pd.read_csv('datasets/cleaned_SOLID9M_learner.tsv', sep="\t")

In [2]:
import numpy as np
# trainer_labels = trainer_df['label'].values
# trainer_tweets = trainer_df['tweet'].values
learner_tweets_df['labels'] = learner_tweets_df['average'].apply(lambda x: 1 if x >= 0.8 else 0) # threshold the average values

sample_size = 80000
positive_ratio = 0.75

# Select the most confident positive values
semi_tweets_pos_df = learner_tweets_df[learner_tweets_df['average'] > 0.8].sample(n=np.floor(sample_size*positive_ratio).astype(int), random_state=1)

# Select the most confident negative values
semi_tweets_neg_df = learner_tweets_df[learner_tweets_df['average'] < 0.2].sample(n=np.floor(sample_size*(1-positive_ratio)).astype(int), random_state=1)

semi_tweets_df = pd.concat([semi_tweets_pos_df, semi_tweets_neg_df])
semi_tweets_df = semi_tweets_df.sample(frac=1, random_state=42)

semi_tweets = semi_tweets_df['text'].values
semi_labels = semi_tweets_df['labels'].values


In [3]:
semi_tweets_df.head(10)

Unnamed: 0,id,text,average,std,labels
2348858,1140493726000787462,cannot no nigga get next to me like i really d...,0.826405,0.162585,1
389301,1160578706026270720,this movie used to fuck me up,0.827979,0.167209,1
7847151,1188696905586528256,my eye was fascinated by that memorandum,0.196262,0.158945,0
2782796,1161519296075505665,oh well thats good sleep well then,0.182599,0.193224,0
400928,1160317975276208128,wearing your butt plug to ur dinner date so he...,0.864163,0.143021,1
7010621,1186516651325280256,who the fuck ate all my cum,0.867288,0.165063,1
2190200,1159412408521318400,and you know the ish is staged and not genuine...,0.188691,0.168918,0
1595615,1161920544314294272,adnthweeksary for us to regard others as wo...,0.153824,0.174742,0
5239813,1157885199545319424,ill eat that son of a bitch,0.871429,0.151313,1
6244358,1187425309496352769,why you lie to her like that i fucked some la...,0.813525,0.162951,1


In [4]:
import torch
from torch.utils.data import DataLoader, Dataset


class TweetDataset(Dataset):
    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        if self.labels is not None:
            label = self.labels[idx]
            return {
                'text': text,
                'label': label
            }
        else:
            return {
                'text': text
            }
        
# trainer_dataset = TweetDataset(trainer_tweets, trainer_labels)
learner_dataset = TweetDataset(semi_tweets, semi_labels)

# trainer_loader = DataLoader(trainer_dataset, batch_size=12, shuffle=True)
learner_loader = DataLoader(learner_dataset, batch_size=6, shuffle=False)

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch import optim
from tqdm import tqdm

num_epochs = 1

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


model = BertForSequenceClassification.from_pretrained('models/OLID_BERT_1', num_labels=2)
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
model.train()
for epoch in range(num_epochs):
    # trainer_dataloader_iterator = iter(trainer_loader)
    semi_labeled_dataloader_iterator = iter(learner_loader)
    num_batches = len(learner_loader)
    # num_batches = 5000
    for _ in tqdm(range(num_batches)):
        # Train on labeled data
        batch = next(semi_labeled_dataloader_iterator, None)
        if batch is not None:
            inputs = tokenizer(batch['text'], padding=True, truncation=True, return_tensors="pt").to(device)
            labels = torch.tensor(batch['label']).clone().detach().to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

  labels = torch.tensor(batch['label']).clone().detach().to(device)
100%|██████████| 13334/13334 [12:25<00:00, 17.90it/s]


In [7]:
model.save_pretrained('models/SOLID_80ksemi_OLIDBERT_3')