In [21]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from torch import cuda, no_grad
import re
import html
from datetime import datetime


class Discriminator():
    """Sequence classifier used to filter texts, plus its training pipeline.

    The class loads a tokenizer and a sequence-classification model, can
    preprocess the ``fakes_train.csv`` / ``fakes_validation.csv`` files found
    in ``DATA_PROC_DIR``, fine-tune the model on them, and finally keep only
    those texts for which the model predicts class id 0 (see ``discriminate``).

    Parameters
    ----------
    MODEL_DIR : str
        Output directory for checkpoints and saved models.
    MODEL_PATH : str
        Checkpoint name or path loaded with
        ``AutoModelForSequenceClassification.from_pretrained``.
    DATA_RAW_DIR : str
        Stored on the instance; not read by any method of this class.
    DATA_PROC_DIR : str
        Directory containing ``fakes_train.csv`` and ``fakes_validation.csv``.
    EOS_TOKEN : str
        Stored on the instance; not read by any method of this class.
    SEP_TOKEN : str
        Regular-expression pattern (not a literal) marking where the reply
        part of a text starts.
    MAX_LENGTH : int
        Maximum number of tokens per example; longer inputs are truncated.
    TRAIN_RATIO : float
        Stored on the instance; not read by any method of this class.
    BATCH_SIZE : int
        Per-device batch size for both training and evaluation.
    EPOCHS : int
        Number of training epochs.
    LEARNING_RATE : float
        Learning rate passed to ``TrainingArguments``.
    SEED : int
        Random seed passed to ``TrainingArguments``.
    TOKENIZER_PATH : str
        Checkpoint name or path loaded with ``AutoTokenizer.from_pretrained``.
        Defaults to the checkpoint that used to be hard-coded.
    """

    def __init__(self,
                 MODEL_DIR="models/bert_discriminator",
                 MODEL_PATH="distilbert-base-cased",
                 DATA_RAW_DIR="./data/raw/",
                 DATA_PROC_DIR="./data/preprocessed_/",
                 EOS_TOKEN='<|endoftext|>',
                 SEP_TOKEN=r'<\|reply\|>',
                 MAX_LENGTH=512,
                 TRAIN_RATIO=0.9,
                 BATCH_SIZE=4,
                 EPOCHS=1,
                 LEARNING_RATE=2e-5,
                 SEED=42,
                 TOKENIZER_PATH="distilbert-base-cased",
                 ):
        """Store the settings, pick a device and load tokenizer and model."""
        # Settings
        self.MODEL_DIR = MODEL_DIR
        self.MODEL_PATH = MODEL_PATH
        self.TOKENIZER_PATH = TOKENIZER_PATH
        self.DATA_RAW_DIR = DATA_RAW_DIR
        self.DATA_PROC_DIR = DATA_PROC_DIR
        self.EOS_TOKEN = EOS_TOKEN
        self.SEP_TOKEN = SEP_TOKEN
        self.MAX_LENGTH = MAX_LENGTH
        self.TRAIN_RATIO = TRAIN_RATIO
        self.BATCH_SIZE = BATCH_SIZE
        self.EPOCHS = EPOCHS
        self.LEARNING_RATE = LEARNING_RATE
        self.SEED = SEED

        # Device
        self.torch_device = "cuda" if cuda.is_available() else "cpu"
        print("Using device: " + self.torch_device)

        # Tokenizer + Model
        # NOTE: do_lower_case=True is kept on purpose so the tokenization stays
        # consistent with already trained models; the original author flagged
        # that the model needs to be retrained with do_lower_case=False.
        self.tokenizer = AutoTokenizer.from_pretrained(self.TOKENIZER_PATH, do_lower_case=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.MODEL_PATH).to(self.torch_device)
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

    def regex_text(self, text):
        """Normalise one text and reduce it to its reply part.

        Steps: HTML entities are unescaped, backslash-escaped apostrophes are
        replaced by plain apostrophes and trailing whitespace is removed. If
        the text contains ``SEP_TOKEN`` followed by a space, the first
        non-empty remainder captured after it replaces the whole text;
        otherwise the normalised text is returned as is.

        Parameters
        ----------
        text : str
            Raw text.

        Returns
        -------
        str
            The cleaned text without trailing whitespace.
        """
        text = html.unescape(text)
        text = re.sub(r"\\'", r"'", text)
        text = re.sub(r"\s+$", '', text)
        # One capture group -> findall returns a list of plain strings.
        replies = re.findall(self.SEP_TOKEN + " (,?.*)", text)
        text = next((reply for reply in replies if reply), text)
        return text.rstrip()

    def label_to_list(self, label):
        """Return ``[1]`` for a truthy label and ``[0]`` otherwise."""
        return [1] if label else [0]

    def clean_dataframe(self, df):
        """Keep rows that contain the separator and clean their text and label.

        Rows whose ``text`` is missing or does not match ``SEP_TOKEN`` are
        dropped, the remaining texts go through ``regex_text``, rows that end
        up empty are dropped, and labels are wrapped by ``label_to_list``.
        The input frame is not modified.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with a ``text`` and a ``label`` column.

        Returns
        -------
        pandas.DataFrame
            A new, cleaned frame.
        """
        # na=False: a missing text counts as "no separator" instead of putting
        # NaN into the boolean mask, which pandas refuses to index with.
        has_separator = df['text'].str.contains(self.SEP_TOKEN, na=False)
        # .copy() so the column assignments below act on an independent frame
        # rather than on a slice of the caller's data.
        df = df[has_separator].copy()
        df['text'] = df['text'].apply(self.regex_text)
        df = df[df['text'].str.len() != 0].copy()
        df['label'] = df['label'].apply(self.label_to_list)
        return df

    def tokenize_function(self, examples):
        """Tokenize the ``text`` field of a batch, truncating to ``MAX_LENGTH``."""
        # NOTE(review): padding=True pads within each mapped batch while
        # DataCollatorWithPadding is also configured in __init__; presumably one
        # of the two is redundant - kept unchanged to preserve the dataset output.
        return self.tokenizer(examples["text"], truncation=True, padding=True, max_length=self.MAX_LENGTH)

    def preprocessing(self):
        """Load, clean and tokenize the train and validation CSV files.

        Side effects: writes the cleaned validation frame to ``test.csv`` in
        the current working directory and prints ``Saved!``.

        Returns
        -------
        datasets.DatasetDict
            Tokenized ``validation`` and ``train`` splits with the ``text``
            column removed.
        """
        ### Preprocessing
        train = pd.read_csv(self.DATA_PROC_DIR + "/fakes_train.csv", index_col=0, encoding='utf-8', engine='python')
        validation = pd.read_csv(self.DATA_PROC_DIR + "/fakes_validation.csv", index_col=0, encoding='utf-8', engine='python')

        validation = self.clean_dataframe(validation)
        train = self.clean_dataframe(train)

        # Dump of the cleaned validation split for manual inspection.
        validation.to_csv("test.csv")
        print("Saved!")

        datasets = DatasetDict({
            'validation': Dataset.from_pandas(validation, preserve_index=False),
            'train': Dataset.from_pandas(train, preserve_index=False),
        })

        tokenized_datasets = datasets.map(
            self.tokenize_function,
            batched=True,
            num_proc=1,
            remove_columns=["text"],
            )

        return tokenized_datasets

    def train_model(self, dataset, SAVE_STEPS=10000, model_name=None):
        """Fine-tune ``self.model`` and save it below ``MODEL_DIR``.

        Parameters
        ----------
        dataset : mapping
            Must provide a ``"train"`` and a ``"validation"`` split, e.g. the
            result of ``preprocessing``.
        SAVE_STEPS : int
            Checkpoint interval passed to ``TrainingArguments``.
        model_name : str or None
            Sub-directory name for the final model. When omitted, a name of
            the form ``model-<timestamp>`` is generated.
        """
        training_args = TrainingArguments(
            output_dir=self.MODEL_DIR,
            evaluation_strategy="epoch",
            learning_rate=self.LEARNING_RATE,
            weight_decay=0.01,
            per_device_train_batch_size=self.BATCH_SIZE,
            per_device_eval_batch_size=self.BATCH_SIZE,
            num_train_epochs=self.EPOCHS,
            save_steps=SAVE_STEPS,
            # Previously SEED was stored on the instance but never used.
            seed=self.SEED,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
            data_collator=self.data_collator,
        )

        trainer.train()

        if not model_name:
            # '-' instead of ':' in the time part: colons are not allowed in
            # Windows path names.
            model_name = "model-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        trainer.save_model(self.MODEL_DIR + "/" + model_name)

    def run_training_pipeline(self):
        """Run ``preprocessing`` followed by ``train_model`` with defaults."""
        dataset = self.preprocessing()
        self.train_model(dataset)

    def discriminate(self, texts):
        """Return the cleaned texts for which the model predicts class id 0.

        Parameters
        ----------
        texts : iterable of str
            Raw texts; each one is passed through ``regex_text`` first.

        Returns
        -------
        list of str
            Cleaned texts classified as class 0, in input order.
        """
        realistic_texts = []
        cleaned_texts = [self.regex_text(text) for text in texts]

        # Inference only: make sure dropout is disabled.
        self.model.eval()

        for text in cleaned_texts:
            # Truncate to the same limit that tokenize_function applies to the
            # training data, so over-long inputs are handled consistently.
            test_input = self.tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                max_length=self.MAX_LENGTH,
            ).to(self.torch_device)
            with no_grad():
                logits = self.model(**test_input).logits

            predicted_class_id = logits.argmax(dim=-1).item()

            if predicted_class_id == 0:
                realistic_texts.append(text)

        return realistic_texts

In [22]:
# Build a Discriminator with its default settings; the constructor prints the
# selected device and loads the tokenizer and the model.
discriminator = Discriminator()
# Clean and tokenize the train/validation CSVs. As a side effect this writes
# the cleaned validation split to test.csv; the returned DatasetDict is shown
# as the cell output.
discriminator.preprocessing()

Using device: cuda


Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.b

Saved!


                                                                      

DatasetDict({
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 13726
    })
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 123421
    })
})

In [23]:
# Reload the cleaned validation split that Discriminator.preprocessing() wrote
# to test.csv, using the first CSV column as the index.
df = pd.read_csv('test.csv', index_col=0)

In [24]:
# Print every row of the reloaded frame as a NumPy array for manual inspection.
# NOTE(review): this emits one output entry per row of the whole file; consider
# df.head() / df.sample() if only a spot check is needed.
for row in df.values:
    print(row)

['> Andy could come out first, if he hasn\'t already. This might prompt Ben to say, "Cool, I\'m gay too" or "Oh really? Good to know. I'
 '[1]']
['[deleted] �� ��' '[1]']
['[deleted]' '[1]']
["My writing is good, and I feel like I have to do more with my writing. I've made a number of changes and I've improved on many more things. I'm getting rid of my"
 '[1]']
['I read it and liked it.' '[1]']
['I think this is a common misunderstanding. A story that you\'re trying to sell, doesn\'t have conflict because it\'s just something the reader can\'t tell. It\'s like saying "there\'s a'
 '[1]']
["I'd say not. The characters are there. If you write romance, the characters are there. The plot of the story is the same. If it's a romance, the story will be similar"
 '[1]']
['[deleted]' '[1]']
["Edit: It looks like it's working now." '[1]']
["I don't think the author of the book was qualified to be writing a book like that, though. I think it's important to remember that there are many ways that a