In [None]:
# Run the setup script first! Migrate to the correct folder and type bash setup.sh in the terminal.

# Import loads of stuff (I don't think I even use half of these.)
from datasets import load_dataset
from transformers import AutoTokenizer

from itertools import islice
from typing import Iterable, List, Tuple

import argparse
import numpy as np
import torch
import datasets
from datasets.dataset_dict import DatasetDict

from gensim.models.keyedvectors import KeyedVectors
from torch import nn
import random

random.seed(1337)


https://medium.com/dataseries/k-fold-cross-validation-with-pytorch-and-sklearn-d094aa00105f
Cross validation

https://medium.com/analytics-vidhya/text-summarization-using-nlp-3e85ad0c6349
text summarization

https://medium.com/analytics-vidhya/text-summarization-using-bert-gpt2-xlnet-5ee80608e961
More text summarization


PREPROCESSING DECISIONS:

1. Labels at every tenth percentile.

2. Max input sequence length is 512, as per standard BERT procedure.

3. New-lines removed.

In [None]:
# Load datasets
tifu_short_raw = load_dataset("reddit_tifu", "short", split="train")
tifu_long_raw = load_dataset("reddit_tifu", "long", split = "train")

In [None]:
# Add decimal labels to upvote ratio
def add_columns(example):
    example["upvote_ratio"] = round(example["upvote_ratio"], 1)
    example["labels"] = int(example["upvote_ratio"] * 10)
    return example

tifu_short = tifu_short_raw.map(add_columns)

tifu_long = tifu_long_raw.map(add_columns)

In [None]:

# Split into train, validation and test.
dict1 = tifu_short.train_test_split(test_size=0.1)

train_and_val = dict1["train"]
test_short = dict1["test"]

dict2 = train_and_val.train_test_split(test_size=0.2)

train_short = dict2["train"]
val_short = dict2["test"]

# Some light preprocessing:

def remove_new_lines(example):
    text = example["documents"]
    example["documents"] = text.replace('\n\n', '. ').replace('\n', '. ') # Replace double and single new lines with dots.
    return example

train_short = train_short.map(remove_new_lines)
val_short = val_short.map(remove_new_lines)
test_short = test_short.map(remove_new_lines)

In [None]:
# Tokenization time!


tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


## LOOK HERE! If you want to train on full text, run as it is. If you want to train on titles, change 'documents' to 'title'.
def tokenization(example): 
    return tokenizer(example["documents"], padding = True, truncation= True, return_tensors = "pt")

# Tokenize and save to preprocessed. IMPORTANT: Don't type "git add ." ever! These files are too large to add to git, and
# it becomes very problematic to change once you've added and committed them!!!!
train_input = train_short.map(tokenization, batched = True)
val_input = val_short.map(tokenization, batched = True)
test_input = test_short.map(tokenization, batched = True)

train_input.save_to_disk("preprocessed/train_short")
val_input.save_to_disk("preprocessed/val_short")
test_input.save_to_disk("preprocessed/test_short")

In [None]:
# Import training script (and allow for changes to it)
import importlib
import train
importlib.reload(train)

# Let's train a model!
model_name = "bert-base-uncased"

from datasets import load_from_disk

# Your preprocessed train, val and test sets.
train_cl_short = load_from_disk("preprocessed/train_short")
val_cl_short = load_from_disk("preprocessed/val_short")
test_cl_short = load_from_disk("preprocessed/test_short")

from torch.utils.data import DataLoader

# The way we feed data to the model is using the dataloader class. It expects a very, very specific sort of input that looks like this:
train_cl_short = train_cl_short.remove_columns(['ups', 'num_comments', 'upvote_ratio', 'score', 'documents', 'tldr', 'title'])
train_cl_short.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])
# And this:
val_cl_short = val_cl_short.remove_columns(['ups', 'num_comments', 'upvote_ratio', 'score', 'documents', 'tldr', 'title'])
val_cl_short.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])

# Your input ids are your tokens, labels are the upvote ratio.


# This model takes a long-ass time to train even a single epoch on.
# These variables are here to ensure that you can test it on small samples.
small_train = train_cl_short.select(range(32))
small_val = val_cl_short.select(range(10))


# Note: If you want to run on the full data, change 'small_train' and 'small_val' to
# 'train_cl_short' and 'val_cl_short' respectively.
# Then go on holiday, run a marathon, write a book, get married, etc. while it runs.
train_dataloader = torch.utils.data.DataLoader(small_train, batch_size=32)
val_dataloader = torch.utils.data.DataLoader(small_val, batch_size = len(small_val))

#Import model and optimizer
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=11) # We have 11 labels.
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
## Train model for the very first time (if you want to load an already trained model
# to keep training it, use the code blpck below.)

## IMPORTANT! If you want the model to run until validation loss is minimized, epochs should just be set to False.
# In practice, however, these models take so long that I doubt we will ever get to the minimum,
# so specifying a number for epochs is probably a good idea.
epochs = 2
model_name = "test_"  # Replace with whatever else you want. The train function ensures that it is a .pt model.


train.train_model(model, train_dataloader, val_dataloader, optimizer, epochs, model_name = "test_")

In [14]:
## Train existing model
old_model_name = "models/" ## Add whatever your model is called. Remember, it's a .pt file!
old_model = torch.load(old_model_name)
train.train_model(old_model, train_dataloader, val_dataloader, optimizer, epochs, model_name = "test_")

[INFO:] Training classifier...
This is working for the  1 th time!
epoch: 2, loss = 2.2769
This is working for the  1 th time!
epoch: 3, loss = 2.2769
Max epochs reached
[INFO:] Finished traning!
