In [38]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
!pip install -q transformers
# !pip install -q pytorch-lightning

# Code for TPU packages install
# !curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [40]:
!pip install -q sentencepiece

In [41]:
# Checking out the GPU we have access to
!nvidia-smi

Fri Dec 17 18:28:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    60W / 149W |  10285MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [42]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [43]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [44]:
df = pd.read_csv("/content/drive/MyDrive/Dissertation/data/pickled_for_colab.csv",encoding='latin-1')
df2 = pd.read_csv("/content/drive/MyDrive/Dissertation/data/alt_trunc.csv",encoding='latin-1')
df.head()

Unnamed: 0,episode_id,transcript,episode_description
0,spotify:episode:0uxOe2CiKSlveUjOJfKeKS,"Goodnight, everyone and welcome to the second...","For this new episode of the pod, we talk about..."
1,spotify:episode:4cpPyDPqUfu7PEmC7yy7Ng,"Good night, everyone. We're back for another ...",Sharing our memories of the late Kobe Bryant
2,spotify:episode:6gMuSIP80e8ln1dFH4OSW1,Good night everybody and welcome to the first...,"The journey begins! Hey guys! Eddie, Shaggy, a..."
3,spotify:episode:1CAhJzduXDVNs6Mtd0xVCe,What is up drop shippers? My name is Gordon. ...,"Dropshippers are curious, if not concerned, ab..."
4,spotify:episode:60XQxZ7TbJF0hVaOBN0Kwi,Hey dropshippers. My name is Patrick and I'm ...,Chargebacks are a nightmare for dropshippers. ...


In [45]:
#df2['transcripts'] = df2['transcripts'].str.split('<s>')[1]
df2['transcripts'] = df2.transcripts.str.replace("<s>","")
print(df2.head())
df_new = df
df_new['transcript'] = df2['transcripts']

   Unnamed: 0                                        transcripts
0           0   Goodnight, everyone and welcome to the second...
1           1   Good night, everyone. We're back for another ...
2           2   Good night everybody and welcome to the first...
3           3   Hey dropshippers. My name is Patrick and I'm ...
4           4   Oh, hey our special little Woody clitty liste...


In [46]:
df_new.episode_description = 'summarize: ' + df_new.episode_description
df.transcript = df2.transcripts
df_new.head()
df = df_new
df.head()

Unnamed: 0,episode_id,transcript,episode_description
0,spotify:episode:0uxOe2CiKSlveUjOJfKeKS,"Goodnight, everyone and welcome to the second...","summarize: For this new episode of the pod, we..."
1,spotify:episode:4cpPyDPqUfu7PEmC7yy7Ng,"Good night, everyone. We're back for another ...",summarize: Sharing our memories of the late Ko...
2,spotify:episode:6gMuSIP80e8ln1dFH4OSW1,Good night everybody and welcome to the first...,summarize: The journey begins! Hey guys! Eddie...
3,spotify:episode:1CAhJzduXDVNs6Mtd0xVCe,Hey dropshippers. My name is Patrick and I'm ...,"summarize: Dropshippers are curious, if not co..."
4,spotify:episode:60XQxZ7TbJF0hVaOBN0Kwi,"Oh, hey our special little Woody clitty liste...",summarize: Chargebacks are a nightmare for dro...


In [47]:
# Sections of config

# Defining some key variables that will be used later on in the training

#tokenizer = T5Tokenizer.from_pretrained("t5-base")
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
MAX_LEN = 512
SUMMARY_LEN = 144
TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 2
EPOCHS = 2
LEARNING_RATE = 1e-4

In [48]:
def remove_by_indices(descr, indxs):
  return [e for i, e in enumerate(descr) if i not in indxs]

In [49]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions


class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.episode_description
        self.ctext = self.data.transcript

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):

        _transcript = self.text.tolist()
        _description = self.ctext.tolist()

        indxs = []

        for i, des in enumerate(_transcript):
          if not isinstance(des, str):
            indxs.append(i)

        for i, des in enumerate(_description):
          if not isinstance(des, str):
            indxs.append(i)

        text = remove_by_indices(_transcript, indxs)
        ctext = remove_by_indices(_description, indxs)

        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())



        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [50]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=42).reset_index(drop=True)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

FULL Dataset: (5000, 3)
TRAIN Dataset: (4000, 3)
TEST Dataset: (1000, 3)


In [51]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [52]:
#model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
model = model.to(device)

In [53]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [54]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        labels = y[:, 1:].clone().detach()
        labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=labels)
        loss = outputs[0]
        
        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # xm.optimizer_step(optimizer)
        # xm.mark_step()

In [55]:
def writer(predictions, actuals):

    my_dict = {
        'Actual Headline': actuals,
        'Prediction': predictions       
        }
    final_output = pd.DataFrame(my_dict)

    return final_output


In [56]:
def validate(epoch):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

for epoch in range(1):
    predictions, actuals = validate(epoch)
    final_df = writer(predictions, actuals)
    final_df.to_csv('/content/drive/My Drive/Colab Notebooks/predictions_512_summ_acc_bart_weird_trunc.csv')
    print('Output Files generated for review')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  4.960236549377441
Epoch: 0, Loss:  2.5408949851989746
Epoch: 0, Loss:  7.178710460662842
Epoch: 0, Loss:  6.64028787612915
Epoch: 1, Loss:  7.220597267150879
Epoch: 1, Loss:  7.300714492797852
Epoch: 1, Loss:  7.642615795135498
Epoch: 1, Loss:  6.533679962158203


In [None]:
#from transformers import T5Tokenizer, T5ForConditionalGeneration

#tokenizer = T5Tokenizer.from_pretrained('t5-small')
#model = T5ForConditionalGeneration.from_pretrained('t5-small')
#input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
#outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
#loss, prediction_scores = outputs[:2]

#tokenizer = T5Tokenizer.from_pretrained('t5-small')
#model = T5ForConditionalGeneration.from_pretrained('t5-small')
#input_ids = tokenizer.encode_plus("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
#outputs = model.generate(input_ids['input_ids'])
#print(input_ids)
#y = input_ids['target_ids']
#acc = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
#pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in outputs]