# Install Transformers

In [1]:
# install transformers with sentencepiece
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Read input file from Google Drive

In [2]:
# Mount Google Drive under /content/drive so the corpus file can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the whole corpus from Google Drive and trim surrounding whitespace.
# The `with` block closes the file handle as soon as the read finishes.
with open("/content/drive/MyDrive/corpus.txt", "r") as file:
  FileContent = file.read().strip()

In [7]:
# preview the first 1000 characters of the corpus
FileContent[:1000]

"Yeah. Yeah, sure. It kinda does make sense, doesn't it, because when we get into the end of meeting we're kind of talking about action and design as opposed to background. Everything I have is kinda background. Mm-hmm. Uh that sounds. Sure. Okay. Sure. Yeah, cool. Why don't I get that? Hmm. Okay. Okay. Um alright so c is it function F_ eight? Hmm. Come on. I think it's working. Okay great s so let me just start this. Okay great. So um uh s move on. Uh-huh oh where'd it all go? It's not good. Okay lemme just see where I can find it. This looks more like it. I think I just opened up the template. Sorry about that. Okay alright so let's have a look here. Okay so this was the method that um I've taken. Uh basically what I wanna do here, before we get into it uh too far, is I want to show you all the background information I have that I think we need to acknowledge if we want this to be successful. And uh and then sorta g go through some of the way that I've dealt with that information, an

In [5]:
# total number of characters in the corpus (after the surrounding whitespace was stripped)
len(FileContent) 

39722

# Load the Model and Tokenizer

In [6]:
# Import the Auto* classes and load the tokenizer and the seq2seq model from
# the same pretrained checkpoint.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# checkpoint name passed to both from_pretrained() calls below
checkpoint = "sshleifer/distilbart-cnn-12-6"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

# Some tokenizer statistics

In [8]:
# maximum sequence length in tokens, with the special tokens counted in
tokenizer.model_max_length 

1024

In [9]:
# maximum tokens left for the text of a single sequence once the special
# tokens are set aside (model_max_length minus num_special_tokens_to_add())
tokenizer.max_len_single_sentence 

1022

In [10]:
# number of special tokens the tokenizer adds around a single sequence
# NOTE(review): the original comment named [CLS]/[SEP]; a BART-family checkpoint
# presumably uses <s> and </s> instead — confirm via tokenizer.special_tokens_map
tokenizer.num_special_tokens_to_add() 

2

# Convert file content to sentences

In [11]:
# nltk supplies the sentence splitter used to extract sentences from the
# document; fetch its 'punkt' tokenizer data package first
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
# split the corpus text into a list of sentences
sentences = nltk.tokenize.sent_tokenize(FileContent)

In [14]:
# number of sentences found in the corpus
len(sentences)

630

In [15]:
# token count of the longest sentence (special tokens not included)
max(len(tokenizer.tokenize(text)) for text in sentences)

93

In [19]:
# spot check: first sentence produced by the split
sentences[0]

'Yeah.'

In [21]:
# spot check: third sentence produced by the split
sentences[2]

"It kinda does make sense, doesn't it, because when we get into the end of meeting we're kind of talking about action and design as opposed to background."

# Create chunks

In [16]:
# Greedily pack consecutive sentences into chunks whose summed per-sentence
# token counts stay within the tokenizer's single-sequence budget.
max_tokens = tokenizer.max_len_single_sentence
length = 0        # running token count of the chunk being built
chunk_parts = []  # sentences collected for the chunk being built
chunks = []       # finished chunks, in document order
for sentence in sentences:
  # tokenize once per sentence; the count is reused if the sentence overflows
  sentence_length = len(tokenizer.tokenize(sentence))

  # Close the current chunk when this sentence would not fit. The emptiness
  # check keeps an oversized first sentence from emitting an empty chunk.
  if chunk_parts and length + sentence_length > max_tokens:
    chunks.append(" ".join(chunk_parts).strip())
    chunk_parts = []
    length = 0

  chunk_parts.append(sentence)
  length += sentence_length

# Flush whatever is left after the loop. Doing it here rather than inside the
# loop means the final chunk is kept even when the last sentence was the one
# that overflowed the previous chunk.
if chunk_parts:
  chunks.append(" ".join(chunk_parts).strip())
len(chunks)

10

# Sanity checks

In [17]:
# inspect the text of the first chunk
chunks[0]

"Yeah. Yeah, sure. It kinda does make sense, doesn't it, because when we get into the end of meeting we're kind of talking about action and design as opposed to background. Everything I have is kinda background. Mm-hmm. Uh that sounds. Sure. Okay. Sure. Yeah, cool. Why don't I get that? Hmm. Okay. Okay. Um alright so c is it function F_ eight? Hmm. Come on. I think it's working. Okay great s so let me just start this. Okay great. So um uh s move on. Uh-huh oh where'd it all go? It's not good. Okay lemme just see where I can find it. This looks more like it. I think I just opened up the template. Sorry about that. Okay alright so let's have a look here. Okay so this was the method that um I've taken. Uh basically what I wanna do here, before we get into it uh too far, is I want to show you all the background information I have that I think we need to acknowledge if we want this to be successful. And uh and then sorta g go through some of the way that I've dealt with that information, an

In [18]:
# length of the first chunk in characters
len(chunks[0])

4368

In [25]:
# number of sentences in a chunk: split the first chunk back into sentences

chunk_sentences = nltk.tokenize.sent_tokenize(chunks[0])

In [26]:
# how many sentences the first chunk holds
len(chunk_sentences)

50

In [27]:
# list the sentences of the first chunk
chunk_sentences

['Yeah.',
 'Yeah, sure.',
 "It kinda does make sense, doesn't it, because when we get into the end of meeting we're kind of talking about action and design as opposed to background.",
 'Everything I have is kinda background.',
 'Mm-hmm.',
 'Uh that sounds.',
 'Sure.',
 'Okay.',
 'Sure.',
 'Yeah, cool.',
 "Why don't I get that?",
 'Hmm.',
 'Okay.',
 'Okay.',
 'Um alright so c is it function F_ eight?',
 'Hmm.',
 'Come on.',
 "I think it's working.",
 'Okay great s so let me just start this.',
 'Okay great.',
 'So um uh s move on.',
 "Uh-huh oh where'd it all go?",
 "It's not good.",
 'Okay lemme just see where I can find it.',
 'This looks more like it.',
 'I think I just opened up the template.',
 'Sorry about that.',
 "Okay alright so let's have a look here.",
 "Okay so this was the method that um I've taken.",
 'Uh basically what I wanna do here, before we get into it uh too far, is I want to show you all the background information I have that I think we need to acknowledge if we wan

In [28]:
# tokens per chunk, special tokens not included
[len(tokenizer.tokenize(chunk_text)) for chunk_text in chunks]

[1014, 984, 960, 1005, 1016, 994, 991, 1005, 1017, 576]

In [29]:
# input ids per chunk, i.e. with the special tokens added by the tokenizer call
[len(tokenizer(chunk_text).input_ids) for chunk_text in chunks]

[1016, 986, 962, 1007, 1018, 996, 993, 1007, 1019, 578]

## With special tokens added

In [30]:
# total input ids over all chunks (each chunk contributes its own special tokens)
sum(len(tokenizer(chunk_text).input_ids) for chunk_text in chunks)

9582

In [31]:
# input ids when the whole corpus is encoded in one call; this exceeds the
# tokenizer's maximum length, which is what triggers the warning in the output
len(tokenizer(FileContent).input_ids)

Token indices sequence length is longer than the specified maximum sequence length for this model (9564 > 1024). Running this sequence through the model will result in indexing errors


9564

## Without special tokens added

In [32]:
# total tokens over all chunks, special tokens not included
sum(len(tokenizer.tokenize(chunk_text)) for chunk_text in chunks)

9562

In [33]:
# tokens in the whole corpus when tokenized in one pass, for comparison with the chunk total
len(tokenizer.tokenize(FileContent))

9562

# Get the inputs

In [34]:
# encode every chunk separately for the model, returning PyTorch tensors ("pt")

inputs = [tokenizer(chunk_text, return_tensors="pt") for chunk_text in chunks]

# Output

In [35]:
# Generate output for each chunk in turn and print the decoded text under a
# banner carrying the chunk's index. `enumerate` replaces the manual counter,
# and the loop variable no longer shadows the built-in `input`.
for j, chunk_inputs in enumerate(inputs):
  # unpack the tokenizer output into keyword arguments for generate()
  output = model.generate(**chunk_inputs)
  print('====================chunk ', j, '===============================')
  # each entry of `inputs` encodes a single chunk, so decode the first row
  print(tokenizer.decode(output[0], skip_special_tokens=True))



 Market research shows that TV remote control has a fancy look and feel, not a functional look or or feel, the number one thing that was found was that television remote control was not functional. Number two was that it be innovative without a adding unnecessary functional bits to it, and third priority is that it has to be user friendly while still having technology.
 Style is number one thing in the in the market of who we're selling to. Innovative design technology's also a must in that it's seen it'd be seen to be uh cutting edge, but ease of use t has to be insured throughout. And then at the end there are vibrant natural colours.
 We need to have something that unifies a lot of the different concepts, and if we think that what we are w our number one marketing motive is the look and feel. We are leaning quite a bit to the side of being low-tech, rubber buttons plastic frame, it's almost like we're reproducing the remote control that's out there. We're gonna need to put in a real