In [1]:
!pip install rouge_score
!pip install accelerate -U

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=742c7666936d17e8ffe901b42be8b302705df307e13caee2985da2e5290ecfa9
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting 

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# load datasets
children_stories_df = pd.read_csv('/content/drive/My Drive/AppliedCV_data/children_stories.csv', encoding='ISO-8859-1')
children_books_df = pd.read_csv('/content/drive/My Drive/AppliedCV_data/children_books.csv', encoding='ISO-8859-1')


In [5]:
print(children_stories_df.head())

                            names     cats  \
0                   HIDE AND SEEK  Age 2-9   
1             GINGER THE GIRAFFE   Age 2-9   
2                 DOING MY CHORES  Age 2-9   
3             ABE THE SERVICE DOG  Age 2-9   
4   SUNNY MEADOWS WOODLAND SCHOOL  Age 2-9   

                                                desc  
0  Was it just another game of hide and seek? No....  
1  Read this warm tale of camaraderie and affecti...  
2  Love shines through this great illustrated kid...  
3  Abe was a real Service Dog who dedicated his l...  
4  The class took a little train and went deep in...  


In [6]:
children_stories_df = children_stories_df.drop(columns=['cats'])

In [7]:
print(children_stories_df.head())

                            names  \
0                   HIDE AND SEEK   
1             GINGER THE GIRAFFE    
2                 DOING MY CHORES   
3             ABE THE SERVICE DOG   
4   SUNNY MEADOWS WOODLAND SCHOOL   

                                                desc  
0  Was it just another game of hide and seek? No....  
1  Read this warm tale of camaraderie and affecti...  
2  Love shines through this great illustrated kid...  
3  Abe was a real Service Dog who dedicated his l...  
4  The class took a little train and went deep in...  


In [8]:
print(children_books_df.head())

                         Title  \
0  The Girl Who Drank the Moon   
1              Time Between Us   
2            Girl Out of Water   
3                      Captive   
4          The School of Music   

                                              Author  \
0                                     Kelly Barnhill   
1                               Tamara Ireland Stone   
2                                      Nat Luurtsema   
3                                       A J Grainger   
4  Rachel Bowen and Meurig Bowen Illustrator: Dan...   

                                                Desc Inerest_age  \
0  Every year, the evil Protectorate offers a bab...      10-14    
1  Sixteen-year-old Anna is struggling to underst...        13+    
2  Lou Brown's swimming ambitions sank without tr...      13-18    
3  Robyn is scared. Ever since the attempted assa...        13+    
4  Welcome to the School of Music. In charge is M...        10+    

                            Reading_age  
0  10-1

In [9]:
children_books_df = children_books_df.drop(columns=['Author', 'Inerest_age', 'Reading_age'])

In [10]:
print(children_books_df.head())

                         Title  \
0  The Girl Who Drank the Moon   
1              Time Between Us   
2            Girl Out of Water   
3                      Captive   
4          The School of Music   

                                                Desc  
0  Every year, the evil Protectorate offers a bab...  
1  Sixteen-year-old Anna is struggling to underst...  
2  Lou Brown's swimming ambitions sank without tr...  
3  Robyn is scared. Ever since the attempted assa...  
4  Welcome to the School of Music. In charge is M...  


In [11]:
# Rename columns to "Title" and "Desc"
children_stories_df.rename(columns={'names': 'Title', 'desc': 'Desc'}, inplace=True)

In [12]:
combined_df = pd.concat([children_books_df, children_stories_df], ignore_index=True)
print(combined_df.head())

                         Title  \
0  The Girl Who Drank the Moon   
1              Time Between Us   
2            Girl Out of Water   
3                      Captive   
4          The School of Music   

                                                Desc  
0  Every year, the evil Protectorate offers a bab...  
1  Sixteen-year-old Anna is struggling to underst...  
2  Lou Brown's swimming ambitions sank without tr...  
3  Robyn is scared. Ever since the attempted assa...  
4  Welcome to the School of Music. In charge is M...  


In [13]:
null_count = combined_df['Title'].isnull().sum()
print("Number of null values in the 'Title' column:", null_count)

null_count = combined_df['Desc'].isnull().sum()
print("Number of null values in the 'Desc' column:", null_count)

Number of null values in the 'Title' column: 0
Number of null values in the 'Desc' column: 0


In [14]:
print(combined_df)

                                                 Title  \
0                          The Girl Who Drank the Moon   
1                                      Time Between Us   
2                                    Girl Out of Water   
3                                              Captive   
4                                  The School of Music   
...                                                ...   
3694  Carrying the Elephant: A Memoir of Love and Loss   
3695                                      War and Peas   
3696                                     Love that Dog   
3697                              A Pilgrim's Progress   
3698                                    Secret Friends   

                                                   Desc  
0     Every year, the evil Protectorate offers a bab...  
1     Sixteen-year-old Anna is struggling to underst...  
2     Lou Brown's swimming ambitions sank without tr...  
3     Robyn is scared. Ever since the attempted assa...  
4     Welcome

In [15]:
book_details_df = pd.read_csv('/content/drive/My Drive/AppliedCV_data/book_details.csv', encoding='unicode_escape')


In [16]:
print(book_details_df.head())

                    title           author  rating  no_of_ratings  \
0               Divergent    Veronica Roth    4.15        3765886   
1           Catching Fire  Suzanne Collins    4.31        3305054   
2  The Fault in Our Stars       John Green    4.15        4851513   
3   To Kill a Mockingbird       Harper Lee    4.27        5784553   
4     The Lightning Thief     Rick Riordan    4.30        2752945   

  no_of_reviews                                        description  \
0       117,791  In Beatrice Prior's dystopian Chicago world, s...   
1       113,480  Sparks are igniting.Flames are spreading.And t...   
2       174,662  Despite the tumor-shrinking medical miracle th...   
3       112,055  The unforgettable novel of a childhood in a sl...   
4        87,446  Alternate cover for this ISBN can be found her...   

                                              genres  
0  Young Adult, Dystopia, Fantasy, Fiction, Scien...  
1  Young Adult, Dystopia, Fiction, Fantasy, Scien...  

In [17]:
relevant_genres = ['Young Adult', 'Young Adult Paranormal', 'Kids', 'Childrens Classics',
    'School Stories', 'Teen', 'Middle Grade', 'Childrens',
    'Young Adult Fantasy', 'Young Adult Romance'
]
genre_regex = '|'.join(relevant_genres)

filtered_df = book_details_df[book_details_df['genres'].str.contains(genre_regex, case=False, na=False)]
print(filtered_df)

                            title                 author  rating  \
0                       Divergent          Veronica Roth    4.15   
1                   Catching Fire        Suzanne Collins    4.31   
2          The Fault in Our Stars             John Green    4.15   
3           To Kill a Mockingbird             Harper Lee    4.27   
4             The Lightning Thief           Rick Riordan    4.30   
...                           ...                    ...     ...   
13278                   Wet Magic              E. Nesbit    3.75   
13284  The Ghost of Dibble Hollow  May Nickerson Wallace    4.41   
13295           The Foreshadowing        Marcus Sedgwick    3.80   
13321                      Hatter        Daniel  Coleman    3.90   
13322                     Shut Up           Anne Tibbets    3.97   

       no_of_ratings no_of_reviews  \
0            3765886       117,791   
1            3305054       113,480   
2            4851513       174,662   
3            5784553       112,

In [18]:
filtered_df = filtered_df[['title', 'description']]

In [19]:
print(filtered_df.head())

                    title                                        description
0               Divergent  In Beatrice Prior's dystopian Chicago world, s...
1           Catching Fire  Sparks are igniting.Flames are spreading.And t...
2  The Fault in Our Stars  Despite the tumor-shrinking medical miracle th...
3   To Kill a Mockingbird  The unforgettable novel of a childhood in a sl...
4     The Lightning Thief  Alternate cover for this ISBN can be found her...


In [20]:
filtered_df.rename(columns={'title': 'Title', 'description': 'Desc'}, inplace=True)
combined_df = pd.concat([combined_df, filtered_df], ignore_index=True)
print(combined_df)

                            Title  \
0     The Girl Who Drank the Moon   
1                 Time Between Us   
2               Girl Out of Water   
3                         Captive   
4             The School of Music   
...                           ...   
6547                    Wet Magic   
6548   The Ghost of Dibble Hollow   
6549            The Foreshadowing   
6550                       Hatter   
6551                      Shut Up   

                                                   Desc  
0     Every year, the evil Protectorate offers a bab...  
1     Sixteen-year-old Anna is struggling to underst...  
2     Lou Brown's swimming ambitions sank without tr...  
3     Robyn is scared. Ever since the attempted assa...  
4     Welcome to the School of Music. In charge is M...  
...                                                 ...  
6547  When four siblings journey to the seashore for...  
6548  Out of the graveyard comes a ghost--the ghost ...  
6549  It is 1915 and the First Wo

In [21]:
combined_df = combined_df.drop_duplicates(subset=['Title', 'Desc'])

In [22]:
print(combined_df)

                            Title  \
0     The Girl Who Drank the Moon   
1                 Time Between Us   
2               Girl Out of Water   
3                         Captive   
4             The School of Music   
...                           ...   
6547                    Wet Magic   
6548   The Ghost of Dibble Hollow   
6549            The Foreshadowing   
6550                       Hatter   
6551                      Shut Up   

                                                   Desc  
0     Every year, the evil Protectorate offers a bab...  
1     Sixteen-year-old Anna is struggling to underst...  
2     Lou Brown's swimming ambitions sank without tr...  
3     Robyn is scared. Ever since the attempted assa...  
4     Welcome to the School of Music. In charge is M...  
...                                                 ...  
6547  When four siblings journey to the seashore for...  
6548  Out of the graveyard comes a ghost--the ghost ...  
6549  It is 1915 and the First Wo

In [23]:
# Drop rows where 'Desc' column is None
combined_df = combined_df.dropna(subset=['Desc'])

In [24]:
print(combined_df.head())

                         Title  \
0  The Girl Who Drank the Moon   
1              Time Between Us   
2            Girl Out of Water   
3                      Captive   
4          The School of Music   

                                                Desc  
0  Every year, the evil Protectorate offers a bab...  
1  Sixteen-year-old Anna is struggling to underst...  
2  Lou Brown's swimming ambitions sank without tr...  
3  Robyn is scared. Ever since the attempted assa...  
4  Welcome to the School of Music. In charge is M...  


In [25]:
# preprocess text
import re
# Remove undesired special characters but keep alphanumeric, spaces, and common punctuation
combined_df['Desc'] = combined_df['Desc'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s,.?!;:()-]', ' ', x))

# Substitute multiple spaces with a single space
combined_df['Desc'] = combined_df['Desc'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# Display the cleaned DataFrame
print(combined_df)

                            Title  \
0     The Girl Who Drank the Moon   
1                 Time Between Us   
2               Girl Out of Water   
3                         Captive   
4             The School of Music   
...                           ...   
6547                    Wet Magic   
6548   The Ghost of Dibble Hollow   
6549            The Foreshadowing   
6550                       Hatter   
6551                      Shut Up   

                                                   Desc  
0     Every year, the evil Protectorate offers a bab...  
1     Sixteen-year-old Anna is struggling to underst...  
2     Lou Brown s swimming ambitions sank without tr...  
3     Robyn is scared. Ever since the attempted assa...  
4     Welcome to the School of Music. In charge is M...  
...                                                 ...  
6547  When four siblings journey to the seashore for...  
6548  Out of the graveyard comes a ghost--the ghost ...  
6549  It is 1915 and the First Wo

In [26]:
#spli into dev and test
dev_df, test_df = train_test_split(combined_df, test_size=0.20, random_state=42)
# split dev into train and validation
train_df, val_df = train_test_split(dev_df, test_size=0.25, random_state=42)


print("Training Set Shape:", train_df.shape)
print("Validation Set Shape:", val_df.shape)
print("Test Set Shape:", test_df.shape)

Training Set Shape: (1968, 2)
Validation Set Shape: (656, 2)
Test Set Shape: (657, 2)


In [27]:
class T5BookDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, dataframe, max_length=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.title_texts = self.data['Title'].tolist()
        self.desc_texts = self.data['Desc'].tolist()
        self.max_length = max_length

    def __len__(self):
        return len(self.desc_texts)

    def __getitem__(self, idx):
        source_encoding = self.tokenizer(
            f"generate title: {self.desc_texts[idx]}",
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            self.title_texts[idx],
            max_length=64,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        source_ids = source_encoding['input_ids'].squeeze()
        target_ids = target_encoding['input_ids'].squeeze()

        return {"input_ids": source_ids, "attention_mask": source_encoding['attention_mask'].squeeze(), "labels": target_ids}

# Tokenizer and model setup
tokenizer = T5Tokenizer.from_pretrained('t5-base')
train_dataset = T5BookDataset(tokenizer, train_df)
val_dataset = T5BookDataset(tokenizer, val_df)
test_dataset = T5BookDataset(tokenizer, test_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
from transformers import AdamW
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

# Use AdamW Optimizer with lr=5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


# train and validation dataloader
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

cuda


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [29]:
# Training loop

# try 5 epochs
num_epochs = 5
best_val_loss = float('inf')
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Average Training Loss: {avg_train_loss:.4f}")

    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validating Epoch {epoch + 1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss = outputs.loss
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch + 1} - Average Validation Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained('./best_model')

Training Epoch 1:   0%|          | 0/492 [00:00<?, ?it/s]

Epoch 1 - Average Training Loss: 0.8302


Validating Epoch 1:   0%|          | 0/164 [00:00<?, ?it/s]

Epoch 1 - Average Validation Loss: 0.2420


Training Epoch 2:   0%|          | 0/492 [00:00<?, ?it/s]

Epoch 2 - Average Training Loss: 0.2487


Validating Epoch 2:   0%|          | 0/164 [00:00<?, ?it/s]

Epoch 2 - Average Validation Loss: 0.2289


Training Epoch 3:   0%|          | 0/492 [00:00<?, ?it/s]

Epoch 3 - Average Training Loss: 0.2276


Validating Epoch 3:   0%|          | 0/164 [00:00<?, ?it/s]

Epoch 3 - Average Validation Loss: 0.2263


Training Epoch 4:   0%|          | 0/492 [00:00<?, ?it/s]

Epoch 4 - Average Training Loss: 0.2108


Validating Epoch 4:   0%|          | 0/164 [00:00<?, ?it/s]

Epoch 4 - Average Validation Loss: 0.2235


Training Epoch 5:   0%|          | 0/492 [00:00<?, ?it/s]

Epoch 5 - Average Training Loss: 0.1928


Validating Epoch 5:   0%|          | 0/164 [00:00<?, ?it/s]

Epoch 5 - Average Validation Loss: 0.2235


In [30]:
# method to generate predicted text output using T5 model
def generate_text(input_ids, model, tokenizer, device):
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=input_ids.ne(tokenizer.pad_token_id),
        max_length=64,
        num_beams=5,
        no_repeat_ngram_size=2
    )
    # return the decded strings for each output in the batch, remove special tokens and clean up spaces
    return [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in outputs]


In [31]:
# load the best model
model = T5ForConditionalGeneration.from_pretrained('./best_model').to(device)
#Create test dataloader
test_loader = DataLoader(test_dataset, batch_size=4)

rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

model.eval()
bleu_references = []
bleu_candidates = []
rouge_scores = []

# evaluation loop
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        predictions = generate_text(input_ids, model, tokenizer, device)
        references = [tokenizer.decode(l, skip_special_tokens=True, clean_up_tokenization_spaces=True) for l in labels]

        # Collect references and candidates where both exist
        for ref, pred in zip(references, predictions):
            if ref and pred:  # Make sure neither is empty
                bleu_references.append([ref.split()])
                bleu_candidates.append(pred.split())
                scores = rouge.score(ref, pred)
                rouge_scores.append(scores)

if bleu_references and bleu_candidates:
    bleu_score = corpus_bleu(bleu_references, bleu_candidates)
    avg_rouge1 = sum(score['rouge1'].fmeasure for score in rouge_scores) / len(rouge_scores)
    avg_rougeL = sum(score['rougeL'].fmeasure for score in rouge_scores) / len(rouge_scores)
    print(f"Average BLEU Score: {bleu_score:.4f}")
    print(f"Average ROUGE-1 F1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-L F1: {avg_rougeL:.4f}")
else:
    print("No valid predictions and references were found for evaluation.")

Testing:   0%|          | 0/165 [00:00<?, ?it/s]

Average BLEU Score: 0.1609
Average ROUGE-1 F1: 0.3389
Average ROUGE-L F1: 0.3352


In [32]:
def generate_book_title(description, model, tokenizer, device):
    # Prepare the text input by adding the appropriate "generate title:" prefix and encoding it
    input_text = "generate title: " + description
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

    # Generate outputs using the model
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_length=64,
            # use beam search
            num_beams=10,
            no_repeat_ngram_size=2
        )

    # Decode the generated id to a string
    title = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return title


In [33]:
# Example usage
description = "It follows the Grinch, a cranky, solitary creature who attempts to thwart the public's Christmas plans by stealing Christmas gifts and decorations from the homes of the nearby town on Christmas Eve."
model = T5ForConditionalGeneration.from_pretrained('./best_model').to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-base')

generated_title = generate_book_title(description, model, tokenizer, device)
print("Generated Title:", generated_title)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Title: The Grinch


In [34]:
# Example usage
description = "The book is about 11 year old Harry Potter, who receives a letter saying that he is invited to attend Hogwarts, school of witchcraft and wizardry. He then learns that a powerful wizard and his minions are after the sorcerer's stone that will make this evil wizard immortal and undefeatable."
generated_title = generate_book_title(description, model, tokenizer, device)
print("Generated Title:", generated_title)

Generated Title: Harry Potter and the Sorcerer's Stone


In [35]:
!zip -r best_model.zip ./best_model

  adding: best_model/ (stored 0%)
  adding: best_model/config.json (deflated 63%)
  adding: best_model/model.safetensors (deflated 8%)
  adding: best_model/generation_config.json (deflated 29%)


In [36]:
!cp best_model.zip '/content/drive/MyDrive'