In this project we will create a Book-title generator system in the style of a given author. For this we will use titles of the published books by that author to generate a list of fake-titles that sound similar. We will then use GoodReads rating to calculate which of those titles are similar to the most popular books, and sort the list that way.

*Code based in the labs and the tutorial from https://towardsdatascience.com/how-to-fine-tune-gpt-2-for-text-generation-ae2ea53bc272*

# **Install, import and load dataset**

In [None]:
!pip install transformers 

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
import torch.nn.functional as F
import pandas as pd
import numpy as np
import random
import torch
import glob
import csv

In [3]:
# Create class to store book titles
class BookNames(Dataset):  
    def __init__(self, control_code, gpt2_type="gpt2"):

        # Store tokenized names
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.names = []

        for row in tqdm(names):
          # I don't think any name will have more than 1024 tokens
          self.names.append(torch.tensor(self.tokenizer.encode(f"{row}<|endoftext|>")))           
        self.names_count = len(self.names)
        
    def __len__(self):
        return self.names_count

    def __getitem__(self, item):
        return self.names[item]

**Load data**

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

# Load the GoodReads dataset
book_rating = pd.DataFrame()
for file in tqdm(glob.glob("gdrive/MyDrive/nlp-app-II/data/GoodReads/book*.csv")):
  df = pd.read_csv(file)
  if book_rating.empty:
      book_rating = df
  else:
      book_rating = book_rating.append(df, ignore_index=True)

# Select only titles in english
book_rating = book_rating[book_rating['Language']=='eng']

Mounted at /content/gdrive


100%|██████████| 23/23 [00:53<00:00,  2.33s/it]


In [5]:
# Select the colummns of book titles
names = book_rating['Name']
print(' Total number of book titles in english:', len(names))

# Create dataset class
dataset = BookNames(names)  

 Total number of book titles in english: 180338


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…

  0%|          | 0/180338 [00:00<?, ?it/s]




100%|██████████| 180338/180338 [00:31<00:00, 5800.48it/s]


# **Fine tune GPT-2**

We will fine-tune GPT-2 using all the book titles (in english) avaiable in the GoodReads dataset. Most of the code is taken from the article mentioned above. Training the model can take up to an hour; the last cells in this section are commented since I load load a pre-trained model stored in my drive folder in the next section.

In [6]:
#Get the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [7]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,
):
    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None

    return model

**Train and store model**

In [8]:
#model = GPT2LMHeadModel.from_pretrained('gpt2')
#model = train(dataset, model, tokenizer)

In [9]:
#torch.save(model, "/content/gdrive/MyDrive/nlp-app-II/model.tar")

# Load model

Loads a pre-trained model and defines the necesary functions to generate new book titles.

In [10]:
# Load model I pretrained previously
model = torch.load("/content/gdrive/MyDrive/nlp-app-II/model.tar")

**Define functions to generate titles**

In [11]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=20, #maximum number of words
    top_p=0.8,
    temperature=1.,
):
    model.eval()
    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in range(entry_count):

            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text)
                
    return generated_list

#Function to generate multiple titles.
def text_generation(input, rand, num):
  generated_titles = []
  for i in trange(num):
    x = generate(model.to('cpu'), tokenizer, '\n'.join(random.sample(list(input), rand)) + '\n', entry_count=1)
    generated_titles.append(x)
  return generated_titles

# **Generation of new titles**

**Select author and generate number of similar titles**

Change this cell:

In [12]:
author = 'Noam Chomsky'
generateNumber = 10

In [13]:
# Select rows of books written by that author
book_rating_author = book_rating[book_rating['Authors']==author]

# Check if we have books of that author
if book_rating_author.empty:
  print("\033[91m {}\033[00m" .format('Author not found in dataset '))
else:
  print("\033[92m {}\033[00m" .format('Author found!\t' + author))
  
  # Select the colummns of book titles
  names_author = book_rating_author['Name']

  # We will create a set to remove repeated entries in the dataset (Different publishers, editions...) Still, some books will be repeated
  names_author = set(names_author)

  print(' Total number of book titles in english:', len(names_author))

[92m Author found!	Noam Chomsky[00m
 Total number of book titles in english: 72


In [14]:
generate(model.to('cpu'), tokenizer, 'i am happy to be here =', entry_count=1)

['i am happy to be here =DDD\n\n\nwicked (Celeste Welch) - Do not Lace Your Clothes<|endoftext|>']

In [15]:
#Run the function to generate the names
rand = min(10, len(names_author)) # Number of book titles we will use to generate a new one, 10
generated_names = text_generation(names_author, rand, generateNumber) 

# Curate output
list_names = []
print('\n\nOutput directly from model:')
for title in generated_names:
  print(title)
  list_names.append(title[-1].split('<')[0].split('\n')[-1])

100%|██████████| 10/10 [00:48<00:00,  4.89s/it]



Output directly from model:
["Hegemony or Survival: America's Quest for Global Dominance\nThe Emerging Framework of World Power\nTurning the Tide: US Intervention in Central America & the Struggle for Peace\nRules and Representations: Woodbridge Lecture 11\nToward a New Cold War: Essays on the Current Crisis & How We Got There\nMiddle East Illusions: Including Peace in the Middle East?\nThe  Prosperous Few and the Restless Many (Real Story)\nNew War on Terrorism: Fact or Fiction\nFor Reasons of State\nThe Imperial Presidency\nThe Bottom Line<|endoftext|>"]
['Towards a New Cold War\nPirates and Emperors, Old and New\nMedia Control: The Spectacular Achievements of Propaganda\nPerspectives on Power: Reflections on Human Nature & the Social Order\nRethinking Camelot: JFK, the Vietnam War and US Political Culture\nAn American Addiction\nLatin America: From Colonization to Globalization\nThe Emerging Framework of World Power\nThe New Military Humanism: Lessons from Kosovo\nOn Democracy & E




**Print list of titles** Removing already existing ones

In [16]:
generated = list(set(list_names) - set(names))
generated

['Toward a New Cold War',
 'The Survival of Democracies',
 'In this Year',
 '"Linguistic Critique of Language and Its Relationship to Ideas and Interpretations"',
 'Topics in Logic',
 'The Bottom Line',
 'Empowering Others: Inspiring Educators',
 'Nonviolence in Political Culture',
 'Apostle',
 'The Birth of Nazism']

# Evaluation

Change next cell if you want to use only the top rated books to calculate most similar titles (True). Else, it will use all the books (False).

In [17]:
useRatings = False

# Number of books to take as the most popular ones
top = 10

Define a function that calculates a total rating value for each book:

*RatingValue = N5 * 5 + N4 * 4*

Where N5 and N4 are the total number of ratings of 5 and 4 stars respectively.


In [18]:
def mult(a, b):
  return int(a.split(':')[1]) * 5 + int(b.split(':')[1]) * 4

In [19]:
if useRatings:
  # Add column with calculated total rating value
  book_rating_author['RatingValue'] = book_rating_author.apply(lambda row : mult(row['RatingDist5'], row['RatingDist4']), axis = 1)

  # Sort books by rating value and remove duplicates
  ratings = book_rating_author.sort_values(by='RatingValue', ascending=False)
  ratings = ratings.drop_duplicates(subset=['Name'])

  if len(ratings) < top:
    top = len(ratings)

  names = ratings.head(top)['Name']
else:
  names = names_author

In [20]:
# Print selected titles
print(names)

{'Rethinking Camelot', 'What Uncle Sam Really Wants', 'Getting Haiti Right This Time: The U.S. and the Coup', 'Modular Approaches to the Study of the Mind (Distinguished Graduate Research Lecture)', 'Essays on Form and Interpretation (Studies in Linguistic Analysis 2)', 'Imperial Ambitions: Conversations on the Post-9/11 World', 'Acts of Aggression: Policing Rogue States', 'Middle East Illusions: Including Peace in the Middle East? Reflections on Justice and Nationhood.', "Hegemony or Survival: America's Quest for Global Dominance", 'For a Free Humanity: For Anarchy', 'The Emerging Framework of World Power', 'American Power and the New Mandarins', 'A New Generation Draws the Line', 'Rethinking Camelot: JFK, the Vietnam War and US Political Culture', 'The Essential Chomsky', 'New War on Terrorism: Fact or Fiction', 'World Orders, Old and New', 'Pirates and Emperors, Old and New', 'Year 501: The Conquest Continues', 'The Cold War & the University: Toward an Intellectual History of the Po

**Preprocess** both lists to calculate embedding of each title

In [21]:
generated_embeddings = []
for g in generated:
  # Tokenize and get weight/embedding of each token
  text_index = tokenizer.encode(g, add_prefix_space = True)
  vecto = model.transformer.wte.weight[text_index,:]

  # Average all tokens into a single vector (768 dim)
  generated_embeddings.append(vecto.mean(dim=0))

names_embeddings = []
for n in names:
  # Tokenize and get weight/embedding of each token
  text_index = tokenizer.encode('A Guide to Flying', add_prefix_space = True)
  vecto = model.transformer.wte.weight[text_index,:]

  # Average all tokens into a single vector (768 dim)
  names_embeddings.append(vecto.mean(dim=0))

**Calculate similarity**

In [22]:
generated_similarity = []
for g in generated_embeddings:
  maxSim = 0
  for n in names_embeddings:
    
    # Calculate similarity
    sim = torch.cosine_similarity(g,n, dim=0).item()

    # Save if max similarity
    if sim > maxSim:
      maxSim = sim

  # Save best similarity for current generated title
  generated_similarity.append(maxSim)

**Final results**

In [23]:
print("\033[94m{}\033[00m" .format('Author: ' + author))
print("\nSimil.\tGenerated title")
for elem in sorted(zip(generated_similarity, generated), reverse=True):
  print("{:.2f}\t".format(elem[0])+ elem[1])

[94mAuthor: Noam Chomsky[00m

Simil.	Generated title
0.72	"Linguistic Critique of Language and Its Relationship to Ideas and Interpretations"
0.65	The Birth of Nazism
0.64	Toward a New Cold War
0.62	Empowering Others: Inspiring Educators
0.62	In this Year
0.59	The Survival of Democracies
0.59	The Bottom Line
0.57	Nonviolence in Political Culture
0.55	Topics in Logic
0.33	Apostle
