# day 9 of #66daysofdata_NLP
## Machine Learning Model for `Title Generation`


* ref: 
    - [analyticsvidhya.com](https://www.analyticsvidhya.com/blog/2021/09/building-a-machine-learning-model-for-title-generation/)
* data set:
    - [Trending YouTube Video Statistics](https://www.kaggle.com/datasnaek/youtube-new) 
    - Context:
        - This dataset is a daily record of the top trending YouTube videos.
        - This dataset includes several months (and counting) of data on daily trending YouTube videos. Data is included for the US, GB, DE, CA, and FR regions (USA, Great Britain, Germany, Canada, and France, respectively), with up to 200 listed trending videos per day.


### Importing the necessary libraries

In [10]:
import pandas as pd
import string
import numpy as np
import json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import tensorflow.keras.utils as ku
import tensorflow as tf
tf.random.set_seed(2)
from numpy.random import seed
seed(1)

###  Load data set

In [2]:
# Load the per-region trending-video tables (US, CA and GB files).
df1 = pd.read_csv('../datasets/youtube/USvideos.csv')
df2 = pd.read_csv('../datasets/youtube/CAvideos.csv')
df3 = pd.read_csv('../datasets/youtube/GBvideos.csv')


def _read_json(path):
    """Parse the JSON file at ``path``, closing the file handle afterwards."""
    # Open explicitly as UTF-8 instead of relying on the platform default encoding,
    # and use a context manager so the handle is not left open.
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# Load the category listings that accompany each regional table.
data1 = _read_json('../datasets/youtube/US_category_id.json')
data2 = _read_json('../datasets/youtube/CA_category_id.json')
data3 = _read_json('../datasets/youtube/GB_category_id.json')

In [3]:
# Peek at the first two rows of the US video table and of the raw US category listing.
us_preview = df1.head(2)
category_preview = pd.DataFrame(data1).head(2)
display(us_preview, category_preview)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."


Unnamed: 0,kind,etag,items
0,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."
1,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."


### clean up and process the data:

* Now we must process our data so that it can be used to train our machine learning model for title generation.


In [4]:
def category_extractor(data):
    """Build an ``{int category id: category title}`` lookup from a category listing.

    ``data`` is a mapping whose ``'items'`` entry is a sequence of records; each
    record carries an ``'id'`` (converted with ``int``) and a ``'snippet'`` mapping
    that holds the ``'title'``. If an id occurs more than once, the last title wins.
    """
    return {int(item['id']): item['snippet']['title'] for item in data['items']}

# Attach a readable category name to each regional frame (added in place as a new
# column), using that region's own id -> title lookup.
for frame, categories in ((df1, data1), (df2, data2), (df3, data3)):
    frame['category_title'] = frame['category_id'].map(category_extractor(categories))

# Stack the three regions into one frame, then drop rows that repeat a video_id.
df = pd.concat([df1, df2, df3], ignore_index=True).drop_duplicates('video_id')

# Keep only the titles of videos in the 'Entertainment' category, as a plain list.
# Any other category name can be substituted here.
is_entertainment = df['category_title'] == 'Entertainment'
entertainment = df.loc[is_entertainment, 'title'].tolist()

In [5]:
# Wrap the list of titles in a single-column DataFrame so it is rendered as a table.
display(pd.DataFrame( {'category_title_Entertainment':entertainment}))

Unnamed: 0,category_title_Entertainment
0,The Trump Presidency: Last Week Tonight with J...
1,Nickelback Lyrics: Real or Fake?
2,I Dare You: GOING BALD!?
3,Roy Moore & Jeff Sessions Cold Open - SNL
4,(SPOILERS) 'Shiva Saves the Day' Talked About ...
...,...
9725,[SHINee - Good Evening] Comeback Stage | M COU...
9726,JUSTICE LEAGUE Is Better Than Infinity Wars | ...
9727,"Diddy & King Combs on The Four, Rap Beef, NFL ..."
9728,Hilary Duff Is Having a Baby Girl and Her Son ...


In [6]:
#remove punctuations and convert text to lowercase
def clean_text(text):
    """Normalise a title: delete punctuation, lowercase, then drop non-ASCII characters.

    Characters listed in ``string.punctuation`` are removed outright (they are not
    replaced by spaces), so the whitespace of the input is kept as it was.
    """
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    lowered = without_punctuation.lower()
    # Encode to UTF-8 bytes and decode as ASCII with errors ignored: every byte
    # belonging to a non-ASCII character is discarded.
    return lowered.encode('utf8').decode('ascii', 'ignore')

# Clean every entertainment title, keeping the original order.
corpus = list(map(clean_text, entertainment))

In [7]:
# Show the first three cleaned titles as a sanity check.
# Slicing stops after three items instead of walking the whole corpus, and a plain
# loop body replaces the conditional expression that was evaluated only for its
# side effect.
for i, sentence in enumerate(corpus[:3]):
    print(f"Sample sentence {i}: '{sentence}'")

Sample sentence 0: 'the trump presidency last week tonight with john oliver hbo'
Sample sentence 1: 'nickelback lyrics real or fake'
Sample sentence 2: 'i dare you going bald'


### Generating sequences using  Tokenizer  API

In [14]:
# Module-level tokenizer: get_sequence_of_tokens() fits it, and generate_text() reuses it.
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    """Turn each text in ``corpus`` into its growing token-id prefixes.

    Fits the module-level ``tokenizer`` on ``corpus`` (every call fits it again),
    then, for each text, emits the prefixes of length 2, 3, ... up to the full
    token list.

    Returns ``(input_sequences, total_words)`` where ``total_words`` is the size
    of ``tokenizer.word_index`` plus one.
    """
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for line in corpus:
        token_ids = tokenizer.texts_to_sequences([line])[0]
        # Prefixes start at two tokens, so a one-token text contributes nothing.
        input_sequences.extend(
            token_ids[:end] for end in range(2, len(token_ids) + 1)
        )

    return input_sequences, total_words
# Build the n-gram prefix sequences and the vocabulary size from the cleaned titles.
inp_sequences, total_words = get_sequence_of_tokens(corpus)

In [15]:
# Report how many n-gram sequences were produced and the value of total_words.
print(f"len of inp_sequences is '{len(inp_sequences)}' and the corpus has '{total_words}' words")

len of inp_sequences is '70438' and the corpus has '13915' words


In [16]:
# How does it work?
# Encode the sample with the ALREADY fitted tokenizer. Calling
# get_sequence_of_tokens() here would fit the shared tokenizer a second time on this
# sentence, changing its word counts (and possibly the word -> id mapping) after the
# training sequences were built from the earlier mapping.
print(f'sample corpus sentence:\n{corpus[0]}\nits sequence_of_tokens: ')
sample_token_ids = tokenizer.texts_to_sequences([corpus[0]])[0]
[sample_token_ids[:end] for end in range(2, len(sample_token_ids) + 1)]

sample corpus sentence:
the trump presidency last week tonight with john oliver hboits sequence_of_tokens: 


[[1, 87],
 [1, 87, 3083],
 [1, 87, 3083, 70],
 [1, 87, 3083, 70, 353],
 [1, 87, 3083, 70, 353, 1179],
 [1, 87, 3083, 70, 353, 1179, 11],
 [1, 87, 3083, 70, 353, 1179, 11, 135],
 [1, 87, 3083, 70, 353, 1179, 11, 135, 991],
 [1, 87, 3083, 70, 353, 1179, 11, 135, 991, 1432]]

### Padding the sequences 

In [17]:
def generate_padded_sequences(input_sequences):
    """Left-pad the n-gram sequences and split them into model inputs and one-hot labels.

    Reads the module-level ``total_words`` for the number of label classes.

    Returns ``(predictors, label, max_sequence_len)``: every token but the last of
    each padded row, the one-hot encoding of each row's last token, and the padded
    row length.
    """
    # Pad every sequence (at the front) up to the length of the longest one.
    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    padded = np.array(padded)

    # The final token of each row is the model's label for the next-word
    # prediction task; everything before it is the input.
    predictors = padded[:, :-1]
    next_word_ids = padded[:, -1]

    # One-hot encode the labels over total_words classes.
    label = ku.to_categorical(next_word_ids, num_classes=total_words)

    return predictors, label, max_sequence_len

# Build the model inputs, the one-hot labels and the padded length from the n-gram sequences.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [18]:
# Print the shapes of the input and label arrays and the padded sequence length.
print(f"Input shape: {predictors.shape}\nOutput shape: {label.shape}\nmax sequence length: {max_sequence_len}")

Input shape: (70438, 26)
Output shape: (70438, 13915)
max sequence length: 27


## LSTM Model for Title Generation
The model consists of four layers:

    Input (embedding) layer: takes the padded sequence of word indices as input
    LSTM layer: computes its output using LSTM units
    Dropout layer: a regularization layer to reduce overfitting
    Output layer: a softmax over the vocabulary that gives the probability of each possible next word



In [30]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    Args:
        max_sequence_len: Length of the padded n-gram rows; the model input is one
            token shorter.
        total_words: Size of the embedding vocabulary and of the softmax output.
        embedding_dim: Width of the embedding vectors (default 10, the value that
            was previously hard-coded).
        lstm_units: Number of LSTM units (default 100, as before).
        dropout_rate: Dropout rate applied after the LSTM (default 0.1, as before).

    Returns:
        A compiled ``Sequential`` model: Embedding -> LSTM -> Dropout -> Dense(softmax),
        using categorical cross-entropy loss and the Adam optimizer.
    """
    input_len = max_sequence_len - 1

    model = Sequential()

    # Input embedding layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Hidden LSTM layer followed by dropout
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Output layer: one softmax score per vocabulary index
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Build the network sized for the padded sequence length and the vocabulary size.
model = create_model(max_sequence_len, total_words)

# Train for 20 epochs on all (input, label) pairs; no validation data is passed.
model.fit(predictors, label, epochs=20,verbose=1)
print('Now that our title generator learning model is ready ')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Now that our title generator learning model is ready 


In [55]:
from packaging import version
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` with the model's most likely next words and title-case it.

    Args:
        seed_text: Starting words of the title.
        next_words: Controls how much text is generated; at most ``next_words - 1``
            words are appended (unchanged, so existing calls keep their title length).
        model: Trained model whose ``predict`` returns one score per vocabulary index.
        max_sequence_len: Padded sequence length used for training; the model input
            is ``max_sequence_len - 1`` tokens.

    Returns:
        The seed followed by the generated words, passed through ``str.title()``.
    """
    # Reverse lookup built once, instead of scanning word_index for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words - 1):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Taking the argmax of the predicted scores needs no TensorFlow version
        # switch and always yields a plain int (the legacy `predict_classes`
        # branch left an array here). verbose=0 keeps the cell output clean.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        output_word = index_to_word.get(predicted)
        if output_word is None:
            # No vocabulary word has this index. Appending nothing would leave the
            # tokenized input unchanged and repeat the same prediction, so stop
            # instead of padding the title with blanks.
            break

        seed_text += " " + output_word
    return seed_text.title()

In [60]:
# Seed phrases and, position by position, the `next_words` value passed for each one.
key_words = ['Drake', 'united states', 'Spider man', 'Donald Trump', 'Sara', 'Minnesota']
title_len = [5, 3, 5, 6, 4, 3]

# generate_text() loops `next_words - 1` times, so each printed title is the seed
# followed by at most t_len - 1 generated words.
for key_word, t_len in zip(key_words, title_len):
    print (f"Input: '{key_word}' ---> Output: '{generate_text(key_word, t_len, model, max_sequence_len)}'\n")


Input: 'Drake' ---> Output: 'Drake Talks On His Own'

Input: 'united states' ---> Output: 'United States Promo Scene'

Input: 'Spider man' ---> Output: 'Spider Man On The Breakfast Club'

Input: 'Donald Trump' ---> Output: 'Donald Trump To Missed In The World'

Input: 'Sara' ---> Output: 'Sara Williams Is Back'

Input: 'Minnesota' ---> Output: 'Minnesota Vs Williams'

