In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, TimeDistributed, Embedding, LSTM, Bidirectional, Dropout, Dense, GRU, concatenate, Conv1D, Conv2D, Flatten,MaxPooling1D, MaxPooling2D, LocallyConnected1D, Activation, GaussianNoise, BatchNormalization
from tensorflow.keras.models import Model
import nltk
from AttentionWeightedAverage import AttentionWeightedAverage
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer

- NLP on Title + Description to Tags, in order to build a tag recommendation algorithm
- NLP on Tags + Description + Category to Title, in order to provide title recommendation algorithm. These videos are popular videos, meaning they have appealing titles. Let's build a system, that judging by content, can provide the most appropriate title.

In [2]:
df = pd.read_csv("./data/USvideos.csv")
print('Number of youtube videos in dataframe: ', len(df.index))
df.head()

# count category occurences
occurences = df['category_id'].value_counts()
# category names
map_categories = {}
with open('data/US_category_id.json') as json_file:
    data_category = json.load(json_file)
    for category in data_category['items']:
        int_id = int(category['id'])
        occ = 0
        if int_id in occurences:
            occ = occurences[int_id]
        map_categories[int_id] = [category['snippet']['title'],occ]
# create new dataframe with categories and appearances
df_categories = pd.DataFrame.from_dict(map_categories, orient='index', columns=['category_name', 'count'])
# add category name on the main dataframe
for i in map_categories.keys():
    df.loc[df['category_id'] == i,'category_name'] = map_categories[i][0]

Number of youtube videos in dataframe:  40949


### Tags, Description and Category -> Auto generate Title

In [3]:
def discretize(tags):
    tags = tags.replace('"','')
    discrete_tags = tags.split('|')
    return discrete_tags

In [27]:
df_text = df[['title', 'description', 'tags', 'category_name']]
df_text = df_text.dropna(axis=0, how='any')

titles = df_text['title'].tolist()
descriptions = df_text['description'].tolist()
tags = [discretize(row['tags']) for index, row in df_text.iterrows()]
categories = df_text['category_name'].tolist()

- Titles need to be tokenized, lowercased, categorical.
- Descriptions need to be tokenized, lowercased, categorical.
- Tags need to be lowercased, categorical.

In [36]:
clean_desc = []
clean_title = []
for desc in descriptions:
    desc = desc.replace('\\n', ' ')
    desc = desc.lower()
    desc = desc.replace('www.facebook.com', 'FacebookLink')
    desc = desc.replace('http.facebook.com', 'FacebookLink')
    desc = desc.replace('www.twitter.com', 'TwitterLink')
    desc = desc.replace('www.amazon.com', 'AmazonLink')
    desc = desc.replace('http.amzn.com', 'AmazonLink')
    desc = desc.replace('www.instagram.com', 'InstagramLink')
    desc = desc.replace('www.snapchat.com', 'SnapchatLink')
    desc = desc.replace('https://bit.ly', 'PortalLink')
    clean_desc.append(text_to_word_sequence(desc, filters='!"#%&()*-+,.\\/:;<=>?@[\\]^_`{|}~\t\n♡▶➜', lower=True, split=' '))
    
for title in titles:
    title = title.replace('\\n', ' ')
    clean_title.append(text_to_word_sequence(title, filters='!"#%&()*-+,.\\/:;<=>?@[\\]^_`{|}~\t\n♡▶➜', lower=True, split=' '))

Training an unsupervised method for word2vec with gensim, in order to provide me with very good embeddings

In [38]:
from gensim.models import Word2Vec

gensim_model = Word2Vec(size=100,
                       window=5,
                       min_count=1,
                       sg=1,
                       hs=0,
                       negative=10,
                       workers=4)
gensim_model.build_vocab(clean_desc+clean_title+tags)
gensim_model.train(clean_desc+clean_title+tags, total_examples=len(clean_desc+clean_title+tags), epochs=10)

(68364433, 79944630)

Creating my new sequences with vector representations of words

In [53]:
X_train_description = []
X_train_category = []
X_train_tags = []
Y_train_title = []

lb = LabelBinarizer()
lb.fit(list(set(categories)))

maxLenDesc = 0
for desc in clean_desc:
    X_train_description.append(np.asarray([gensim_model.wv[word] for word in desc]))
    if len(desc) > maxLenDesc:
        maxLenDesc = len(desc)
        
maxLenTags = 0
for taglist in tags:
    X_train_tags.append(np.asarray([gensim_model.wv[tag] for tag in taglist]))
    if len(taglist) > maxLenTags:
        maxLenTags = len(taglist)

for cat in categories:
    X_train_tags.append(lb.transform([cat]))

maxLenTitle = 0
for title in clean_title:
    Y_train_title.append(np.asarray([gensim_model.wv[word] for word in title]))
    if len(title) > maxLenTitle:
        maxLenTitle = len(title)
        
# padding
X_train_description = pad_sequences(X_train_description, maxlen=maxLenDesc)
X_train_tags = pad_sequences(X_train_tags, maxlen=maxLenTags)
Y_train_title = pad_sequences(Y_train_title, maxlen=maxLenTitle)

# convert to numpy arrays
X_train_description = np.stack(X_train_description, axis=0)
X_train_category = np.stack(X_train_category, axis=0)
X_train_tags = np.stack(X_train_tags, axis=0)
Y_train_title = np.stack(Y_train_title, axis=0)

MemoryError: Unable to allocate 12.9 GiB for an array with shape (40379, 855, 100) and data type int32

#### Text GenRNN model, proposed by Andrej Karpathy.

In [8]:
def text_genrnn(vocab_size,maxLen,input_features):
    embedding = Input(shape=(maxLen,input_features), name='main_input')
    rnn_1 = LSTM(128)(embedding)
    rnn_2 = LSTM(128)(rnn_1)
    rnn_concat = concatenate(embedding, rnn_1, rnn_2)
    attention = AttentionWeightedAverage(name='attention')(rnn_concat)
    output = Dense(vocab_size, name='output', activation='softmax')(attention)
    
    model = Model(inputs=[input], outputs=[output])
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=4e-3))
    return model

In [None]:
epochs=10
batch_size=32
model.fit(X_train_description,Y_train_title,epochs=epochs,batch_size=batch_sizeverbose=1)