# Building the NLP model:

Now that we completed our EDA and checked our finindings using statistical tests, we can move on to our main task: Build a NLP model that takes in a video title as an input and predicts the engagement rate of the video.

In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, concatenate
from tensorflow.keras.models import Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Loading the data:

In [7]:
youtube_data = pd.read_csv('..\Data\processed_youtube.csv')

youtube_data.head(5)

Unnamed: 0,title,categoryName,view_count,likes,comment_count,time_diff,day_publish,engagement,any_emoji,title_length,processed_title,description_length,processed_description,processed_tags,total_tags
0,!@#$%$#!! || Dubov vs Carlsen || Airthings Mas...,Entertainment,13.297472,10.12194,7.681099,8 days,evening,1.338829,No,57,dubov vs carlsen airthings masters,3187,follow instagram extra content http_link dubov...,"agadmator,chess,best chess channel,best youtub...",9
1,#1 76ERS at #5 HAWKS | FULL GAME HIGHLIGHTS | ...,Sports,14.288667,9.723224,8.617581,4 days,morning,1.283591,No,59,ers hawks game highlights june,874,ers hawks game highlights june trailing pts se...,"Basketball,G League,NBA,game-0042000205",4
2,#1 BUCKS at #8 HEAT | FULL GAME 4 HIGHLIGHTS |...,Sports,14.636114,10.057152,8.189522,2 days,morning,1.246688,No,61,bucks heat game highlights april,1949,miss moment latest news trending stories highl...,"Basketball,G League,NBA,game-0042200104",4
3,#1 HEAT at #2 CELTICS | FULL GAME HIGHLIGHTS |...,Sports,14.758637,10.155879,8.169903,5 days,morning,1.241699,No,59,heat celtics game highlights,475,stay uptodate news live scores stats nba app h...,"Basketball,G League,NBA,game-0042100303",4
4,#1 HEAT at #2 CELTICS | FULL GAME HIGHLIGHTS |...,Sports,14.901673,10.374584,8.299037,5 days,morning,1.253122,No,59,heat celtics game highlights,443,stay uptodate news live scores stats nba appht...,"Basketball,G League,NBA,game-0042100306",4


In [22]:
# Preprocess the data
descs = [str(desc) for desc in youtube_data['processed_description']]
titles = youtube_data['title'].values
tags = youtube_data['processed_tags'].values
num_features = ['description_length', 'total_tags']
X_num = youtube_data[num_features].values
y = youtube_data['engagement'].values

In [23]:
# Tokenize and pad the video descriptions
max_desc_len = 1350
tokenizer = Tokenizer()
tokenizer.fit_on_texts(descs)
desc_seq = tokenizer.texts_to_sequences(descs)
desc_pad = pad_sequences(desc_seq, maxlen=max_desc_len, padding='post')

In [25]:
# Vectorize the video titles and tags
title_vec = TfidfVectorizer(max_features=5000).fit_transform(youtube_data['title']).toarray()

In [27]:
# Convert video tags to TF-IDF vectors
tags = youtube_data['processed_tags'].apply(lambda x: x.replace('non-en', '')).apply(lambda x: x if x != '' else np.nan)
tags_vec = TfidfVectorizer(max_features=5000).fit_transform(tags.dropna()).toarray()

In [28]:
# Scale the numeric features
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

In [29]:
desc_input = Input(shape=(max_desc_len,), name='desc_input')
desc_emb = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=32, input_length=max_desc_len, name='desc_emb')(desc_input)
desc_lstm = LSTM(64, name='desc_lstm')(desc_emb)
title_input = Input(shape=(5000,), name='title_input')
tags_input = Input(shape=(5000,), name='tags_input')
num_input = Input(shape=(2,), name='num_input')
x = concatenate([desc_lstm, title_input, tags_input, num_input], axis=1)
x = Dense(64, activation='relu')(x)
output = Dense(1, activation='linear')(x)
model = Model(inputs=[desc_input, title_input, tags_input, num_input], outputs=output)
model.compile(loss='mse', optimizer='adam')

In [31]:
# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
model.fit({'desc_input': desc_pad, 'title_input': title_vec, 'tags_input': tags_vec, 'num_input': X_num_scaled},
          youtube_data['engagement'], epochs=10, batch_size=32)

207/996 [=====>........................] - ETA: 6:25 - loss: 0.1056