# YouTube Machine Learning Model
#### Created by Randhir and Andrew

Model that will take a $90\times120$ thumbnail JPEG and title from YouTube to output a video performance metric.
The metric will be 
$$Score=\frac{Amount\ of\ Likes}{Amount\ of\ Views}$$
The idea is how many people have seen this video and decided it desires a like

##### Imports

In [48]:
import string, re, os

import numpy as np
import pandas as pd

from PIL import Image
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, TextVectorization, Embedding, Dropout, Concatenate, Input
from keras import Model
from sklearn.model_selection import KFold


#### Constants

In [47]:
# File Structure
dirpath = "thumbnail"
datafile = "data-filtered.csv"

# Regex Patterns
emoji_re = "[\U000000A9-\U0010ffff]"
punc_re = f"[{re.escape(string.punctuation)}]"

# Download Stopwords & pattern
nltk.download('stopwords')
stopwords_list = stopwords.words("english")
sw_re = f'(?:{"|".join([f"{re.escape(sw)}" for sw in stopwords_list])})'

# Text Model Settings
max_features = 20000
embedding_dim = 128
sequence_length = 500

# KFold Settings
n_folds = 5

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Data Processing
This process involves text processing and image processing. This will involve text standardization and vectorization. For the image, it needs to be processed and normalized.

In [51]:
raw_data = pd.read_csv(datafile, index_col="yt-id")
raw_data.describe()

Unnamed: 0,thumbnail-w,thumbnail-h,view-count,like-count,comment-count
count,35757.0,35757.0,35299.0,34310.0,35026.0
mean,120.0,90.0,92815.97,2302.683,115.941187
std,0.0,0.0,1687363.0,31164.37,1640.474869
min,120.0,90.0,0.0,0.0,0.0
25%,120.0,90.0,30.0,1.0,0.0
50%,120.0,90.0,282.0,8.0,1.0
75%,120.0,90.0,3154.5,75.0,12.0
max,120.0,90.0,223299600.0,2686147.0,146332.0


In [29]:
# Text Processing
def text_standardization(raw_strs):
	lower = tf.strings.lower(raw_strs)
	emojiless = tf.strings.regex_replace(lower, emoji_re, "")
	stopwrdless = tf.strings.regex_replace(emojiless, sw_re, "")
	punctuationless = tf.strings.regex_replace(stopwrdless, punc_re, "")
	return punctuationless

# Vectorization Layer
vectorize_layer = TextVectorization(
    standardize=text_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [7]:
# Image Processing
files = [f for f in os.listdir(dirpath) if os.path.isfile(f"{dirpath}/{f}") and f.endswith(".jpg")]
images = np.zeros((len(files), 90, 120, 3))
image_ids = []
i = 0
for f in files:
	try:
		im = Image.open(f"{dirpath}/{f}")
		images[i] = np.array(im)
		image_ids.append(f[:-4])
		im.close()
	except:
		pass

	i += 1

# Normalize Pixels
images /= 255.0

In [85]:
# Label Processing
scores = raw_data["view-count"] # Grab View Count
scores = scores[image_ids] # Filter for images that we have
scores = scores.fillna(0.0) # Replace NaN with 0
scores = scores.map(lambda x : np.log10(x + 1)) # Log everything to make it less extreme
scores /= scores.max() # Normalized

scores.describe()

count    35734.000000
mean         0.306272
std          0.172596
min          0.000000
25%          0.173335
50%          0.290248
75%          0.416793
max          1.000000
Name: view-count, dtype: float64

##### Model

Commonly, the Sequential API is used to train a model. However, due to the need for more than one input, the Functional API must be employed.

In [46]:
# Image Portion
img_input = Input((90, 120, 3))
x = Conv2D(16, 3, activation='relu', kernel_initializer='he_uniform')(img_input)
x = MaxPooling2D()(x)
x = Conv2D(32, 3, activation='relu', kernel_initializer='he_uniform')(x)
x = MaxPooling2D()(x)
x = Conv2D(64, 3, activation='relu', kernel_initializer='he_uniform')(x)
x = MaxPooling2D()(x)
x = Flatten()(x)
x = Dense(100, activation='relu', kernel_initializer='he_uniform')(x)
img_output = Dense(1, activation='softmax')(x)

img_model = Model(inputs=img_input, outputs=img_output, name="img_model")

img_model.summary()
img_model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

Model: "img_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_16 (InputLayer)       [(None, 90, 120, 3)]      0         
                                                                 
 conv2d_16 (Conv2D)          (None, 88, 118, 16)       448       
                                                                 
 max_pooling2d_16 (MaxPooli  (None, 44, 59, 16)        0         
 ng2D)                                                           
                                                                 
 conv2d_17 (Conv2D)          (None, 42, 57, 32)        4640      
                                                                 
 max_pooling2d_17 (MaxPooli  (None, 21, 28, 32)        0         
 ng2D)                                                           
                                                                 
 conv2d_18 (Conv2D)          (None, 19, 26, 64)        18

In [39]:
# Text Portion
text_input = Input((1,), dtype=tf.string)
y = Embedding(max_features, embedding_dim)(text_input)
y = Dropout(0.5)(y)
y = Dense(100, activation='relu', kernel_initializer='he_uniform')(y)
text_output = Dense(1, activation='softmax')(y)

text_model = Model(inputs=text_input, outputs=text_output, name="text_model")

text_model.summary()

Model: "text_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 1)]               0         
                                                                 
 embedding_3 (Embedding)     (None, 1, 128)            2560000   
                                                                 
 dropout_3 (Dropout)         (None, 1, 128)            0         
                                                                 
 dense_16 (Dense)            (None, 1, 100)            12900     
                                                                 
 dense_17 (Dense)            (None, 1, 1)              101       
                                                                 
Total params: 2573001 (9.82 MB)
Trainable params: 2573001 (9.82 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [38]:
# United Model
z = Concatenate()([x, y])
z = Dense(10, activation='relu', kernel_initializer='he_uniform')(z)
z = Dense(1, activation='softmax')(z)

united_model = Model(inputs=[img_input, text_input], outputs=z, name="unitied_model")

united_model.summary()

ValueError: A `Concatenate` layer requires inputs with matching shapes except for the concatenation axis. Received: input_shape=[(None, 100), (None, 1, 100)]

#### Training
Using k-fold cross validation, we can judge the accuarcy of this model

In [86]:
# Image Model
kf = KFold(n_folds)

validation_accuracy = []
validation_loss = []

for train, val in kf.split(images, scores):
	pass

ValueError: Found input variables with inconsistent numbers of samples: [35757, 35734]