# YouTube Machine Learning Model
#### Created by Randhir and Andrew

Model that will take a $90\times120$ thumbnail JPEG and title from YouTube to output a video performance metric.
The metric will be 
$$Score=\frac{Amount\ of\ Likes}{Amount\ of\ Views}$$
The idea is how many people have seen this video and decided it desires a like

##### Imports

In [None]:
import string, re, os

import numpy as np
import pandas as pd

from PIL import Image
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, TextVectorization, Embedding, Dropout, Concatenate, Input

#### Constants

In [None]:
# File Structure
dirpath = "thumbnail"

# Regex Patterns
emoji_re = re.compile(pattern="[\U000000A9-\U0010ffff]", flags = re.UNICODE)
punc_re = re.compile(pattern=f"[{re.escape(string.punctuation)}]", flags = re.UNICODE)

# Download Stopwords & pattern
nltk.download('stopwords')
stopwords_list = stopwords.words("english")
sw_re = re.compile(pattern=f'(?:{"|".join([f"{re.escape(sw)}" for sw in stopwords_list])})', flags = re.UNICODE)

# Text Model Settings
max_features = 20000
embedding_dim = 128
sequence_length = 500

[!"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Data Processing
This process involves text processing and image processing. This will involve text standardization and vectorization. For the image, it needs to be processed and normalized.

In [None]:
# Text Processing
def text_standardization(raw_strs):
	lower = tf.strings.lower(raw_strs)
	emojiless = tf.strings.regex_replace(lower, emoji_re, "")
	stopwrdless = tf.strings.regex_replace(emojiless, sw_re, "")
	punctuationless = tf.strings.regex_replace(stopwrdless, punc_re, "")
	return punctuationless

# Vectorization Layer
vectorize_layer = TextVectorization(
    standardize=text_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

#

In [None]:
# Image Processing
files = [f for f in os.listdir(dirpath) if os.path.isfile(f"{dirpath}/{f}") and f.endswith(".jpg")]
images = np.zeros((len(files), 90, 120, 3))
image_ids = []
i = 0
for f in files:
	try:
		im = Image.open(f"{dirpath}/{f}")
		images[i] = np.array(im)
		image_ids.append(f[:-4])
		im.close()
	except:
		pass

	i += 1

# Normalize Pixels
images /= 256.0

In [None]:
print(image_ids)

df = pd.read_csv("data-processed.csv", index_col="yt-id")

df.loc[image_ids]

['--0Ox-hrejo', '--1xCtsh7Us', '--6a9YSe9qA', '--bVMMX-1CQ', '--BXP_ke0CM', '--I2s-FJHuU', '--MHDl8Vpvc', '--OO24STc8M', '--sVtMjIRaM', '--UIFO_XFCc', '--V6L-rKtYw', '--YhPa7uAAs', '--_V4F81pLw', '-0HQ6ANwqN8', '-0HV9Yeheic', '-0iH6trp7C4', '-0jnQlYr8XM', '-0jQ-wq1cYU', '-0M8xzPW2UU', '-0Mf-O0YAqw', '-0nO-d3KeoI', '-0o9-liMIFQ', '-0OX-gWtXKs', '-0TXol325yY', '-1-Pi3-Hx3w', '-16_MXV5Rvw', '-1BXD0KD13I', '-1Dd-ZBeILQ', '-1EkI1_ZvXw', '-1kW3c3Vh-8', '-1meBI15_aw', '-1NU88QC3uE', '-1ogkubvZy8', '-1rakblqEeg', '-1XdRHB-i8Y', '-1yKeKFXAtE', '-2AlBswOsRs', '-2Cug87W6WA', '-2eeRukv7RY', '-2f2-ckHU9c', '-2igPlVn65A', '-2kW03aqiYQ', '-2LfoDlElPo', '-2Oi-0HfTOg', '-2Pu-UJN5_4', '-2QNgzNpMB0', '-2QSN1z2RPQ', '-2r-Ia3-FIs', '-2r9wZjckOg', '-2WfSZM-7tk', '-2Wm-QN2yME', '-2XDftG8GAY', '-2Zf5bGdIAU', '-31dLNjusEc', '-3bnPtCuy7g', '-3eq0qu-ipM', '-3FIrcLxQNM', '-3jr-FLwxLs', '-3JZ-3VCWtA', '-3jZ-Gyu15Y', '-3okSz5-K0o', '-3UIUpnp85M', '-3Vr-jHtreA', '-3Vr-ltBEOk', '-3XGYMr-D3I', '-3YN-PsLwMM', '-3_7LmpR

71468

##### Model

Commonly, the Sequential API is used to train a model. However, due to the need for more than one input, the Functional API must be employed.

In [18]:
# Image Portion
img_model = Input((90, 120, 3))
img_model = Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform')(img_model)
img_model = MaxPooling2D((2, 2))(img_model)
img_model = Flatten()(img_model)
img_model = Dense(100, activation='relu', kernel_initializer='he_uniform')(img_model)

# Text Portion
text_model = Input((1,))
text_model = Embedding()(text_model)
text_model = Dropout(0.5)(text_model)
text_model = Dense(100, activation='relu', kernel_initializer='he_uniform')(text_model)

# United Model
united_model = Concatenate()([img_model, text_model])
united_model = Dense(10, activation='relu', kernel_initializer='he_uniform')(united_model)
united_model = Dense(1, activation='softmax')(united_model)

united_model.compile()

NameError: name 'Embedding' is not defined