# YouTube Machine Learning Model
#### Created by Randhir and Andrew

Model that will take a $90\times120$ thumbnail JPEG and title from YouTube to output a video performance metric.
The metric will be 
$$Score=\log{(View\ Count + 1)}$$
The idea is that the video that attracted more views is a good video. The value is log-scaled as the higher the view count, the less meaningful it becomes. This value will be normalized with the maximum value in the dataset.

##### Imports

In [29]:
import string, re, os, json, random
import urllib.request
import urllib.error
from pathlib import Path

import numpy as np
import pandas as pd

from PIL import Image
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from keras.backend import clear_session
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, TextVectorization, Embedding, Dropout, Concatenate, Input
from keras import Model
from sklearn.model_selection import KFold

from dotenv import load_dotenv

from datagen import ThumbnailDataGenerator

# Load .env file with your api key
load_dotenv()

True

#### Constants
This cell contains the constants used by this model

In [4]:
# File Structure
dirpath = "thumbnail"
modeldir = "models"
datafile = "data-filtered.csv"

# Data Aquisition
filepath = "data.csv"
count = 50
max_iterations = 100 # 50 * 100 = 5000 videos
topic_id = "/m/03hf_rm" # Strategy Games
lang = "en"
API_KEY = os.getenv("APIKEY")

# Regex Patterns
emoji_re = "[\U000000A9-\U0010ffff]"
punc_re = f"[{re.escape(string.punctuation)}]"

# Download Stopwords & pattern
nltk.download('stopwords')
stopwords_list = stopwords.words("english")
sw_re = f'(?:{"|".join([f"{re.escape(sw)}" for sw in stopwords_list])})'

# Text Model Settings
max_features = 20000
embedding_dim = 128
sequence_length = 500

# KFold Settings
n_folds = 5
epochs = 1000
batch_size = 30

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Data Aquisition
The YouTube API is used to get video data. This includes a video's thumbnail and metrics.

In [None]:
# Check if data file already exist
if os.path.isfile(filepath):
    df = pd.read_csv(filepath, index_col="yt-id")
else:
    df = pd.DataFrame([], columns=["yt-id", "title", "created", "channel-id", "thumbnail", "thumbnail-w", "thumbnail-h", "view-count", "like-count", "comment-count", "query"])
    df = df.set_index("yt-id")
    
# Grab missing data IDs for query
yt_ids = list(df[df["view-count"].isna()].index)

# Loop
yt_reads = 0
for i in range(max_iterations):
    try:
        # Check if any stats calls are needed
        if len(yt_ids) > 0:
            # Message 
            print("Pulling statistics for missing data values")

            # Split up batch by 50 if needed
            for index_split in range(50, len(yt_ids) + 1, 50):
                # Generate & call statistic query (1 unit)
                urlData_stats = f"https://www.googleapis.com/youtube/v3/videos?key={API_KEY}&part=statistics&id={','.join(yt_ids[index_split - 50:index_split])}"
                webURL_stats = urllib.request.urlopen(urlData_stats)
                raw_stats_data = webURL_stats.read()
                results_stats = json.loads(raw_stats_data.decode(webURL_stats.info().get_content_charset('utf-8')))

                # Process Stats Response
                for stats_data in results_stats["items"]:
                    try:
                        # Parse data
                        new_row = pd.DataFrame([{
                            "yt-id": stats_data['id'],
                            "view-count": stats_data['statistics']['viewCount'],
                            "like-count": stats_data['statistics']['likeCount'] if 'likeCount' in stats_data['statistics'] else "",
                            "comment-count": stats_data['statistics']['commentCount'] if 'commentCount' in stats_data['statistics'] else "",
                        },])
                        new_row = new_row.set_index("yt-id")

                        # Update main dataset
                        df.update(new_row)
                    except KeyError:
                        # Weird Entry
                        continue

            # Reset after used
            yt_ids = [] 

            # Message 
            print("Finished pulling statistics for current batch")

        # Message
        print(f"Pulling {count} random videos")

        # Generates random query for YT
        r_q = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(3))

        # Calls the API for search results (100 units)
        urlData_query = f"https://www.googleapis.com/youtube/v3/search?key={API_KEY}&maxResults={count}&part=snippet&type=video&relevanceLanguage={lang}&topicId={topic_id}&q={r_q}"
        webURL_query = urllib.request.urlopen(urlData_query)
        raw_vid_data = webURL_query.read()
        results_vids = json.loads(raw_vid_data.decode(webURL_query.info().get_content_charset('utf-8')))

        # Process Video Response
        for video_data in results_vids['items']:
            # Ignore Live and Upcoming Content (no ratings yet)
            if video_data['snippet']['liveBroadcastContent'] != "none":
                continue

            # Parse data
            try:
                new_row = pd.DataFrame([{
                    "yt-id": video_data['id']['videoId'],
                    "title": video_data['snippet']['title'],
                    "created": video_data['snippet']['publishedAt'],
                    "channel-id": video_data['snippet']['channelId'],
                    "thumbnail": video_data['snippet']['thumbnails']["default"]["url"],
                    "thumbnail-w": video_data['snippet']['thumbnails']["default"]["width"],
                    "thumbnail-h": video_data['snippet']['thumbnails']["default"]["height"],
                    "query": r_q,
                },])
                new_row = new_row.set_index("yt-id")

                try:
                    # Append
                    df = pd.concat([df, new_row], verify_integrity=True)

                    # Store your ids
                    yt_reads += 1

                    # Prepare id for stats query
                    yt_ids.append(video_data['id']['videoId'])
                except ValueError:
                    # Duplicate video detected
                    continue
            except KeyError:
                # Weird Entry
                continue

        # Update User
        print(f"API call #{i + 1} successfully")

        # Dumb Data to prevent loss every 5 runs
        if i % 5 == 0:
            df.to_csv(filepath)

    # ON API failure, quit and save
    except urllib.error.HTTPError:
        print("Latest API call failed. You are likely out of units. Try again tomorrow.")
        break
    
# Write to csv
df.to_csv(filepath)

# Termination
print(f"Was able to pull {yt_reads} rows")

#### Data Processing
This process involves text processing and image processing. This will involve text standardization and vectorization. For the image, it needs to be processed and normalized.

In [10]:
raw_data = pd.read_csv(datafile, index_col="yt-id")
raw_data.describe()

Unnamed: 0,thumbnail-w,thumbnail-h,view-count,like-count,comment-count
count,35757.0,35757.0,35299.0,34310.0,35026.0
mean,120.0,90.0,92815.97,2302.683,115.941187
std,0.0,0.0,1687363.0,31164.37,1640.474869
min,120.0,90.0,0.0,0.0,0.0
25%,120.0,90.0,30.0,1.0,0.0
50%,120.0,90.0,282.0,8.0,1.0
75%,120.0,90.0,3154.5,75.0,12.0
max,120.0,90.0,223299600.0,2686147.0,146332.0


In [29]:
# Text Processing
def text_standardization(raw_strs):
	lower = tf.strings.lower(raw_strs)
	emojiless = tf.strings.regex_replace(lower, emoji_re, "")
	stopwrdless = tf.strings.regex_replace(emojiless, sw_re, "")
	punctuationless = tf.strings.regex_replace(stopwrdless, punc_re, "")
	return punctuationless

# Vectorization Layer
vectorize_layer = TextVectorization(
    standardize=text_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [8]:
# Filter Images
files = [f for f in os.listdir(dirpath) if os.path.isfile(f"{dirpath}/{f}") and f.endswith(".jpg")]
image_ids = []
i = 0
for f in files:
	im = None
	try:
		im = Image.open(f"{dirpath}/{f}")

		if im.size != (120, 90):
			im.close()
			Path.unlink(f"{dirpath}/{f}")
			print(f"{f} deleted")
			continue

		im.close()
		im = None

		# Save valid indexes for filtering
		image_ids.append(f[:-4])
	except:
		# Close bad files
		if im is not None:
			im.close()
			im = None

		# Delete Bad Files
		Path.unlink(f"{dirpath}/{f}")
		print(f"{f} deleted")

	i += 1

02v-CVttnS0.jpg deleted
0EZUP5Vtemw.jpg deleted
4KlB4i4dEWU.jpg deleted
6JhUQpe-J6U.jpg deleted
bAHQy0QFUMI.jpg deleted
DcejDtVA4MU.jpg deleted
E0Hchyxwr4c.jpg deleted
ffLdLgSbpEc.jpg deleted
hstJLLvhYSM.jpg deleted
htE2M7shdfI.jpg deleted
JTwsU2dDpEg.jpg deleted
Lpnw6hMIu24.jpg deleted
Q9D-aQzRuU4.jpg deleted
RPoQZ_926hQ.jpg deleted
Sl2ueV8kRRU.jpg deleted
StkNJFSGksg.jpg deleted
tnAYVF1-q74.jpg deleted
VDg_U-n3t-I.jpg deleted
X82cgnMGeD8.jpg deleted
XO6KolPTH8U.jpg deleted
XY6Iw4kTOEI.jpg deleted
yQKNzY4HGGg.jpg deleted
ZBbw3WfcxN8.jpg deleted


In [4]:
# Image Processing
files = [f for f in os.listdir(dirpath) if os.path.isfile(f"{dirpath}/{f}") and f.endswith(".jpg")]
images = np.zeros((len(files), 90, 120, 3))
image_ids = []
valid_index = []
i = 0
for f in files:
	try:
		im = Image.open(f"{dirpath}/{f}")
		images[i] = np.array(im)
		image_ids.append(f[:-4])
		im.close()

		# Save valid indexes for filtering
		valid_index.append(i)
	except:
		pass

	i += 1

# Filter
images = images[valid_index]

# Normalize Pixels
images /= 255.0

In [14]:
ids_with_pic = [f[:-4] for f in os.listdir(dirpath) if os.path.isfile(f"{dirpath}/{f}") and f.endswith(".jpg")]
raw_data = raw_data.loc[ids_with_pic]

# Label Processing
scores = raw_data["view-count"] # Grab View Count
scores = scores.fillna(0.0) # Replace NaN with 0
scores = scores.map(lambda x : np.log10(x + 1)) # Log everything to make it less extreme
scores = scores.div(scores.max()) # Normalized

scores.describe()

count    35734.000000
mean         0.306272
std          0.172596
min          0.000000
25%          0.173335
50%          0.290248
75%          0.416793
max          1.000000
Name: view-count, dtype: float64

In [None]:
tbdg = ThumbnailDataGenerator(dirpath, ids_with_pic, scores)
x, y = tbdg[0]

print(len(tbdg))
print(x.shape)
print(x[1,34,113,2])

1116
(32, 90, 120, 3)
0.6196078431372549


##### Model

Commonly, the Sequential API is used to train a model. However, due to the need for more than one input, the Functional API must be employed.

In [8]:
# Image Portion
img_input = Input((90, 120, 3))
x = Conv2D(16, 3, activation='relu', kernel_initializer='he_uniform')(img_input)
x = MaxPooling2D()(x)
x = Conv2D(32, 3, activation='relu', kernel_initializer='he_uniform')(x)
x = MaxPooling2D()(x)
x = Conv2D(64, 3, activation='relu', kernel_initializer='he_uniform')(x)
x = MaxPooling2D()(x)
x = Flatten()(x)
x = Dense(100, activation='relu', kernel_initializer='he_uniform')(x)
img_output = Dense(1, activation='softmax')(x)

img_model = Model(inputs=img_input, outputs=img_output, name="img_model")

img_model.summary()

Model: "img_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 90, 120, 3)]      0         
                                                                 
 conv2d (Conv2D)             (None, 88, 118, 16)       448       
                                                                 
 max_pooling2d (MaxPooling2  (None, 44, 59, 16)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 42, 57, 32)        4640      
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 21, 28, 32)        0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 19, 26, 64)        18

In [9]:
# Text Portion
text_input = Input((1,), dtype=tf.string)
y = Embedding(max_features, embedding_dim)(text_input)
y = Dropout(0.5)(y)
y = Flatten()(y)
y = Dense(100, activation='relu', kernel_initializer='he_uniform')(y)
text_output = Dense(1, activation='softmax')(y)

text_model = Model(inputs=text_input, outputs=text_output, name="text_model")

text_model.summary()

Model: "text_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 embedding (Embedding)       (None, 1, 128)            2560000   
                                                                 
 dropout (Dropout)           (None, 1, 128)            0         
                                                                 
 flatten_1 (Flatten)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 100)               12900     
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 2573001 (9.82 MB)
Trainable params: 257300

In [10]:
# United Model
z = Concatenate()([x, y])
z = Dense(10, activation='relu', kernel_initializer='he_uniform')(z)
z = Dense(1, activation='softmax')(z)

united_model = Model(inputs=[img_input, text_input], outputs=z, name="unitied_model")

united_model.summary()

Model: "unitied_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 90, 120, 3)]         0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 88, 118, 16)          448       ['input_1[0][0]']             
                                                                                                  
 max_pooling2d (MaxPooling2  (None, 44, 59, 16)           0         ['conv2d[0][0]']              
 D)                                                                                               
                                                                                                  
 conv2d_1 (Conv2D)           (None, 42, 57, 32)           4640      ['max_pooling2d[0]

#### Training
Using k-fold cross validation, we can judge the accuarcy of this model

In [None]:
# Image Model
kf = KFold(n_folds)

validation_accuracy = []
validation_loss = []

fold_var = 1
for train, val in kf.split(images, scores):
	# Make image model for testing
	img_model = Model(inputs=img_input, outputs=img_output, name="img_model")
	img_model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

	# Callback Saving
	checkpoint = ModelCheckpoint(f"{modeldir}/model_{fold_var}.h5", monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

	# Fit
	history = img_model.fit(images[train], scores[train], callbacks=[checkpoint], epochs=epochs)

	# Grab Results
	img_model.load_weights(f"{modeldir}/model_{fold_var}.h5")
	
	results = img_model.evaluate(images[val], scores[val])
	results = dict(zip(img_model.metrics_names, results))
	
	validation_accuracy.append(results['accuracy'])
	validation_loss.append(results['loss'])
	
	# Clear
	clear_session()

	# Increment
	fold_var += 1

MemoryError: Unable to allocate 6.90 GiB for an array with shape (28587, 90, 120, 3) and data type float64