# YouTube Machine Learning Model
#### Created by Randhir and Andrew

Model that will take a $90\times120$ thumbnail JPEG and title from YouTube to output a video performance metric.
The metric will be 
$$Score=\log{(View\ Count + 1)}$$
The idea is that the video that attracted more views is a good video. The value is log-scaled as the higher the view count, the less meaningful it becomes. This value will be normalized with the maximum value in the dataset.

##### Imports

In [None]:
import string, re, requests, os, json, random
import urllib.request
import urllib.error
from pathlib import Path

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

import numpy as np
import pandas as pd

from PIL import Image
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tf_keras.backend import clear_session
from tf_keras.callbacks import ModelCheckpoint
from tf_keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, Concatenate, TextVectorization, Input
from tf_keras.preprocessing.text import Tokenizer
from tf_keras.preprocessing.sequence import pad_sequences
from tf_keras.utils import Sequence
from tf_keras import Model
from sklearn.model_selection import KFold
from transformers import TFAutoModel, AutoTokenizer

from dotenv import load_dotenv

# Load .env file with your api key
if not load_dotenv():
	print(".env file not found")

#### Constants
This cell contains the constants used by this model

In [58]:
# File Structure
dirpath = "thumbnail"
modeldir = "models"
datafile = "data-filtered.csv"

# Data Aquisition
filepath = "data.csv"
count = 50
max_iterations = 100 # 50 * 100 = 5000 videos
topic_id = "/m/03hf_rm" # Strategy Games
lang = "en"
API_KEY = os.getenv("APIKEY")

# Data Filtering
MULT_CSV = False
filepath = "data.csv"
filepath_2 = "data_2.csv"
filepath_final = "data-filtered.csv"
lang = "en"

# Labeling
MAX_VIEWS = 15.3e9 # Baby Shark Video (Most Viewed Video)
THRESHOLD = 0.4

# Vectorization
vectorizator_model = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(vectorizator_model)
transformer_model = TFAutoModel.from_pretrained(vectorizator_model)

# Regex Patterns
emoji_re = "[\U000000A9-\U0010ffff]"
punc_re = f"[{re.escape(string.punctuation)}]"
space_re = "\s{1,}"

# Download Stopwords & pattern
nltk.download('stopwords')
stopwords_list = stopwords.words("english")
sw_re = f'\b(?:{"|".join([f"{re.escape(sw)}" for sw in stopwords_list])})\b'

# Text Model Settings
text_input_dim = 20000
embedding_dim = 128
sequence_length = 500

# KFold Settings
n_folds = 5
epochs = 10
batch_size = 30

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

#### Data Aquisition
The YouTube API is used to get video data. This includes a video's thumbnail and metrics.

In [None]:
# Check if data file already exist
if os.path.isfile(filepath):
    df = pd.read_csv(filepath, index_col="yt-id")
else:
    df = pd.DataFrame([], columns=["yt-id", "title", "created", "channel-id", "thumbnail", "thumbnail-w", "thumbnail-h", "view-count", "like-count", "comment-count", "query"])
    df = df.set_index("yt-id")
    
# Grab missing data IDs for query
yt_ids = list(df[df["view-count"].isna()].index)

# Loop
yt_reads = 0
for i in range(max_iterations):
    try:
        # Check if any stats calls are needed
        if len(yt_ids) > 0:
            # Message 
            print("Pulling statistics for missing data values")

            # Split up batch by 50 if needed
            for index_split in range(50, len(yt_ids) + 1, 50):
                # Generate & call statistic query (1 unit)
                urlData_stats = f"https://www.googleapis.com/youtube/v3/videos?key={API_KEY}&part=statistics&id={','.join(yt_ids[index_split - 50:index_split])}"
                webURL_stats = urllib.request.urlopen(urlData_stats)
                raw_stats_data = webURL_stats.read()
                results_stats = json.loads(raw_stats_data.decode(webURL_stats.info().get_content_charset('utf-8')))

                # Process Stats Response
                for stats_data in results_stats["items"]:
                    try:
                        # Parse data
                        new_row = pd.DataFrame([{
                            "yt-id": stats_data['id'],
                            "view-count": stats_data['statistics']['viewCount'],
                            "like-count": stats_data['statistics']['likeCount'] if 'likeCount' in stats_data['statistics'] else "",
                            "comment-count": stats_data['statistics']['commentCount'] if 'commentCount' in stats_data['statistics'] else "",
                        },])
                        new_row = new_row.set_index("yt-id")

                        # Update main dataset
                        df.update(new_row)
                    except KeyError:
                        # Weird Entry
                        continue

            # Reset after used
            yt_ids = [] 

            # Message 
            print("Finished pulling statistics for current batch")

        # Message
        print(f"Pulling {count} random videos")

        # Generates random query for YT
        r_q = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(3))

        # Calls the API for search results (100 units)
        urlData_query = f"https://www.googleapis.com/youtube/v3/search?key={API_KEY}&maxResults={count}&part=snippet&type=video&relevanceLanguage={lang}&topicId={topic_id}&q={r_q}"
        webURL_query = urllib.request.urlopen(urlData_query)
        raw_vid_data = webURL_query.read()
        results_vids = json.loads(raw_vid_data.decode(webURL_query.info().get_content_charset('utf-8')))

        # Process Video Response
        for video_data in results_vids['items']:
            # Ignore Live and Upcoming Content (no ratings yet)
            if video_data['snippet']['liveBroadcastContent'] != "none":
                continue

            # Parse data
            try:
                new_row = pd.DataFrame([{
                    "yt-id": video_data['id']['videoId'],
                    "title": video_data['snippet']['title'],
                    "created": video_data['snippet']['publishedAt'],
                    "channel-id": video_data['snippet']['channelId'],
                    "thumbnail": video_data['snippet']['thumbnails']["default"]["url"],
                    "thumbnail-w": video_data['snippet']['thumbnails']["default"]["width"],
                    "thumbnail-h": video_data['snippet']['thumbnails']["default"]["height"],
                    "query": r_q,
                },])
                new_row = new_row.set_index("yt-id")

                try:
                    # Append
                    df = pd.concat([df, new_row], verify_integrity=True)

                    # Store your ids
                    yt_reads += 1

                    # Prepare id for stats query
                    yt_ids.append(video_data['id']['videoId'])
                except ValueError:
                    # Duplicate video detected
                    continue
            except KeyError:
                # Weird Entry
                continue

        # Update User
        print(f"API call #{i + 1} successfully")

        # Dumb Data to prevent loss every 5 runs
        if i % 5 == 0:
            df.to_csv(filepath)

    # ON API failure, quit and save
    except urllib.error.HTTPError:
        print("Latest API call failed. You are likely out of units. Try again tomorrow.")
        break
    
# Write to csv
df.to_csv(filepath)

# Termination
print(f"Was able to pull {yt_reads} rows")
del df

After aquiring the data, the thumbnail images need to be pulled as well.

#### Data Filtering
Some of the pulled data need to be filtered before usage. This includes potential duplicates and non english entries. 

In [None]:
# Read Data
df = pd.read_csv(filepath, index_col="yt-id")

# Merge multiple if needed
if MULT_CSV:
	df_2 = pd.read_csv(filepath_2, index_col="yt-id")
	df = pd.concat([df, df_2])

print(f"{df.size} rows in data file")

# Remove duplicates
df = df[~df.index.duplicated(keep='first')]
print(f"{df.size} rows remaining after duplication filter")

# Remove non language
def lang_filter(row) -> bool:
	try:
		print(row["title"])
		return detect(row["title"]) == lang
	except LangDetectException:
		return False
	
df = df[df.apply(lang_filter, axis=1)]
print(f"{df.size} rows remaining after translation filter")

# Save Filtered Data
df.to_csv(filepath_final)
del df

#### Thumbnail Requesting
Once the dataset has been filtered, the thumbnails can now be pulled. Images that do not fit the $90\times120$ size will be rejected.

In [None]:
# Grab data
df = pd.read_csv(filepath, index_col="yt-id")

# Make directory for image if not already
if not os.path.isdir(dirpath):
	os.mkdir(dirpath)

# Iterate thru dataframe and download
def grab_thumbnail(x : pd.Series):
	# Check if file exist
	filename = f'{dirpath}/{x.name}.jpg'
	if os.path.isfile(filename):
		print(f"Thumbnail already retrieved for {x.name}")
		return

	# Call file
	with open(filename, 'wb') as handle:
		print(f"Retrieving thumbnail for {x.name}")
		response = requests.get(x["thumbnail"], stream=True)

		# Fail request
		if not response.ok:
			print(f"Could not retrieve thumbnail for {x.name}")

		# Success save
		for block in response.iter_content(1024):
			if not block:
				break

			handle.write(block)

# Apply to all
df.apply(grab_thumbnail, axis=1)
del df

In [8]:
# Filter Images
files = [f for f in os.listdir(dirpath) if os.path.isfile(f"{dirpath}/{f}") and f.endswith(".jpg")]
image_ids = []
i = 0
for f in files:
	im = None
	try:
		im = Image.open(f"{dirpath}/{f}")

		if im.size != (120, 90):
			im.close()
			Path.unlink(f"{dirpath}/{f}")
			print(f"{f} deleted")
			continue

		im.close()
		im = None

		# Save valid indexes for filtering
		image_ids.append(f[:-4])
	except:
		# Close bad files
		if im is not None:
			im.close()
			im = None

		# Delete Bad Files
		Path.unlink(f"{dirpath}/{f}")
		print(f"{f} deleted")

	i += 1

02v-CVttnS0.jpg deleted
0EZUP5Vtemw.jpg deleted
4KlB4i4dEWU.jpg deleted
6JhUQpe-J6U.jpg deleted
bAHQy0QFUMI.jpg deleted
DcejDtVA4MU.jpg deleted
E0Hchyxwr4c.jpg deleted
ffLdLgSbpEc.jpg deleted
hstJLLvhYSM.jpg deleted
htE2M7shdfI.jpg deleted
JTwsU2dDpEg.jpg deleted
Lpnw6hMIu24.jpg deleted
Q9D-aQzRuU4.jpg deleted
RPoQZ_926hQ.jpg deleted
Sl2ueV8kRRU.jpg deleted
StkNJFSGksg.jpg deleted
tnAYVF1-q74.jpg deleted
VDg_U-n3t-I.jpg deleted
X82cgnMGeD8.jpg deleted
XO6KolPTH8U.jpg deleted
XY6Iw4kTOEI.jpg deleted
yQKNzY4HGGg.jpg deleted
ZBbw3WfcxN8.jpg deleted


#### Data Processing
This process involves text processing and image processing. This will involve text standardization and vectorization. For the image, it needs to be processed and normalized.

In [22]:
raw_data = pd.read_csv(datafile, index_col="yt-id")

# Filter raw data for thumbnail only entries
thumbnail_ids = np.array([f[:-4] for f in os.listdir(dirpath) if os.path.isfile(f"{dirpath}/{f}") and f.endswith(".jpg")], dtype=str)
raw_data = raw_data.loc[thumbnail_ids]

raw_data.describe()

Unnamed: 0,thumbnail-w,thumbnail-h,view-count,like-count,comment-count
count,35734.0,35734.0,35276.0,34289.0,35006.0
mean,120.0,90.0,92872.59,2304.083,115.992858
std,0.0,0.0,1687911.0,31173.86,1640.940831
min,120.0,90.0,0.0,0.0,0.0
25%,120.0,90.0,30.0,1.0,0.0
50%,120.0,90.0,281.5,8.0,1.0
75%,120.0,90.0,3153.25,75.0,12.0
max,120.0,90.0,223299600.0,2686147.0,146332.0


In [65]:
# Text Processing
def text_standardization(raw_strs):
	t = tf.strings.lower(raw_strs)
	t = tf.strings.regex_replace(t, emoji_re, "")
	t = tf.strings.regex_replace(t, sw_re, "")
	t = tf.strings.regex_replace(t, punc_re, "")
	t = tf.strings.regex_replace(t, space_re, " ")
	return t

# Tokenizer
tokenizer_layer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer_layer.fit_on_texts(raw_data["title"])
sequences = tokenizer_layer.texts_to_sequences(raw_data["title"])
padded_sequences = pad_sequences(sequences, maxlen=10, padding='post')

# Input 
input_texts = tokenizer(list(raw_data["title"]), padding=True, truncation=True, max_length=10, return_tensors="tf")
print(input_texts.keys())
# input_texts.pop("token_type_ids")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [73]:
input_texts["input_ids"]

<tf.Tensor: shape=(35734, 10), dtype=int32, numpy=
array([[  101, 22159,  5886, ...,  3995,  5104,   102],
       [  101, 25993,  1998, ...,  1010,  2208,   102],
       [  101, 22563,  2099, ...,  2401,  1011,   102],
       ...,
       [  101,  2162,  8505, ...,  1027,  3704,   102],
       [  101,   100,   100, ...,     0,     0,     0],
       [  101, 14154, 11563, ...,  4684,  2112,   102]], dtype=int32)>

In [23]:
# Label Processing
scores = raw_data["view-count"] # Grab View Count
scores = scores.fillna(0.0) # Replace NaN with 0
scores = scores.map(lambda x : np.log10(x + 1)) # Log everything to make it less extreme
scores = scores.div(np.log10(MAX_VIEWS + 1)) # Normalized (+1 to prevent one)

scores.describe()

count    35734.000000
mean         0.251066
std          0.141486
min          0.000000
25%          0.142091
50%          0.237930
75%          0.341665
max          0.819749
Name: view-count, dtype: float64

In [24]:
# Boolean Label
b_scores = scores.map(lambda x : int(x >= THRESHOLD))

b_scores.describe()

count    35734.000000
mean         0.158281
std          0.365009
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: view-count, dtype: float64

##### Model

Commonly, the Sequential API is used to train a model. However, due to the need for more than one input, the Functional API must be employed.

In [29]:
# Image Portion
img_input = Input((90, 120, 3))
x = Conv2D(32, 5, activation='relu', padding='same')(img_input)
x = MaxPooling2D()(x)
x = Conv2D(64, 3, activation='relu', padding='same')(x)
x = MaxPooling2D()(x)
x = Conv2D(128, 3, activation='relu', padding='same')(x)
x = MaxPooling2D()(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x_out = Dropout(0.5)(x)
x_out = Dense(64, activation='relu')(x_out)
img_output = Dense(1, activation='sigmoid')(x_out)

img_model = Model(inputs=img_input, outputs=img_output, name="img_model")

img_model.summary()

Model: "img_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 90, 120, 3)]      0         
                                                                 
 conv2d_3 (Conv2D)           (None, 90, 120, 32)       2432      
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 45, 60, 32)        0         
 g2D)                                                            
                                                                 
 conv2d_4 (Conv2D)           (None, 45, 60, 64)        18496     
                                                                 
 max_pooling2d_4 (MaxPoolin  (None, 22, 30, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_5 (Conv2D)           (None, 22, 30, 128)       73

In [78]:
# Text Portion
text_input = Input(shape=(10,), dtype=tf.int32, name="text_inputs")
attention_mask = Input(shape=(10,), dtype=tf.int32, name="attention_masks")

transformer_output = transformer_model(text_input, attention_mask=attention_mask)
y = transformer_output.last_hidden_state[:, 0, :]

y = Dense(128, activation='relu')(y)
y_out = Dropout(0.5)(y)
text_output = Dense(1, activation='sigmoid')(y_out)

text_model = Model(inputs=[text_input, attention_mask], outputs=text_output, name="text_model")

text_model.summary()

Model: "text_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text_inputs (InputLayer)    [(None, 10)]                 0         []                            
                                                                                                  
 attention_masks (InputLaye  [(None, 10)]                 0         []                            
 r)                                                                                               
                                                                                                  
 tf_bert_model_4 (TFBertMod  TFBaseModelOutputWithPooli   1094822   ['text_inputs[0][0]',         
 el)                         ngAndCrossAttentions(last_   40         'attention_masks[0][0]']     
                             hidden_state=(None, 10, 76                                  

In [37]:
# United Model
z = Concatenate()([x, y])
z = Dropout(0.5)(z)
z = Dense(64, activation='relu')(z)
z = Dense(1, activation='sigmoid')(z)

united_model = Model(inputs=[img_input, text_input, attention_mask], outputs=z, name="unitied_model")

united_model.summary()

Model: "unitied_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 90, 120, 3)]         0         []                            
                                                                                                  
 conv2d_3 (Conv2D)           (None, 90, 120, 32)          2432      ['input_2[0][0]']             
                                                                                                  
 max_pooling2d_3 (MaxPoolin  (None, 45, 60, 32)           0         ['conv2d_3[0][0]']            
 g2D)                                                                                             
                                                                                                  
 conv2d_4 (Conv2D)           (None, 45, 60, 64)           18496     ['max_pooling2d_3[

#### Training
Using k-fold cross validation, we can judge the accuarcy of this model. To start, we need to make a generator class to batch the data.

In [34]:
class ThumbnailDataGenerator(Sequence):

	def __init__(self, filedir : str, list_IDs : list[str], labels : dict[str, float], rescale : float=255.0, filetype : str="jpg", batch_size : int=32, dim : tuple[int, int]=(90, 120), shuffle=True, **kwargs):
		'''
		Data Generator Initialization Function 
		'''
		self.filedir = filedir
		self.filetype = filetype
		self.dim = dim
		self.rescale = rescale
		self.batch_size = batch_size
		self.labels = labels
		self.list_IDs = list_IDs
		self.shuffle = shuffle
		self.on_epoch_end()

	def on_epoch_end(self):
		'''
		Updates indexes after each epoch
		'''
		self.indexes = np.arange(len(self.list_IDs))

		# Randomize if Shuffle
		if self.shuffle:
			np.random.shuffle(self.indexes)

	def __data_generation(self, list_IDs_temp):
		'''
		Generates data containing batch_size samples
		'''
		# Initialization
		X = np.empty((self.batch_size, *self.dim, 3))
		y = np.empty((self.batch_size), dtype=float)

		# Generate data
		for i, ID in enumerate(list_IDs_temp):
			# Store sample
			X[i,] = Image.open(f'{self.filedir}/{ID}.{self.filetype}')

			# Store class
			y[i] = self.labels[ID]

		# Rescale
		X /= self.rescale

		return X, y
	
	def __len__(self):
		'''
		Denotes the number of batches per epoch
		'''
		return int(np.floor(len(self.list_IDs) / self.batch_size))
	
	def __getitem__(self, index):
		'''
		Generate one batch of data
		'''
		# Generate indexes of the batch
		indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

		# Find list of IDs
		list_IDs_temp = [self.list_IDs[k] for k in indexes]

		# Generate data
		return self.__data_generation(list_IDs_temp)

The image-only model will undergo a k-fold cross validation.

In [77]:
# Image Model
kf = KFold(n_folds)

validation_accuracy = []
validation_loss = []

fold_var = 1
for train, val in kf.split(thumbnail_ids, b_scores):
	# Fold Indicator
	print(f"Starting k-Fold #{fold_var}")

	# Make image model for testing
	img_model = Model(inputs=img_input, outputs=img_output, name="img_model")
	img_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'Precision', 'Recall'])

	# Callback Saving
	checkpoint = ModelCheckpoint(f"{modeldir}/model_{fold_var}.h5", monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

	# Generator
	tbdg_train = ThumbnailDataGenerator(dirpath, thumbnail_ids[train], b_scores.iloc[train], batch_size=batch_size)
	tbdg_validate = ThumbnailDataGenerator(dirpath, thumbnail_ids[val], b_scores.iloc[val], batch_size=batch_size)

	# Fit
	history = img_model.fit(x=tbdg_train, validation_data=tbdg_validate, callbacks=[checkpoint], epochs=epochs)

	# Grab Results
	img_model.load_weights(f"{modeldir}/model_{fold_var}.h5")
	
	results = img_model.evaluate(x=tbdg_validate)
	results = dict(zip(img_model.metrics_names, results))
	
	validation_accuracy.append(results['accuracy'])
	validation_loss.append(results['loss'])
	
	# Clear
	clear_session()

	# Increment
	fold_var += 1

Starting k-Fold #1
Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.85532, saving model to models\model_1.keras
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.85532
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.85532
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.85532
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.85532
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.85532
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.85532
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.85532
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.85532
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.85532
Starting k-Fold #2
Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.85980, saving model to models\model_2.keras
Epoch 2/10
Epoch 2: val_accuracy improved from 0.85980 to 0.85994, saving model to models\model_2.keras
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.85994
Epoch 4/10
Epoch 4: val_accuracy d

The text only model will undergo the same test.

In [1]:
# Image Model
kf = KFold(n_folds)

validation_accuracy = []
validation_loss = []

fold_var = 1
for train, val in kf.split(input_texts["input_ids"], b_scores):
	# Fold Indicator
	print(f"Starting k-Fold #{fold_var}")

	# Make image model for testing
	text_model = Model(inputs=[text_input, attention_mask], outputs=text_output, name="text_model")
	text_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'Precision', 'Recall'])

	# Callback Saving
	checkpoint = ModelCheckpoint(f"{modeldir}/model_{fold_var}.keras", monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

	# Train / Validation
	trainset = {
		"text_inputs": tf.gather(input_texts["input_ids"], indices=train), 
		"attention_masks": tf.gather(input_texts["attention_mask"], indices=train)
	}
	testset = {
		"text_inputs": tf.gather(input_texts["input_ids"], indices=val), 
		"attention_masks": tf.gather(input_texts["attention_mask"], indices=val)
	}

	# Fit
	history = text_model.fit(x=trainset, y=b_scores.iloc[train], validation_data=(testset, b_scores.iloc[val]), callbacks=[checkpoint], epochs=epochs)

	# Grab Results
	text_model.load_weights(f"{modeldir}/model_{fold_var}.h5")
	
	results = text_model.evaluate(x=testset, y=b_scores.iloc[val])
	results = dict(zip(text_model.metrics_names, results))
	
	validation_accuracy.append(results['accuracy'])
	validation_loss.append(results['loss'])
	
	# Clear
	clear_session()

	# Increment
	fold_var += 1

NameError: name 'KFold' is not defined

#### Testing
Once the k-fold cross validation is complete, the final model can be trained with all the data and can be tested with complete new data from the API.