# Classifying YouTube videos using tags

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
from tensorflow import keras
import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

from tag_handler import *

In [None]:
from cvs_reader import *
from vocab import *
from dataframe_creator import create_seperate_columns, create_listed_columns

# Load the `tags` and `category_id` columns from each regional CSV file.
ca_csvpath = 'data/CAvideos.csv'
us_csvpath = 'data/USvideos.csv'
# Fixed: this path used to repeat the US file, so the US rows were loaded
# twice and the GB rows were never read.
gb_csvpath = 'data/GBvideos.csv'

youtube_dfs = [
    load_dataframe(path, ['tags', 'category_id'])
    for path in (ca_csvpath, us_csvpath, gb_csvpath)
]

# Concatenate the three dataframes into one with a fresh 0..n-1 index, then
# drop rows that have a missing value in any column.
tab_dataframe = pd.concat(youtube_dfs, ignore_index=True)
tab_dataframe = tab_dataframe.dropna()
tab_dataframe.head(100)

In [None]:
# Remove the rows whose tags field is the literal string '[none]'.
# A boolean mask drops exactly those rows.  The previous
# concat([...]).drop_duplicates(keep=False) construction additionally threw
# away every row whose (tags, category_id) pair occurred more than once,
# which is not what "remove 'none' values" calls for.  If de-duplication is
# wanted, do it explicitly with a separate drop_duplicates() call.
is_untagged = tab_dataframe['tags'] == '[none]'
junk = tab_dataframe[is_untagged]
tab_dataframe = tab_dataframe[~is_untagged]
tab_dataframe

In [None]:
# Categories that occur too rarely to be kept.
rare_category_ids = {29, 30, 43}

# Split the dataframe into one dataframe per category_id, skipping the rare
# categories.  They are selected by id rather than by position in the list,
# so the right groups are dropped regardless of which categories are present.
dfs = [
    group
    for category_id, group in tab_dataframe.groupby('category_id')
    if int(category_id) not in rare_category_ids
]

In [None]:
from sklearn.utils import resample

# Balance the data: draw 4000 rows from every per-category dataframe
# (scikit-learn's resample samples with replacement unless told otherwise).
resampled_dfs = [resample(category_df, n_samples=4000) for category_df in dfs]

# Stack the balanced groups back into a single dataframe with a fresh index.
tab_dataframe = pd.concat(resampled_dfs, axis=0, ignore_index=True)
tab_dataframe

In [None]:
# Number handed to create_seperate_columns (presumably the maximum number of
# tag columns per row -- confirm in dataframe_creator).
max_of_tags = 25

# Build the vocabulary from the raw tags column before that column is
# replaced by separate per-tag columns.
tags_column = tab_dataframe['tags']
vocab = get_tags_vocab(tags_column)

tab_dataframe = create_seperate_columns(tab_dataframe, max_of_tags, 'tags')
tab_dataframe

In [None]:
# Turn the vocabulary into a dictionary used below as the tag lookup table.
voc_di = get_tags_vocab_as_dict(vocab)

# Replace the tags in every column except the label column with their
# entries from the lookup table.
tag_columns = [name for name in tab_dataframe.columns if name != 'category_id']
for name in tag_columns:
    tab_dataframe[name] = tab_dataframe[name].map(voc_di)

# Values with no dictionary entry come out of map() as NaN; store them as 0.
tab_dataframe.fillna(0, inplace = True)
tab_dataframe.head()

In [None]:
future_df = []
col_row_dict = {}

# Only a 10000-row subset of the dataset is used.
# The rows are drawn at random instead of sliced from the top: the balanced
# dataframe is a concatenation of per-category blocks of 4000 rows, so
# (assuming row order was kept since then) `tab_dataframe[:10000]` would
# contain only the first few categories.
# reset_index relabels the sample 0..n-1 so it can still be addressed by
# integer label; random_state makes the draw reproducible.
subset_size = min(10000, len(tab_dataframe))
first_half_df = tab_dataframe.sample(n=subset_size, random_state=0).reset_index(drop=True)

In [None]:
# Build one dictionary per row: the label under 'category_id' plus a
# 'tag_<value>' key set to 1.0 for every value found in the other columns.
# The list is rebuilt from scratch so re-running this cell does not append
# the same rows twice, and the row count comes from the dataframe itself
# instead of a hard-coded 10000.
future_df = []
for record in first_half_df.to_dict('records'):
    col_row_dict = {}
    for col in first_half_df.columns:
        if col == 'category_id':
            col_row_dict['category_id'] = record[col]
        else:
            col_row_dict['tag_{}'.format(record[col])] = 1.0
    future_df.append(col_row_dict)
col_row_dict = {}

In [None]:
from input_generator import load_dataset_with_lables

# One row per record in future_df and one column per distinct key; a key
# that is absent from a record becomes 0.0 in that row.
final_dataframe = pd.DataFrame(future_df).fillna(0.0)
final_dataframe.head()

In [None]:
from input_generator import load_dataset_with_lables


# Build the train, validation and test examples together with their labels,
# using `category_id` as the label column.
train_part, val_part, test_part = load_dataset_with_lables(final_dataframe, 'category_id')
train_ds, train_lb = train_part
val_ds, val_lb = val_part
test_ds, test_lb = test_part

In [None]:
# Display the training features through `.values` -- the same array that is
# later passed to model.fit (presumably a NumPy array; confirm the type
# returned by load_dataset_with_lables).
train_ds.values

In [None]:
# Display the training labels through `.values` -- the same array that is
# later passed to model.fit as the targets.
train_lb.values

In [None]:
# Width of the first two hidden layers; the third is twice as wide.
hidden = 128

# Fully connected classifier: three ReLU layers, each followed by 50%
# dropout, the last two with an L2 weight penalty, and a 44-way softmax.
model = tf.keras.Sequential()
model.add(layers.Flatten())
model.add(layers.Dense(hidden, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(hidden, activation='relu',
                       kernel_regularizer=keras.regularizers.l2(0.001)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(2*hidden, activation='relu',
                       kernel_regularizer=keras.regularizers.l2(0.001)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(44, activation='softmax'))

# Sparse categorical cross-entropy: the labels are passed as class ids,
# not as one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 6 epochs in batches of 32, validating after every epoch.
validation_pair = (val_ds.values, val_lb.values)
history = model.fit(
    train_ds.values,
    train_lb.values,
    batch_size=32,
    epochs=6,
    validation_data=validation_pair,
)



In [None]:
import matplotlib.pyplot as plt


def _plot_train_val(train_values, val_values, quantity):
    """Plot a per-epoch training curve together with its validation curve.

    Args:
        train_values: sequence of per-epoch values measured on the training data.
        val_values: sequence of per-epoch values measured on the validation data.
        quantity: name used in the title and on the y axis (e.g. 'accuracy').
    """
    plt.plot(train_values)
    plt.plot(val_values)
    plt.title('model {}'.format(quantity))
    plt.ylabel(quantity)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()


history_dict = history.history
# Keras stores the accuracy metric under 'accuracy' in TF 2.x and under
# 'acc' in older releases; accept either so this cell does not raise
# KeyError depending on the installed version.
acc_key = 'accuracy' if 'accuracy' in history_dict else 'acc'
acc = history_dict[acc_key]
val_acc = history_dict['val_' + acc_key]
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# summarize history for accuracy
_plot_train_val(acc, val_acc, 'accuracy')

# summarize history for loss
_plot_train_val(loss, val_loss, 'loss')

In [None]:
# Evaluate the trained model on the test split.  The `.values` arrays are
# passed, consistent with how the training and validation data are handed
# to model.fit.
results = model.evaluate(test_ds.values, test_lb.values)

print(results)

In [None]:
# Predict the class probabilities for every test example.  The `.values`
# array is passed, consistent with how the training data is handed to
# model.fit.
predictions = model.predict(test_ds.values)
# NOTE(review): this is an absolute '/content/...' path while the CSV files
# are read from the relative 'data/' directory -- confirm the file location.
cat_tit_dict = get_category_title_dict('/content/CA_category_id.json')


In [None]:
def one_hot_to_index(one_hot):
    """Return the positions of the entries equal to 1.0 in `one_hot`.

    Args:
        one_hot: iterable of numbers, e.g. one row of a one-hot encoded matrix.

    Returns:
        List of integer positions, in ascending order, whose entry equals
        1.0.  Empty when no entry is 1.0.
    """
    return [position for position, flag in enumerate(one_hot) if flag == 1.0]

def index_to_tag(i_tags, vocab_dict=None):
    """Translate vocabulary indexes back into their tags.

    Args:
        i_tags: iterable of indexes to look up.
        vocab_dict: mapping from tag to index.  Defaults to the notebook-level
            `voc_di` dictionary when omitted.

    Returns:
        List of tags, one per index in `i_tags`, in the same order.

    Raises:
        KeyError: if an index has no tag in the dictionary.
    """
    if vocab_dict is None:
        vocab_dict = voc_di
    # Invert the tag -> index mapping once, then look every index up in it.
    reversed_vocab_dict = {i: vo for vo, i in vocab_dict.items()}
    return [reversed_vocab_dict[index] for index in i_tags]


In [None]:
# Print the tags, the predicted category and the true category for test
# examples 30..39.
# NOTE(review): one_hot_to_index yields column positions of the test matrix,
# while index_to_tag treats its input as vocabulary indexes -- confirm that
# the two numberings coincide.
test_rows = test_ds.values
test_labels = test_lb.values
for i in range(30, 40):
    tag_positions = one_hot_to_index(test_rows[i])
    tags_i = index_to_tag(tag_positions)
    print(tags_i)
    # The model has 44 output units, so the arg-max can be an id with no
    # entry in the title dictionary; fall back to printing the raw id
    # instead of raising KeyError.  int() keeps the key free of a trailing
    # '.0' if the labels are stored as floats.
    predicted_id = str(int(np.argmax(predictions[i])))
    true_id = str(int(test_labels[i]))
    print('predict:{}'.format(cat_tit_dict.get(predicted_id, predicted_id)))
    print('true label:{}'.format(cat_tit_dict.get(true_id, true_id)))

# Classifying using tags, number of likes and number of comments


In [None]:
# Load the tags together with the like count, comment count and label.
# NOTE(review): `csvpath` is not assigned anywhere in the visible part of
# this notebook (only ca_csvpath / us_csvpath / gb_csvpath are) -- confirm
# it is defined before this cell runs.
full_dataframe = load_dataframe(csvpath, ['tags', 'likes', 'comment_count', 'category_id'])
# Same value as used for the tags-only model.
max_of_tags = 25

# Replace the single 'tags' column with separate columns via
# create_seperate_columns.
full_dataframe = create_seperate_columns(full_dataframe, max_of_tags, 'tags')

full_dataframe.head()