# Classyfing YouTube videos using tags, number of likes, comments and views


In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals


import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
from cvs_reader import load_dataframe
from dataframe_creator import create_seperate_columns

# Get dataframes with tags and category_id as columns
ca_csvpath = 'data/CAvideos.csv'
us_csvpath = 'data/USvideos.csv'
gb_csvpath = 'data/GBvideos.csv'

youtube_dfs = [load_dataframe(ca_csvpath, ['tags', 'category_id', 'likes', 'views', 'comment_count'])]
youtube_dfs += [load_dataframe(us_csvpath, ['tags', 'category_id', 'likes', 'views', 'comment_count'])]
youtube_dfs += [load_dataframe(gb_csvpath, ['tags', 'category_id', 'likes', 'views', 'comment_count'])]

# Concat three dataframes into one
youtube_df = pd.concat(youtube_dfs, ignore_index=True)
youtube_df = youtube_df.dropna()
youtube_df.head

In [3]:
# Remove 'none' and duplicated values from dataframe
youtube_df = youtube_df[youtube_df['tags']!='[none]']
youtube_df = youtube_df.drop_duplicates(subset='tags')
youtube_df.head

<bound method NDFrame.head of                                                      tags  category_id  \
0       Eminem|Walk|On|Water|Aftermath/Shady/Interscop...           10   
1       plush|bad unboxing|unboxing|fan mail|idubbbztv...           23   
2       racist superman|rudy|mancuso|king|bach|racist|...           23   
3       ryan|higa|higatv|nigahiga|i dare you|idy|rhpc|...           24   
4       edsheeran|ed sheeran|acoustic|live|cover|offic...           10   
...                                                   ...          ...   
120589  5SOS|5 Seconds of Summer|No Roots|Cover|BBC|Ra...           10   
120593  Fox News Channel|FNC|Fox News|News|Latest News...           25   
120606  capitalfmofficial|capital|capital fm|capital r...           10   
120623  The Chainsmokers Somebody|Chainsmokers Somebod...           10   
120717  5-Minute Crafts|DIY|Do it yourself|crafts|truc...           26   

          likes     views  comment_count  
0        787425  17158579         1258

In [4]:
# delete some categories because of lack of data
to_delete = [2, 15, 19, 29, 30, 43]
for cat in to_delete:
    youtube_df = youtube_df[youtube_df.category_id != cat]
youtube_df.head()

Unnamed: 0,tags,category_id,likes,views,comment_count
0,Eminem|Walk|On|Water|Aftermath/Shady/Interscop...,10,787425,17158579,125882
1,plush|bad unboxing|unboxing|fan mail|idubbbztv...,23,127794,1014651,13030
2,racist superman|rudy|mancuso|king|bach|racist|...,23,146035,3191434,8181
3,ryan|higa|higatv|nigahiga|i dare you|idy|rhpc|...,24,132239,2095828,17518
4,edsheeran|ed sheeran|acoustic|live|cover|offic...,10,1634130,33523622,85067


In [5]:
from sklearn.utils import resample


# Spliting dataframe into several dfs based on category_id
dfs = [x for _, x in youtube_df.groupby('category_id')]

# getting 2000 samples from each category
resampled_dfs = []
for df in dfs:
    if df.shape[0] < 2000:
        resampled_dfs += [resample(df, n_samples=2000)]
    else:
        resampled_dfs += [df.sample(n=2000)]
    
youtube_df = pd.concat(resampled_dfs, axis=0, ignore_index=True)
youtube_df.head()

Unnamed: 0,tags,category_id,likes,views,comment_count
0,Skate|Competition|I Tonya|I Tonya trailer|I To...,1,8010,750287,320
1,ue megablast|ue|megablast|ultimate ears|blueto...,1,27761,949272,2265
2,what|ever|happened|to|baby|jane|bette|davis|jo...,1,461,159090,118
3,電視劇|大陸電視劇|风筝|谍战|柳云龙|罗海琼|李小冉|马驰|liuyunlong|luoh...,1,20,19481,10
4,Blockbuster|blockbuster.dk|blockbuster.se|bloc...,1,40,12609,17


In [6]:
# mapping remaining category numbers to ordered integers
new_categories_dict = {1:1, 10:2, 17:3, 20:4, 22:5, 23:6, 24:7, 25:8, 26:9, 27:10, 28:11}
reversed_new_categories_dict = {value:key for key, value in new_categories_dict.items()}
categories = youtube_df['category_id']
youtube_df['category_id'] = categories.map(new_categories_dict)
youtube_df.head()

Unnamed: 0,tags,category_id,likes,views,comment_count
0,Skate|Competition|I Tonya|I Tonya trailer|I To...,1,8010,750287,320
1,ue megablast|ue|megablast|ultimate ears|blueto...,1,27761,949272,2265
2,what|ever|happened|to|baby|jane|bette|davis|jo...,1,461,159090,118
3,電視劇|大陸電視劇|风筝|谍战|柳云龙|罗海琼|李小冉|马驰|liuyunlong|luoh...,1,20,19481,10
4,Blockbuster|blockbuster.dk|blockbuster.se|bloc...,1,40,12609,17


In [7]:
from vocab_handler import get_tags_frequency, get_tags_vocab, get_tags_vocab_as_dict


tag_frequency = get_tags_frequency(youtube_df['tags']) # returns tag as key and tag frequency as value
lists = sorted(tag_frequency.items())

lists[:10]

[('', 5),
 (' 070118 💖', 1),
 (' 080118 💖', 0),
 (' 100% Will Get Satisfied', 0),
 (' 101217 💚', 1),
 (' 140118 💖', 0),
 (' 150118 💖', 2),
 (' 2018', 1),
 (' 210118 💖', 1),
 (' 220118 💖', 1)]

In [8]:
pruned_vocab = get_tags_vocab(tag_frequency, 10) #Returns tags vocab
vocab_di = get_tags_vocab_as_dict(pruned_vocab)

In [9]:
from sklearn.preprocessing import StandardScaler


# Perfroming normalization on integer datas
youtube_df['average'] = youtube_df[['likes', 'comment_count', 'views']].mean(axis=1)
scaler = StandardScaler()


youtube_df[['likes', 'comment_count', 'views', 'average']] = scaler.fit_transform(youtube_df[['likes', 'comment_count', 'views', 'average']])


In [10]:
max_of_tags = 96
 
# Create a dataframe by spliting each tag by '|'
youtube_df = create_seperate_columns(youtube_df, max_of_tags, 'tags')
youtube_df

Unnamed: 0,category_id,likes,views,comment_count,average,tag_0,tag_1,tag_2,tag_3,tag_4,...,tag_86,tag_87,tag_88,tag_89,tag_90,tag_91,tag_92,tag_93,tag_94,tag_95
0,1,-0.212267,0.110252,-0.256435,0.097961,Skate,Competition,I Tonya,I Tonya trailer,I Tonya movie,...,notags,notags,notags,notags,notags,notags,notags,notags,notags,notags
1,1,0.097033,0.226595,-0.062794,0.223328,ue megablast,ue,megablast,ultimate ears,bluetooth speaker,...,notags,notags,notags,notags,notags,notags,notags,notags,notags,notags
2,1,-0.330484,-0.235409,-0.276546,-0.242297,what,ever,happened,to,baby,...,notags,notags,notags,notags,notags,notags,notags,notags,notags,notags
3,1,-0.337390,-0.317036,-0.287298,-0.321919,電視劇,大陸電視劇,风筝,谍战,柳云龙,...,notags,notags,notags,notags,notags,notags,notags,notags,notags,notags
4,1,-0.337077,-0.321054,-0.286602,-0.325808,Blockbuster,blockbuster.dk,blockbuster.se,blockbuster.no,blockbuster.fi,...,notags,notags,notags,notags,notags,notags,notags,notags,notags,notags
5,1,-0.313838,-0.218203,-0.270373,-0.224940,Cinema,Trailer,Official,Movie,Film,...,notags,notags,notags,notags,notags,notags,notags,notags,notags,notags
6,1,-0.302970,-0.279275,-0.153890,-0.283220,Yasmine Amari,The voice,Mohamed Reghis,Cheb Nesro,Kader Japonais,...,notags,notags,notags,notags,notags,notags,notags,notags,notags,notags
7,1,-0.142486,-0.153724,-0.221391,-0.155795,Rooster Teeth,RT,animation,television,filmmaking,...,notags,notags,notags,notags,notags,notags,notags,notags,notags,notags
8,1,0.434160,0.276214,1.105526,0.290435,essential phone,essential,phone,smartphone,android,...,notags,notags,notags,notags,notags,notags,notags,notags,notags,notags
9,1,-0.230245,-0.233224,-0.207353,-0.236142,howtocookthat,stop motion animation,stop motion challenge,stop motion video,challenge,...,notags,notags,notags,notags,notags,notags,notags,notags,notags,notags


In [11]:
# One hot encoding tags
def one_hot(df, non_one_hot_columns):
    future_df = []
    col_row_dict = {}
    
    for i in range(0, 22000):
        for col in df.columns:
            if col in non_one_hot_columns:
                col_row_dict[col] = df[col][i]
            elif df[col][i] in pruned_vocab:
                    col_row_dict[df[col][i]] = 1.0
        future_df.append(col_row_dict)
        col_row_dict = {}
        
    return future_df

In [12]:
one_hot_list = one_hot(youtube_df, ['category_id', 'comment_count', 'likes', 'views', 'average'])
youtube_df = pd.DataFrame(one_hot_list)
youtube_df.fillna(0.0, inplace = True)
youtube_df

Unnamed: 0,category_id,likes,views,comment_count,average,trailer,2017,figure skating,sport,championship,...,scishow,complexly,fossils,natural history,waterjet channel,water jet cutting,waterjet,water cutting,water cutter,cutting with water
0,1,-0.212267,0.110252,-0.256435,0.097961,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.097033,0.226595,-0.062794,0.223328,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,-0.330484,-0.235409,-0.276546,-0.242297,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,-0.337390,-0.317036,-0.287298,-0.321919,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,-0.337077,-0.321054,-0.286602,-0.325808,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,-0.313838,-0.218203,-0.270373,-0.224940,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,-0.302970,-0.279275,-0.153890,-0.283220,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,-0.142486,-0.153724,-0.221391,-0.155795,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1,0.434160,0.276214,1.105526,0.290435,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1,-0.230245,-0.233224,-0.207353,-0.236142,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from input_generator import generate_dataset


# Creating train,validation and test examples from tags and labels dataframe
(train_ds, train_lb), (val_ds, val_lb), (test_ds, test_lb) = generate_dataset(youtube_df, 'category_id')

In [None]:
hidden = 200
model = tf.keras.Sequential([
    layers.Dense(200,activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(300,activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(300,activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(150,activation='relu'),
    layers.Dense(100,activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(12, activation='softmax')])

nadam = keras.optimizers.Nadam(lr=0.00001)

model.compile(optimizer=nadam,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'],
              )

history = model.fit(train_ds.values, train_lb.values,
                    batch_size=32,
                    epochs=100,
                    validation_data=(val_ds.values, val_lb.values))


W0825 16:03:25.754539  7756 nn_ops.py:4224] Large dropout rate: 0.6 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W0825 16:03:25.840600  7756 nn_ops.py:4224] Large dropout rate: 0.6 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Train on 13200 samples, validate on 4400 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500


Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
  160/13200 [..............................] - ETA: 14s - loss: 1.0669 - acc: 0.6375

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

history_dict = history.history
acc = history_dict['acc']
val_acc = history_dict['val_acc']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

plt.plot(acc)
plt.plot(val_acc)
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

# summarize history for loss
plt.plot(loss)
plt.plot(val_loss)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
results = model.evaluate(test_ds, test_lb)

print(results)

In [None]:
from sklearn.metrics import classification_report

predictions = model.predict(test_ds)
print(classification_report(test_lb, np.argmax(predictions, axis=1)))

In [None]:
from tag_handler import get_category_title_dict


cat_tit_dict = get_category_title_dict('data/json/CA_category_id.json')
cat_tit_dict.update(get_category_title_dict('data/json/US_category_id.json'))
cat_tit_dict.update(get_category_title_dict('data/json/GB_category_id.json'))

In [None]:
for i in range(30, 40):
    y_pred = reversed_new_categories_dict[np.argmax(predictions[i])]
    label = reversed_new_categories_dict[test_lb.values[i]]
    print('""""""""""""""""""""""""""""""""""""""""""""""""')
    print('predict:{}'.format(cat_tit_dict[str(y_pred)]))
    print('true label:{}'.format(cat_tit_dict[str(label)]))