In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
from tensorflow import keras
import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split


from input_generator import load_dataset, load_dataset_with_lables
from data_parser import get_tags_and_labels, get_vocab, get_tags

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Get dataframe with tags and category_id columns
csvpath = 'data/CAvideos.cvs'
tab_dataframe = get_tags_and_labels(csvpath)

# Creating different coloumn based on different tags
new = tab_dataframe["tags"].str.split(".", expand = True)

max_of_tags = 124

for i in range(0, max_of_tags):
    name = "tags"+str(i)
    tab_dataframe[name] = new[i]

# Dropping old tags columns 
tab_dataframe.drop(columns =["tags"], inplace = True) 
tab_dataframe.fillna("notags", inplace = True) 


tab_dataframe.head()

Unnamed: 0,category_id,tags0,tags1,tags2,tags3,tags4,tags5,tags6,tags7,tags8,tags9,tags10,tags11,tags12,tags13,tags14
0,10,Eminem,Walk,On,Water,Aftermath/Shady/Interscope,Rap,notags,notags,notags,notags,notags,notags,notags,notags,notags
1,23,plush,bad unboxing,unboxing,fan mail,idubbbztv,idubbbztv2,things,best,packages,plushies,chontent chop,notags,notags,notags,notags
2,23,racist superman,rudy,mancuso,king,bach,racist,superman,love,rudy mancuso poo bear black white official mus...,iphone x by pineapple,lelepons,hannahstocking,rudymancuso,inanna,anwar
3,24,ryan,higa,higatv,nigahiga,i dare you,idy,rhpc,dares,no truth,comments,comedy,funny,stupid,fail,notags
4,10,edsheeran,ed sheeran,acoustic,live,cover,official,remix,official video,lyrics,session,notags,notags,notags,notags,notags


In [3]:
# Getting leaning vocab
vocab = get_vocab(get_tags(csvpath))
vocab_size = len(vocab)

In [4]:
voc_df = pd.DataFrame(columns=np.arange(vocab_size+1))

voc_df.rename(columns={i:'tags{}'.format(i-1) for i in range(1, vocab_size)},
                 inplace=True)
voc_df.rename(columns={0:'category_id'},
                 inplace=True)

voc_df.head()

Unnamed: 0,category_id,tags0,tags1,tags2,tags3,tags4,tags5,tags6,tags7,tags8,...,tags151503,tags151504,tags151505,tags151506,tags151507,tags151508,tags151509,tags151510,tags151511,151513


In [5]:
voc_di = {i: vo for i, vo in enumerate(vocab)}
voc_di[0] = 'notags'
voc_di = {vo: i for i, vo in voc_di.items()}

# Mapping tags to indexes in vocab
for index in range(0, max_of_tags):
    tab_dataframe['tags{}'.format(index)] = tab_dataframe['tags{}'.format(index)].map(voc_di)

tab_dataframe.fillna(0.0, inplace = True) 
tab_dataframe.head()


Unnamed: 0,category_id,tags0,tags1,tags2,tags3,tags4,tags5,tags6,tags7,tags8,tags9,tags10,tags11,tags12,tags13,tags14
0,10,145858.0,41075.0,144054.0,113739.0,29048.0,136795.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23,132021.0,7417.0,140429.0,2762.0,91161.0,59128.0,136918.0,116578.0,139083.0,148058.0,68178.0,0.0,0.0,0.0,0.0
2,23,144322.0,39265.0,137958.0,86062.0,67686.0,114176.0,12098.0,23443.0,7918.0,120483.0,55703.0,108335.0,95213.0,56693.0,11206.0
3,24,79531.0,146181.0,38066.0,105620.0,22353.0,66285.0,18847.0,47640.0,43869.0,142624.0,71912.0,112521.0,51266.0,144611.0,0.0
4,10,55070.0,142371.0,38037.0,32486.0,18041.0,147534.0,72940.0,69098.0,133560.0,85182.0,0.0,0.0,0.0,0.0,0.0


In [None]:
for row in tab_dataframe.rows:
    for data in row:
        

In [None]:
# Create train,validation and test examples from dataframe
# train_ds, val_ds, test_ds = load_dataset(tab_dataframe)
(train_ds, train_lb), (val_ds, val_lb), (test_ds, test_lb) = load_dataset_with_lables(tab_dataframe, 'category_id')


In [None]:
# first 5 training examples
print(train_ds.values[5:10])
train_lb.values[0:5]

In [None]:
hidden = 32
model = tf.keras.Sequential([
    layers.Flatten(input_shape=(max_of_tags,)),
    layers.Dense(hidden,activation='relu'),
    layers.Dropout(0.7),
    layers.Dense(2*hidden,activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.7),
    layers.Dense(44, activation='softmax', kernel_regularizer=keras.regularizers.l2(0.01))])

model.compile(optimizer='nadam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy', 'sparse_categorical_crossentropy'],
              )

history = model.fit(train_ds.values, train_lb.values,
                    batch_size=64,
                    epochs=100,
                    validation_data=(val_ds, val_lb))
