# Combined Model

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Input,Dense, Embedding, LSTM, SpatialDropout1D
from keras.layers import Conv2D, Dropout, Concatenate,Average, BatchNormalization, MaxPooling2D, Flatten
from keras.utils import plot_model

from keras.callbacks import EarlyStopping

import h5py
from keras.models import load_model
from keras.models import Model

import pickle
import time
import datetime

FIXED_LENGTH_FOR_TRAINING = 15000

Using TensorFlow backend.


In [None]:
# MODEL_NAMES = ['Audio_Transcript','Title','Tags','Comments','Thumbnail','Video']

## Load the model

In [6]:
def _load_pickle(path):
    """Unpickle the object stored at `path` and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code. Only load artifacts that were
    # written by this notebook's own "Save the Model" section.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Previously saved multi-input network.
MERGED_model = load_model('final_model/combined_model.h5')

# Fitted tokenizers, one per text input.
AT_tokenizer = _load_pickle('final_model/audio_transcript_tokenizer.pkl')
T_tokenizer = _load_pickle('final_model/title_tokenizer.pkl')
TAGS_tokenizer = _load_pickle('final_model/tags_tokenizer.pkl')
C_tokenizer = _load_pickle('final_model/comments_tokenizer.pkl')

# Fitted preprocessing objects for the stats features.
# The scaler is bound as STATS_sc here; the training section below creates its own STATS_SC.
STATS_labelencoder = _load_pickle('final_model/stats_labelencoder.pkl')
STATS_sc = _load_pickle('final_model/stats_scaler.pkl')

In [7]:
# Print the layer-by-layer architecture of the loaded model.
MERGED_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
TB_IN (InputLayer)              (None, 90, 120, 3)   0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 90, 120, 64)  1792        TB_IN[0][0]                      
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 90, 120, 64)  36928       conv2d_1[0][0]                   
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 45, 60, 64)   0           conv2d_2[0][0]                   
__________________________________________________________________________________________________
conv2d_3 (

Load Audio Transcript Layers

In [None]:
# Copy the weights stored under audio_transcript/ into the matching layers of the combined model.
for layer_name, weights_file in (
    ('embedding_1', 'audio_transcript/embedding.npy'),
    ('AT_LSTM', 'audio_transcript/lstm.npy'),
    ('AT_OUT', 'audio_transcript/dense.npy'),
):
    layer_weights = np.load(weights_file, allow_pickle = True)
    MERGED_model.get_layer(layer_name).set_weights(layer_weights)

Load Title Layers

In [None]:
# Copy the weights stored under title/ into the matching layers of the combined model.
for layer_name, weights_file in (
    ('embedding_2', 'title/embedding.npy'),
    ('T_LSTM', 'title/lstm.npy'),
    ('T_OUT', 'title/dense.npy'),
):
    layer_weights = np.load(weights_file, allow_pickle = True)
    MERGED_model.get_layer(layer_name).set_weights(layer_weights)

Load Tags Layers

In [None]:
# Copy the tags model's weights into the tags branch of the combined model.
# All three files come from tags/: the LSTM and dense weights used to be read from
# title/, which loads without error (the shapes match) but installs the wrong weights.
for layer_name, weights_file in (
    ('embedding_3', 'tags/embedding.npy'),
    ('TAGS_LSTM', 'tags/lstm.npy'),
    ('TAGS_OUT', 'tags/dense.npy'),
):
    MERGED_model.get_layer(layer_name).set_weights(np.load(weights_file, allow_pickle = True))

Load Comments Layers

In [None]:
# Copy the comments model's weights into the comments branch of the combined model.
# All three files come from comments/: the LSTM and dense weights used to be read from
# title/, which loads without error (the shapes match) but installs the wrong weights.
for layer_name, weights_file in (
    ('embedding_4', 'comments/embedding.npy'),
    ('C_LSTM', 'comments/lstm.npy'),
    ('C_OUT', 'comments/dense.npy'),
):
    MERGED_model.get_layer(layer_name).set_weights(np.load(weights_file, allow_pickle = True))

Load Stats Layers

In [None]:
# Copy the stats model's dense weights into the stats branch of the combined model.
# dense_2 / dense_3 used to be read from title/, a copy-paste slip: the first layer
# already comes from stats/ and the title model exposes no dense_2 / dense_3 files above.
for layer_name, weights_file in (
    ('STATS_1', 'stats/dense_1.npy'),
    ('STATS_2', 'stats/dense_2.npy'),
    ('STATS_OUT', 'stats/dense_3.npy'),
):
    MERGED_model.get_layer(layer_name).set_weights(np.load(weights_file, allow_pickle = True))

Load Thumbnail Layers

In [None]:
# Copy the thumbnail CNN's weights into the thumbnail branch of the combined model.
# The files used to be read from comments/, which is the directory of the comments
# text model. NOTE(review): this assumes the CNN weights were exported next to the
# thumbnail image arrays under thumbnail/ -- confirm the directory before running.

# Thirteen convolution layers: conv2d_1 .. conv2d_13 <- conv_1.npy .. conv_13.npy
for conv_idx in range(1, 14):
    conv_weights = np.load('thumbnail/conv_%d.npy' % conv_idx, allow_pickle = True)
    MERGED_model.get_layer('conv2d_%d' % conv_idx).set_weights(conv_weights)

# Fully connected layers, ending with the thumbnail output head.
for layer_name, weights_file in (
    ('dense_1', 'thumbnail/dense_1.npy'),
    ('dense_2', 'thumbnail/dense_2.npy'),
    ('dense_3', 'thumbnail/dense_3.npy'),
    ('TB_OUT', 'thumbnail/dense_4.npy'),
):
    MERGED_model.get_layer(layer_name).set_weights(np.load(weights_file, allow_pickle = True))

Load Video Layers

## Reading the files

In [None]:
# Read the main table first, then the audio-transcript table (same order as before).
df, audio_data = (
    pd.read_csv(csv_path)
    for csv_path in ('final_model/Data.csv', 'audio_transcript/Data.csv')
)

In [None]:
# Keep only the first FIXED_LENGTH_FOR_TRAINING rows of each table.
df = df.head(FIXED_LENGTH_FOR_TRAINING)
audio_data = audio_data.head(FIXED_LENGTH_FOR_TRAINING)

In [None]:
# Thumbnail images and labels that were cached as .npy arrays (train pair, then test pair).
IMGS_X_train, IMGS_y_train = (
    np.load(npy_path)
    for npy_path in ('thumbnail/train_images.npy', 'thumbnail/train_labels.npy')
)
IMGS_X_test, IMGS_y_test = (
    np.load(npy_path)
    for npy_path in ('thumbnail/test_images.npy', 'thumbnail/test_labels.npy')
)

# train_path = 'thumbnail/data/train'
# test_path = 'thumbnail/data/test'
# train_batches = ImageDataGenerator().flow_from_directory(train_path,target_size=(90, 120),classes=['cb','ncb'],batch_size = 32)
# test_batches = ImageDataGenerator().flow_from_directory(test_path,target_size=(90, 120),classes=['cb','ncb'],batch_size = 32)

# count = 0
# train_imgs,train_labels = next(train_batches)
# for batch in train_batches:
    
#     a,b = batch
#     train_imgs = np.concatenate((train_imgs , a), axis = 0)
#     train_labels = np.concatenate((train_labels , b), axis = 0)
#     count += 1
#     print(train_imgs.shape)
    
#     if (train_imgs.shape[0] > 13499):
#         break
# #     print(str(count))

# count = 0
# test_imgs,test_labels = next(test_batches)
# for batch in test_batches:
#     a,b = batch
#     test_imgs = np.concatenate((test_imgs , a), axis = 0)
#     test_labels = np.concatenate((test_labels , b), axis = 0)
#     count += 1
#     print(test_imgs.shape)

#     if (test_imgs.shape[0] > 1499):
#         break
        
# np.save('thumbnail/train_images.npy', train_imgs)
# np.save('thumbnail/train_labels.npy', train_labels)
# np.save('thumbnail/test_images.npy', test_imgs)
# np.save('thumbnail/test_labels.npy', test_labels)

## Data Preprocessing

In [None]:
# Raw strings keep the regex backslashes literal; in a normal string '\[', '\]' and '\|'
# are invalid escape sequences that newer Python versions warn about.
# Characters that clean_text replaces with a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, a lowercase letter, space, '#', '+' or '_' is removed by clean_text.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop-word set that clean_text filters out (same expression as the assignment in the imports cell).
STOPWORDS = set(stopwords.words('english'))

In [None]:
def clean_text(text):
    """
        Normalise one raw text field before it is tokenised.

        text: a string. Any non-string value (for example the float NaN that
              pandas stores for an empty cell) is converted with str() first.

        return: the cleaned string -- lower-cased, REPLACE_BY_SPACE_RE matches
                replaced by spaces, BAD_SYMBOLS_RE matches removed, every 'x'
                character removed, and words found in STOPWORDS dropped.
                The remaining words are joined by single spaces.
    """
    # An isinstance check (rather than `type(text) == float`) also covers float
    # subclasses such as numpy.float64, as well as ints and None, none of which
    # have a .lower() method.
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # NOTE(review): this strips *every* 'x' (e.g. "example" -> "eample"), not only
    # masked placeholders. Left unchanged because the tokenizers in this notebook
    # are fitted on text cleaned this way.
    text = text.replace('x', '')
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

In [None]:
def convert_to_timestamp(s):
    """
        Convert an ISO-8601 UTC timestamp string to POSIX seconds.

        s: a string such as "2019-01-02T03:04:05.000Z"; the fractional part may
           hold 1-6 digits.

        return: seconds since the Unix epoch as a float

        raises: ValueError if `s` does not match the expected format
    """
    parsed = datetime.datetime.strptime(s, "%Y-%m-%dT%H:%M:%S.%fZ")
    # The trailing 'Z' means UTC. time.mktime() would interpret the fields as local
    # time, so the result used to depend on the time zone of the machine.
    return parsed.replace(tzinfo=datetime.timezone.utc).timestamp()

#### Clean the text

In [None]:
# Renumber both tables from 0; drop=True discards the old index instead of adding it as a column.
audio_data = audio_data.reset_index(drop=True)
df = df.reset_index(drop=True)

In [None]:
# Audio Transcript
# Audio Transcript: normalise the text, then strip every run of digits.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string
# by default; the raw string avoids the invalid '\d' escape.
audio_data['Audio_Transcript'] = (
    audio_data['Audio_Transcript']
    .apply(clean_text)
    .str.replace(r'\d+', '', regex=True)
)

In [None]:
# Title
# Title: normalise the text, then strip every run of digits.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string
# by default; the raw string avoids the invalid '\d' escape.
df['Title'] = df['Title'].apply(clean_text).str.replace(r'\d+', '', regex=True)

In [None]:
# Tags
# Tags: normalise the text, then strip every run of digits.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string
# by default; the raw string avoids the invalid '\d' escape.
df['Tags'] = df['Tags'].apply(clean_text).str.replace(r'\d+', '', regex=True)

In [None]:
# Comments
# Comments: normalise the text, then strip every run of digits.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string
# by default; the raw string avoids the invalid '\d' escape.
df['Comments'] = df['Comments'].apply(clean_text).str.replace(r'\d+', '', regex=True)

In [None]:
# Stats
# Stats
# Publication date: ISO-8601 string -> POSIX timestamp in seconds.
df['Date'] = df['Date'].apply(convert_to_timestamp)

# Duration: make the column numeric. errors='coerce' turns every non-numeric entry
# into NaN, which covers the 'None' placeholder and the ISO-8601 strings
# 'PT1M' .. 'PT13M'. Those strings used to be replaced on the 'Date' column (already
# numeric by then, so a no-op) and the to_numeric result was thrown away, which
# left 'Duration' unchanged.
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
# NOTE(review): rows with an unparsable duration now hold NaN. If 'Duration' is one of
# the columns fed to the stats model, impute or drop them first -- TODO confirm.

## Split into Train and Test Sets

In [None]:
# The maximum number of words to be used. (most frequent)
# The maximum number of words to be used (most frequent ones).
# Passed as num_words to every Tokenizer and as the first argument of every Embedding below.
MAX_NB_WORDS = 50000
# Size of each embedding vector (second argument of every Embedding below). This is fixed.
EMBEDDING_DIM = 100

#### Audio Transcript

In [None]:
# Every transcript is padded/truncated to this many tokens.
AT_MAX_SEQUENCE_LENGTH = 3000

# '\\]' spells the same two characters (backslash, ']') as before, without relying on
# the invalid escape sequence '\]' that newer Python versions warn about.
AT_tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
AT_tokenizer.fit_on_texts(audio_data['Audio_Transcript'].values)
word_index = AT_tokenizer.word_index

# Texts -> integer sequences -> fixed-width matrix.
AT_X = AT_tokenizer.texts_to_sequences(audio_data['Audio_Transcript'].values)
AT_X = pad_sequences(AT_X, maxlen=AT_MAX_SEQUENCE_LENGTH)

# One indicator column per distinct value of 'Label'.
AT_Y = pd.get_dummies(audio_data['Label']).values

# 90/10 split; random_state is fixed so the split is reproducible.
AT_X_train, AT_X_test, AT_Y_train, AT_Y_test = train_test_split(AT_X,AT_Y, test_size = 0.10, random_state = 42)
print(AT_X_train.shape,AT_Y_train.shape)
print(AT_X_test.shape,AT_Y_test.shape)

#### Title

In [None]:
# Every title is padded/truncated to this many tokens.
T_MAX_SEQUENCE_LENGTH = 3000

# '\\]' spells the same two characters (backslash, ']') as before, without relying on
# the invalid escape sequence '\]' that newer Python versions warn about.
T_tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
T_tokenizer.fit_on_texts(df['Title'].values)
word_index = T_tokenizer.word_index

# Texts -> integer sequences -> fixed-width matrix.
T_X = T_tokenizer.texts_to_sequences(df['Title'].values)
T_X = pad_sequences(T_X, maxlen=T_MAX_SEQUENCE_LENGTH)

# One indicator column per distinct value of 'Label'. Y is reused by the tags,
# comments and stats cells below.
Y = pd.get_dummies(df['Label']).values

# 90/10 split; random_state is fixed so the split is reproducible.
T_X_train, T_X_test, T_Y_train, T_Y_test = train_test_split(T_X,Y , test_size = 0.10, random_state = 42)
print(T_X_train.shape,T_Y_train.shape)
print(T_X_test.shape,T_Y_test.shape)

#### Tags

In [None]:
# Every tag string is padded/truncated to this many tokens.
TAGS_MAX_SEQUENCE_LENGTH = 3000

# '\\]' spells the same two characters (backslash, ']') as before, without relying on
# the invalid escape sequence '\]' that newer Python versions warn about.
TAGS_tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
TAGS_tokenizer.fit_on_texts(df['Tags'].values)
word_index = TAGS_tokenizer.word_index

# Texts -> integer sequences -> fixed-width matrix.
TAGS_X = TAGS_tokenizer.texts_to_sequences(df['Tags'].values)
TAGS_X = pad_sequences(TAGS_X, maxlen=TAGS_MAX_SEQUENCE_LENGTH)

# Split the *tags* matrix. This used to split T_X, so the tags branch was silently
# trained and evaluated on the title sequences.
# Y is the label matrix built in the Title cell.
TAGS_X_train, TAGS_X_test, TAGS_Y_train, TAGS_Y_test = train_test_split(TAGS_X,Y, test_size = 0.10, random_state = 42)
print(TAGS_X_train.shape,TAGS_Y_train.shape)
print(TAGS_X_test.shape,TAGS_Y_test.shape)

#### Comments

In [None]:
# Every comment string is padded/truncated to this many tokens.
C_MAX_SEQUENCE_LENGTH = 3000

# '\\]' spells the same two characters (backslash, ']') as before, without relying on
# the invalid escape sequence '\]' that newer Python versions warn about.
C_tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
C_tokenizer.fit_on_texts(df['Comments'].values)
word_index = C_tokenizer.word_index

# Texts -> integer sequences -> fixed-width matrix.
C_X = C_tokenizer.texts_to_sequences(df['Comments'].values)
C_X = pad_sequences(C_X, maxlen=C_MAX_SEQUENCE_LENGTH)

# 90/10 split; Y is the label matrix built in the Title cell.
C_X_train, C_X_test, C_Y_train, C_Y_test = train_test_split(C_X,Y, test_size = 0.10, random_state = 42)
print(C_X_train.shape,C_Y_train.shape)
print(C_X_test.shape,C_Y_test.shape)

#### Stats

In [None]:
# Stats features: columns 5..11 of df as a plain array.
STATS_X = df.iloc[:, 5:12].values

# Integer-encode the feature at position 1 (it is passed through a LabelEncoder,
# so presumably categorical -- verify against the CSV schema).
STATS_labelencoder = LabelEncoder()
STATS_X[:, 1] = STATS_labelencoder.fit_transform(STATS_X[:, 1])

# Same random_state (42) as the text splits above. With the previous value of 0 the
# stats rows were shuffled differently from the title/tags/comments rows, so row i
# of the stats input did not describe the same video as row i of the other inputs
# when they are fed to the combined model together.
STATS_X_train, STATS_X_test, STATS_Y_train, STATS_Y_test = train_test_split(STATS_X, Y, test_size = 0.10, random_state = 42)

# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
STATS_SC = StandardScaler()
STATS_X_train = STATS_SC.fit_transform(STATS_X_train)
STATS_X_test = STATS_SC.transform(STATS_X_test)

#### Thumbnail



#### Video

## Building the model

#### Audio Transcript

In [None]:
# Audio-transcript branch: Input -> Embedding -> SpatialDropout1D -> LSTM -> 2-way softmax head.
# The input width matches the padded matrix AT_X (AT_MAX_SEQUENCE_LENGTH columns).
AT_IN = Input(shape=(AT_MAX_SEQUENCE_LENGTH,), name='AT_IN')
# The Embedding is unnamed, so Keras assigns its name; presumably this is the layer the
# weight-loading cell addresses as 'embedding_1', which makes creation order matter -- TODO confirm.
# trainable = False freezes the embedding weights.
AT_EMB = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=AT_X.shape[1], trainable = False) (AT_IN)
AT_SPD = SpatialDropout1D(0.5) (AT_EMB)
# Frozen 100-unit LSTM; its output is also one of the tensors averaged in the combined network.
AT_LSTM = LSTM(100, dropout=0.5, recurrent_dropout=0.2, name="AT_LSTM", trainable = False) (AT_SPD)
# Per-branch output head (no trainable argument is given, so it keeps the Keras default).
AT_OUT = Dense(2, activation='softmax', name='AT_OUT') (AT_LSTM)

#### Title

In [None]:
# Title branch: Input -> Embedding -> SpatialDropout1D -> LSTM -> 2-way softmax head.
# The input width matches the padded matrix T_X (T_MAX_SEQUENCE_LENGTH columns).
T_IN = Input(shape=(T_MAX_SEQUENCE_LENGTH,), name='T_IN')
# The Embedding is unnamed, so Keras assigns its name; presumably this is the layer the
# weight-loading cell addresses as 'embedding_2', which makes creation order matter -- TODO confirm.
# trainable = False freezes the embedding weights.
T_EMB = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=T_X.shape[1], trainable = False) (T_IN)
T_SPD = SpatialDropout1D(0.5) (T_EMB)
# Frozen 100-unit LSTM; its output is also one of the tensors averaged in the combined network.
T_LSTM = LSTM(100, dropout=0.5, recurrent_dropout=0.2,name="T_LSTM", trainable = False) (T_SPD)
# Per-branch output head (no trainable argument is given, so it keeps the Keras default).
T_OUT = Dense(2, activation='softmax', name='T_OUT') (T_LSTM)

#### Tags

In [None]:
# Tags branch: Input -> Embedding -> SpatialDropout1D -> LSTM -> 2-way softmax head.
# The input width uses the tags constant. It used to borrow T_MAX_SEQUENCE_LENGTH, which
# only worked because both constants currently hold the same value.
TAGS_IN = Input(shape=(TAGS_MAX_SEQUENCE_LENGTH,), name='TAGS_IN')
# The Embedding is unnamed, so Keras assigns its name; presumably this is the layer the
# weight-loading cell addresses as 'embedding_3', which makes creation order matter -- TODO confirm.
# trainable = False freezes the embedding weights.
TAGS_EMB = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=TAGS_X.shape[1], trainable = False) (TAGS_IN)
TAGS_SPD = SpatialDropout1D(0.5) (TAGS_EMB)
# Frozen 100-unit LSTM; its output is also one of the tensors averaged in the combined network.
TAGS_LSTM = LSTM(100, dropout=0.5, recurrent_dropout=0.2,name="TAGS_LSTM", trainable = False) (TAGS_SPD)
# Per-branch output head (no trainable argument is given, so it keeps the Keras default).
TAGS_OUT = Dense(2, activation='softmax', name='TAGS_OUT') (TAGS_LSTM)

#### Comments

In [None]:
# Comments branch: Input -> Embedding -> SpatialDropout1D -> LSTM -> 2-way softmax head.
# The input width matches the padded matrix C_X (C_MAX_SEQUENCE_LENGTH columns).
C_IN = Input(shape=(C_MAX_SEQUENCE_LENGTH,), name='C_IN')
# The Embedding is unnamed, so Keras assigns its name; presumably this is the layer the
# weight-loading cell addresses as 'embedding_4', which makes creation order matter -- TODO confirm.
# trainable = False freezes the embedding weights.
C_EMB = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=C_X.shape[1], trainable = False) (C_IN)
C_SPD = SpatialDropout1D(0.5) (C_EMB)
# Frozen 100-unit LSTM; its output is also one of the tensors averaged in the combined network.
C_LSTM = LSTM(100, dropout=0.5, recurrent_dropout=0.2,name="C_LSTM", trainable = False) (C_SPD)
# Per-branch output head (no trainable argument is given, so it keeps the Keras default).
C_OUT = Dense(2, activation='softmax', name='C_OUT') (C_LSTM)

#### Stats

In [None]:
# Stats branch: Dense(10) -> BatchNorm -> Dense(100) -> BatchNorm -> 2-unit output head.
# The input has one feature per column of STATS_X.
STATS_IN = Input(shape=(STATS_X.shape[1],), name='STATS_IN')
STATS_1 = Dense(10, kernel_initializer='glorot_uniform', bias_initializer='zeros', activation='relu', name='STATS_1') (STATS_IN)
STATS_BATCH_1 = BatchNormalization() (STATS_1)
STATS_2 = Dense(100, kernel_initializer='glorot_uniform', bias_initializer='zeros', activation='relu', name='STATS_2') (STATS_BATCH_1)
# This 100-wide normalised tensor is the one averaged with the other branches in the combined network.
STATS_BATCH_2 = BatchNormalization() (STATS_2)
# NOTE(review): this head uses 'sigmoid' on 2 units, while every other head uses 'softmax'
# and the model is compiled with categorical_crossentropy -- confirm this is intended.
STATS_OUT = Dense(2, kernel_initializer='glorot_uniform', bias_initializer='zeros', activation='sigmoid', name='STATS_OUT') (STATS_BATCH_2)



# STATS_IN = Sequential()
# classifier.add(Dense(output_dim = 10, kernel_initializer='glorot_uniform', bias_initializer='zeros', activation = 'relu', input_dim = 7))
# classifier.add(BatchNormalization())
# classifier.add(Dense(output_dim = 100,kernel_initializer='glorot_uniform', bias_initializer='zeros', activation = 'relu'))
# classifier.add(BatchNormalization())
# classifier.add(Dense(output_dim = 1,  kernel_initializer='glorot_uniform', bias_initializer='zeros', activation = 'sigmoid'))

#### Thumbnail

In [None]:
# Thumbnail branch: a VGG-16-style stack (five conv blocks separated by 2x2 max pooling,
# then two 4096-unit dense layers), followed by a 100-unit dense layer and a 2-way softmax head.
# The conv and dense layers here are unnamed, so Keras assigns their names; the weight-loading
# cell addresses 'conv2d_1'..'conv2d_13' and 'dense_1'..'dense_3' -- presumably these layers,
# which makes the creation order below significant (TODO confirm).
# Input shape is taken from the cached image array: axes 1..3 of IMGS_X_train.
TB_IN = Input(shape=(IMGS_X_train.shape[1],IMGS_X_train.shape[2],IMGS_X_train.shape[3],), name='TB_IN')
# Block 1 (64 filters).
# NOTE(review): this first conv layer has no trainable = False, unlike every other conv
# layer in this branch -- confirm whether it is meant to stay trainable.
TB_1 = Conv2D(64, (3, 3), padding='same', activation='relu') (TB_IN)
TB_2 = Conv2D(64, (3, 3), activation='relu', padding='same', trainable = False) (TB_1)
TB_3 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2)) (TB_2)
# Block 2 (128 filters).
TB_4 = Conv2D(128, (3, 3), activation='relu', padding='same', trainable = False) (TB_3)
TB_5 = Conv2D(128, (3, 3), activation='relu', padding='same', trainable = False) (TB_4)
TB_6 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2)) (TB_5)
# Block 3 (256 filters).
TB_7 = Conv2D(256, (3, 3), activation='relu', padding='same', trainable = False) (TB_6)
TB_8 = Conv2D(256, (3, 3), activation='relu', padding='same', trainable = False) (TB_7)
TB_9 = Conv2D(256, (3, 3), activation='relu', padding='same', trainable = False) (TB_8)
TB_10 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2)) (TB_9)
# Block 4 (512 filters).
TB_11 = Conv2D(512, (3, 3), activation='relu', padding='same', trainable = False) (TB_10)
TB_12 = Conv2D(512, (3, 3), activation='relu', padding='same', trainable = False) (TB_11)
TB_13 = Conv2D(512, (3, 3), activation='relu', padding='same', trainable = False) (TB_12)
TB_14 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2)) (TB_13)
# Block 5 (512 filters).
TB_15 = Conv2D(512, (3, 3), activation='relu', padding='same', trainable = False) (TB_14)
TB_16 = Conv2D(512, (3, 3), activation='relu', padding='same', trainable = False) (TB_15)
TB_17 = Conv2D(512, (3, 3), activation='relu', padding='same', trainable = False) (TB_16)
TB_18 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2)) (TB_17)
# Classifier part: flatten, two frozen 4096-unit layers, then a frozen 100-unit layer.
TB_19 = Flatten() (TB_18)
TB_20 = Dense(4096, activation='relu', trainable = False) (TB_19)
TB_21 = Dense(4096, activation='relu', trainable = False) (TB_20)
# This 100-wide tensor is the one averaged with the other branches in the combined network.
TB_DENSE = Dense(100, activation='relu', trainable = False ) (TB_21)
# Per-branch output head.
TB_OUT = Dense(2, activation='softmax', name='TB_OUT') (TB_DENSE)

#### Video

### Combined Network

In [None]:
# Fuse the six branches by element-wise averaging of their 100-wide feature tensors
# (four LSTM outputs, the stats BatchNorm output and the thumbnail dense output).
MERGED = Average()([AT_LSTM, T_LSTM, TAGS_LSTM, C_LSTM, STATS_BATCH_2, TB_DENSE ])
# The name MERGED is rebound at every step, so only the final output tensor stays reachable.
MERGED = Dense(300, activation='relu') (MERGED)
MERGED = Dropout(0.2) (MERGED)
# Final 2-way softmax prediction of the combined network.
MERGED = Dense(2, activation='softmax', name='FINAL_OUT') (MERGED)

In [None]:
# Six inputs, seven outputs: one head per branch plus the fused FINAL_OUT head.
# The order of these lists fixes the order of the arrays passed to fit / evaluate / predict.
MERGED_model = Model(inputs=[AT_IN,T_IN, TAGS_IN, C_IN, STATS_IN, TB_IN],outputs=[AT_OUT,T_OUT,TAGS_OUT, C_OUT,STATS_OUT,TB_OUT,MERGED])
# A single loss string is given, so the same loss is used for all seven outputs.
MERGED_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
MERGED_model.summary()

In [None]:
epochs = 1
batch_size = 16

# Inputs and targets are listed in the order used when the Model was constructed:
# six inputs, then one target per branch head plus the title labels for FINAL_OUT.
# The stats target is STATS_Y_train (capital Y), as produced by train_test_split;
# the previous spelling STATS_y_train was never defined and raised a NameError.
# NOTE(review): IMGS_X_train / IMGS_y_train come from cached .npy files rather than
# from train_test_split, so their row order cannot be checked against the other
# inputs from here -- confirm the samples line up.
history = MERGED_model.fit(
    [AT_X_train, T_X_train, TAGS_X_train, C_X_train, STATS_X_train, IMGS_X_train],
    [AT_Y_train, T_Y_train, TAGS_Y_train, C_Y_train, STATS_Y_train, IMGS_y_train, T_Y_train],
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,  # hold out 10% of the training arrays for validation
    # With epochs = 1 the patience of 3 can never be exhausted; the callback only
    # becomes effective once epochs is raised.
    callbacks=[EarlyStopping(monitor='val_loss',
                             patience=3,
                             min_delta=0.0001)
              ]
)





## Evaluating the Model

In [None]:
# Evaluate on the held-out arrays, in the same input/output order as used for training.
# The thumbnail labels are IMGS_y_test (lowercase y), as loaded from
# thumbnail/test_labels.npy; the previous spelling IMGS_Y_test was never defined
# and raised a NameError.
accr = MERGED_model.evaluate([AT_X_test,T_X_test,TAGS_X_test, C_X_test, STATS_X_test,IMGS_X_test],
                             [AT_Y_test,T_Y_test,TAGS_Y_test, C_Y_test, STATS_Y_test, IMGS_y_test, T_Y_test])

In [None]:
# Display the values returned by MERGED_model.evaluate above.
accr

## Save the Model

In [None]:
# Writes to the same path that the "Load the model" section reads with load_model.
MERGED_model.save('final_model/combined_model.h5')  # creates a HDF5 file

In [None]:
# Persist the fitted tokenizers and the stats preprocessing objects next to the model.
# Each file is opened in a `with` block so the handle is flushed and closed right away
# (the previous open(...) calls were never closed).
for artifact, pkl_path in (
    (AT_tokenizer, 'final_model/audio_transcript_tokenizer.pkl'),
    (T_tokenizer, 'final_model/title_tokenizer.pkl'),
    (TAGS_tokenizer, 'final_model/tags_tokenizer.pkl'),
    (C_tokenizer, 'final_model/comments_tokenizer.pkl'),
    (STATS_SC, 'final_model/stats_scaler.pkl'),
    (STATS_labelencoder, 'final_model/stats_labelencoder.pkl'),
):
    with open(pkl_path, 'wb') as handle:
        pickle.dump(artifact, handle)

## Running Inference with the Model

In [None]:
# One example video to run through the model, keyed by input name.
# Only 'title' and 'audio_transcript' are filled in; the remaining fields are empty strings.
inference_video = {
    "title" : 'Formula 1 Engineering Tour',
    "audio_transcript" : 'hello there my name is Jonathan Edels Im the chief race engineer for Scuderia Toro Rosso I believe a couple of weeks ago our chief mechanic Tommy took you around the the garage introduced you to the garage garage operations what goes on in there Im hopefully going to do something similar but from the engineer insight trackside sir Ive just introduced you to the the pit wall what',
    'tags': '',
    'comments': '',
    'stats': '',
    'thumbnail' : '',
    'video' : ''
}


In [None]:
# Turn each text field of the example into a padded integer sequence, using the
# tokenizer and sequence length that belong to that field.
# NOTE(review): the training text went through clean_text and digit stripping before
# tokenisation; the raw strings here do not -- confirm whether inference should
# apply the same cleaning.

AT_MAX_SEQUENCE_LENGTH = 3000
A_seq = AT_tokenizer.texts_to_sequences([inference_video['audio_transcript']])
AT_padded = pad_sequences(A_seq, maxlen=AT_MAX_SEQUENCE_LENGTH)

T_MAX_SEQUENCE_LENGTH = 3000
T_seq = T_tokenizer.texts_to_sequences([inference_video['title']])
T_padded = pad_sequences(T_seq, maxlen=T_MAX_SEQUENCE_LENGTH)

TAGS_MAX_SEQUENCE_LENGTH = 3000
TAGS_seq = TAGS_tokenizer.texts_to_sequences([inference_video['tags']])
# Pad with the tags constant (previously the title constant was used here).
TAGS_padded = pad_sequences(TAGS_seq, maxlen=TAGS_MAX_SEQUENCE_LENGTH)

C_MAX_SEQUENCE_LENGTH = 3000
# Tokenise the *comments* field; this used to read inference_video['tags'].
C_seq = C_tokenizer.texts_to_sequences([inference_video['comments']])
C_padded = pad_sequences(C_seq, maxlen=C_MAX_SEQUENCE_LENGTH)



In [None]:
# `time` is already imported in the first cell, so the repeated import was dropped.

# NOTE(review): the model defined in this notebook takes six inputs
# (AT_IN, T_IN, TAGS_IN, C_IN, STATS_IN, TB_IN), but only the four text inputs are
# supplied here. The stats features and the thumbnail image of the example still
# need to be prepared and appended, in that order, before this call can succeed.
start = time.time()
pred = MERGED_model.predict([AT_padded,T_padded, TAGS_padded, C_padded], verbose=1, steps=1)

# Wall-clock duration of the predict call, in seconds.
time_taken = time.time() - start
# Display names for the two classes (not used by the code in this cell).
labels = ["Non-ClickBait","ClickBait"]

print("Time Taken to predict : " + str(time_taken))

In [None]:
# Display the raw output of MERGED_model.predict from the previous cell.
pred