In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#      for filename in filenames:
#          print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords 
import re 
import string 
import tensorflow as tf 
from tensorflow.keras.models import Model 
from tensorflow.keras.layers import Conv2D , MaxPooling2D , Dense  , BatchNormalization , LSTM  , Concatenate , Input,Embedding,Flatten
from tensorflow.keras.applications import resnet_v2 


In [None]:
# load the data
# The dataset folder can be overridden with the POG_DATA_DIR environment
# variable; when it is unset the original local download location is used.
pog_data_dir = os.environ.get('POG_DATA_DIR', 'C:/Users/DELL/Downloads/kaggle-pog-series-s01e01')
numric_train_data = pd.read_parquet(f'{pog_data_dir}/train.parquet')
numric_test_data = pd.read_parquet(f'{pog_data_dir}/test.parquet')

# identify key columns
text_columns = ['title', 'tags', 'description']
categrical_data = ['categoryId', 'comments_disabled', 'ratings_disabled']
numric_columns = ['view_count', 'likes', 'dislikes', 'comment_count', 'duration_seconds']


# function to process the data
def process_data(data):
    """Return a cleaned copy of ``data``; the frame passed in is left untouched.

    Steps applied to the copy:
      * missing ``duration_seconds`` values are replaced by the column mean,
      * the columns listed in ``categrical_data`` are one-hot encoded and the
        original categorical columns are dropped,
      * ``has_thumbnail``, ``channelId``, ``channelTitle`` and ``id`` are dropped,
      * ``trending_date`` is converted with ``pd.to_datetime``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named above.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the processed columns.
    """
    # Work on a copy so re-running the cell (or reusing the raw frame elsewhere)
    # always starts from the unmodified input.
    data = data.copy()
    data['duration_seconds'] = data['duration_seconds'].fillna(data['duration_seconds'].mean())

    # Casting to 'category' makes get_dummies encode numeric/bool columns too.
    cats_columns = pd.get_dummies(data[categrical_data].astype('category'))
    data = data.drop(categrical_data, axis=1)
    data = pd.concat([data, cats_columns], axis=1)

    data = data.drop(['has_thumbnail', 'channelId', 'channelTitle', 'id'], axis=1)
    data['trending_date'] = pd.to_datetime(data['trending_date'])
    return data

train_data_updated = process_data(numric_train_data)
test_data_updated = process_data(numric_test_data)

# Standardise the numeric columns BEFORE merging with the image table.
# `merge` returns a new frame, so scaling `train_data_updated` after the merge
# would never reach `train_data_frame`.
# The test frame is scaled with statistics learned from the training rows,
# restricted to the numeric columns it actually contains.
test_numric_columns = [col for col in numric_columns if col in test_data_updated.columns]
if test_numric_columns:
    test_scaler = StandardScaler().fit(train_data_updated[test_numric_columns])
    test_data_updated[test_numric_columns] = test_scaler.transform(test_data_updated[test_numric_columns])

scaler = StandardScaler()
train_data_updated[numric_columns] = scaler.fit_transform(train_data_updated[numric_columns])

# loading the image paths and the video id encoded in each file name
from pathlib import Path
# The dataset folder can be overridden with the POG_DATA_DIR environment
# variable; when it is unset the original local download location is used.
main = Path(os.environ.get('POG_DATA_DIR', 'C:/Users/DELL/Downloads/kaggle-pog-series-s01e01')) / 'thumbnails'
images_paths = list(main.glob(r'**/*.jpg'))
images_id = [image_path.stem for image_path in images_paths]  # file name without the extension
images_S = pd.Series(images_paths, name='path').astype('str')
id_S = pd.Series(images_id, name='id')
images_df = pd.concat([images_S, id_S], axis=1)

# merge the two dataframes to map each row to its image
train_data_frame = images_df.merge(train_data_updated, how='inner', left_on='id', right_on='video_id')
test_data_frame = images_df.merge(test_data_updated, how='inner', left_on='id', right_on='video_id')

test_data_frame['description'] = test_data_frame['description'].fillna('empty')
train_data_frame['description'] = train_data_frame['description'].fillna('empty')


In [None]:
# some text processing 
def remove_punc(text):
    """Delete every square-bracketed span (brackets included) from ``text``.

    Despite the name, ordinary punctuation is left in place; only ``[...]``
    groups are removed.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` with all ``[...]`` groups stripped out.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\[[^]]*\]', '', text)
def remove_chars(text):
    """Swap each character that is not an ASCII letter for a single space."""
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", text)
def remove_stop_words(text):
    """Lower-case ``text``, drop English stop words and stem what remains.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The surviving tokens, Porter-stemmed and joined by single spaces.
    """
    # Build the stop-word set once per call rather than once per token.
    english_stop_words = set(stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(
        stemmer.stem(word)
        for word in nltk.word_tokenize(text.lower())
        if word not in english_stop_words
    )
def cleaning(text):
    """Normalise one text cell: keep letters only, drop stop words, stem.

    Parameters
    ----------
    text : str or missing value
        Anything that is not a ``str`` (e.g. ``None`` or ``NaN`` coming from a
        pandas column) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        # re.sub / str.lower would raise on None or float NaN.
        return ''
    text = remove_chars(text)
    text = remove_stop_words(text)
    return text


In [None]:
# take a subset of the data because of the computational cost # you can change it
# .copy() makes the subset an independent frame, so the column assignments
# below do not write into a slice of train_data_frame.
short_data_frame = train_data_frame.iloc[:5000, :].copy()

# applying text processing to train and test data
for col in text_columns:
    short_data_frame[col] = short_data_frame[col].apply(cleaning)
    test_data_frame[col] = test_data_frame[col].apply(cleaning)


max_f = 10000
max_len = 300

from tensorflow.keras.preprocessing import text, sequence

# Fit the vocabulary ONCE, on the training text of every column, before any
# sequence is produced. fit_on_texts rebuilds the word index on each call, so
# fitting again between encodings (or on the test set) would give the same
# word different ids in different arrays.
tock = text.Tokenizer(num_words=max_f)
for col in text_columns:
    tock.fit_on_texts(short_data_frame[col])

# Each column becomes a (rows, max_len) id matrix; the columns are then placed
# side by side in the order given by text_columns.
text_features = {
    col: sequence.pad_sequences(tock.texts_to_sequences(short_data_frame[col]), maxlen=max_len)
    for col in text_columns
}
text_train_data = np.concatenate([text_features[col] for col in text_columns], axis=1)

# The test text is only transformed with the training vocabulary.
test_text_features = {
    col: sequence.pad_sequences(tock.texts_to_sequences(test_data_frame[col]), maxlen=max_len)
    for col in text_columns
}
text_test_data = np.concatenate([test_text_features[col] for col in text_columns], axis=1)

In [None]:
# load and process images

# Train and test images must go through the SAME preprocessing.
# resnet_v2.preprocess_input does the pixel scaling the ResNetV2 backbone
# expects, so no extra `rescale` is applied on top of it.
train_img_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=tf.keras.applications.resnet_v2.preprocess_input
)
test_img_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=tf.keras.applications.resnet_v2.preprocess_input
)

# shuffle=False keeps the image order identical to the dataframe row order,
# which is what aligns the images with the text / numeric arrays.
X_img_train = train_img_generator.flow_from_dataframe(
    dataframe=short_data_frame,
    x_col='path',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode=None,
    batch_size=32,
    shuffle=False,
    seed=42,
)

X_img_test = test_img_generator.flow_from_dataframe(
    dataframe=test_data_frame,
    x_col='path',
    target_size=(224, 224),  # must match the model's (224, 224, 3) image input
    color_mode='rgb',
    class_mode=None,
    batch_size=32,
    shuffle=False,
    seed=42,
)

# convert images into numpy to avoid adapting error while training the model
# (one pass over each iterator, batches stacked along the first axis)
X_img_train = np.concatenate([next(X_img_train) for _ in range(len(X_img_train))])
X_img_test = np.concatenate([next(X_img_test) for _ in range(len(X_img_test))])

In [None]:
# set the numeric data
numric_dummy_columns = [
    'categoryId_1', 'categoryId_2', 'categoryId_10', 'categoryId_17',
    'categoryId_20', 'categoryId_22', 'categoryId_23', 'categoryId_24',
    'categoryId_25', 'categoryId_26', 'categoryId_27', 'categoryId_28',
    'comments_disabled_False', 'comments_disabled_True',
    'ratings_disabled_False', 'ratings_disabled_True',
]


def build_numric_features(frame):
    """Assemble the numeric model input for ``frame`` as a float32 tensor.

    Column order: the one-hot columns in ``numric_dummy_columns``,
    ``duration_seconds``, then publish/trend year, month and day taken from
    ``publishedAt`` and ``trending_date``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``duration_seconds`` plus datetime-typed ``publishedAt``
        and ``trending_date`` columns.

    Returns
    -------
    tf.Tensor
        float32 tensor with one row per row of ``frame``.
    """
    # reindex returns a new frame (no writes into a slice of `frame`) and fills
    # one-hot columns that are absent from this frame with zeros, so train and
    # test always end up with the same width.
    features = frame.reindex(columns=numric_dummy_columns, fill_value=0)
    features['duration_seconds'] = frame['duration_seconds']
    features['publish_year'] = frame['publishedAt'].dt.year
    features['trend_year'] = frame['trending_date'].dt.year
    features['publish_month'] = frame['publishedAt'].dt.month
    features['trend_month'] = frame['trending_date'].dt.month
    features['publish_day'] = frame['publishedAt'].dt.day
    features['trend_day'] = frame['trending_date'].dt.day
    # Cast column-wise first: a frame mixing bool and numeric columns would
    # otherwise turn into an object array that TensorFlow cannot convert.
    return tf.cast(features.astype('float32').to_numpy(), tf.float32)


numrical_data = build_numric_features(short_data_frame)
numrical_test_data = build_numric_features(test_data_frame)


In [None]:
# Regression targets for the training subset, as a plain NumPy array.
labels = short_data_frame['target'].to_numpy(copy=True)

In [None]:

# ImageNet-pretrained ResNet152V2 without its classification head; the
# weights are frozen so it is used purely as an image feature extractor.
image_feature_extractor = resnet_v2.ResNet152V2(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)
image_feature_extractor.trainable = False

In [None]:
# the model
# Three branches (image, numeric, text) are concatenated and fed to a small
# dense head with a single regression output.

# img layers
# Reuses the frozen `image_feature_extractor` created in the previous cell
# instead of instantiating the ResNet152V2 backbone a second time.
img_input = Input((224, 224, 3))
img_features = image_feature_extractor(img_input)
img_features = Flatten()(img_features)
img_features = Dense(256, activation='relu')(img_features)
img_features = BatchNormalization()(img_features)
#######

# numeric layers
# Input width is read from the prepared feature matrix so it cannot drift
# out of sync with the number of numeric columns that were built.
numric_input = Input((numrical_data.shape[1],))
numric_features = Dense(256, activation='relu')(numric_input)
numric_features = Dense(256, activation='relu')(numric_features)
numric_features = BatchNormalization()(numric_features)
numric_features = Dense(128, activation='relu')(numric_features)
numric_features = Dense(128, activation='relu')(numric_features)
numric_features = BatchNormalization()(numric_features)
#####

# text layers
# Width of the concatenated, padded token-id matrix.
text_input = Input((text_train_data.shape[1],))
embed = Embedding(max_f, 100)(text_input)
lstm = LSTM(50)(embed)
# NOTE(review): 265 units here and in the head below may be a typo for 256 - confirm.
text_features = Dense(265, activation='relu')(lstm)
text_features = Dense(128, activation='relu')(text_features)
text_features = BatchNormalization()(text_features)
#######


# Input order of the model: image, numeric, text.
cocat = Concatenate()([img_features, numric_features, text_features])
X = Dense(265, activation='relu')(cocat)
X = Dense(128, activation='relu')(X)
X = BatchNormalization()(X)
X = Dense(64, activation='relu')(X)
output = Dense(1)(X)
model = Model([img_input, numric_input, text_input], output)


In [None]:
# The model has three inputs but a single output, so exactly one loss is given.
model.compile(optimizer='adam', loss='mae', metrics=['mae'])

In [None]:
# First 4000 rows train the model, the remaining rows validate it.
# Inputs are passed in the model's order: image, numeric, text.
train_rows = slice(None, 4000)
val_rows = slice(4000, None)
model.fit(
    [X_img_train[train_rows], numrical_data[train_rows], text_train_data[train_rows]],
    labels[train_rows],
    validation_data=(
        [X_img_train[val_rows], numrical_data[val_rows], text_train_data[val_rows]],
        labels[val_rows],
    ),
    epochs=50,
)

In [None]:
# Inputs must follow the order the model was built with: image, numeric, text.
preds = model.predict([X_img_test, numrical_test_data, text_test_data])