#Importing libraries

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Conv2D, GlobalMaxPooling2D, Concatenate, Dense, Flatten
from tensorflow.keras.models import Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_hub as hub
import requests
from io import BytesIO
import cv2

#Importing the train and test dataset

In [None]:
# Download the training spreadsheet from Google Drive by file id
# (the captured output shows it is saved as /content/content_simulation_train.xlsx).
!gdown 174YvQO7mdg0tXF_e-gLkd0zpcKdwNbSz

Downloading...
From: https://drive.google.com/uc?id=174YvQO7mdg0tXF_e-gLkd0zpcKdwNbSz
To: /content/content_simulation_train.xlsx
100% 56.2M/56.2M [00:01<00:00, 47.6MB/s]


In [None]:
# Load the downloaded training spreadsheet into a DataFrame.
df = pd.read_excel("/content/content_simulation_train.xlsx")

In [None]:
# Show the raw column names before any preprocessing.
df.columns

Index(['id', 'date', 'likes', 'content', 'username', 'media',
       'inferred company'],
      dtype='object')

In [None]:
import re
# Compiled once at definition time instead of on every call.
# Only symbol/pictograph ranges are listed. The previous catch-all range
# U+24C2..U+1F251 also covered CJK ideographs, kana, Hangul and most other
# scripts above U+24C2, so ordinary non-Latin text was deleted with the emoji.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F700-\U0001F77F"  # alchemical symbols
                            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            "\U0001FA00-\U0001FA6F"  # Chess Symbols
                            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                            "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs, flag letters
                            "\U000025A0-\U000027BF"  # Geometric Shapes, Miscellaneous Symbols, Dingbats
                            "\U00002900-\U0000297F"  # Supplemental Arrows-B
                            "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
                            "\U000024C2"             # circled M
                            "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block symbols used as emoji
                            "\U0000FE0F"             # variation selector-16 (emoji presentation)
                            "]+")


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Letters of any script (including Chinese, Japanese and Korean) are left
    untouched; only characters in the symbol ranges of ``_EMOJI_PATTERN`` are
    stripped. Surrounding whitespace is not collapsed.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The string with every matched run of symbols replaced by ''.
    """
    return _EMOJI_PATTERN.sub('', text)

def get_counts(df):
  """Add token-count columns and strip placeholders/hashtags from 'content'.

  Adds 'mention_count', 'hyperlink_count', 'hashtag_count' and 'word_count',
  all computed on the whitespace-split original text, then rewrites 'content'
  with the '<mention>' / '<hyperlink>' placeholders and every '#'-prefixed
  token removed. The frame is modified in place and also returned.
  """
  mention = "<mention>"
  hyperlink = "<hyperlink>"

  # Tokenise once; every count below is taken from the untouched text.
  token_lists = df['content'].apply(lambda text: text.split())

  df['mention_count'] = token_lists.apply(lambda tokens: tokens.count(mention))
  df["hyperlink_count"] = token_lists.apply(lambda tokens: tokens.count(hyperlink))
  df['hashtag_count'] = token_lists.apply(lambda tokens: sum(1 for token in tokens if token.startswith('#')))
  df["word_count"] = token_lists.apply(lambda tokens: len(tokens))

  # Same removal sequence as before, including the second pass over '<mention>'.
  for placeholder in ('<mention>', '<hyperlink>', '<mention>'):
    df['content'] = df['content'].str.replace(placeholder, '')

  def _drop_hashtags(text):
    kept = [word for word in text.split() if not word.startswith('#')]
    return ' '.join(kept)

  df['content'] = df['content'].apply(_drop_hashtags)
  return df

def get_media_urls(df):
  """Parse the textual 'media' column into URL / duration / view-count columns.

  Adds, in place:
    * 'image_url'     - first ``fullUrl='...'`` value, NaN when absent
    * 'video_url'     - the second ``url='...'`` value when the entry mentions
                        'variants' and has more than one such URL, else None
    * 'thumbnail_url' - first ``thumbnailUrl='...'`` value, NaN when absent
    * 'duration'      - float after ``duration=``, None/NaN when not numeric
    * 'views'         - int after ``views=``, None/NaN when not numeric

  Entries that are not strings (e.g. NaN) produce missing values instead of
  raising. Returns the same frame.
  """
  import re  # function-local, as in the original cell

  variant_url_re = re.compile(r"url='(.*?)'")
  duration_re = re.compile(r"duration=([\d.]+)")
  views_re = re.compile(r"views=(\d+)")

  def _second_variant_url(media):
    if not isinstance(media, str) or 'variants' not in media:
      return None
    urls = variant_url_re.findall(media)
    return urls[1] if len(urls) > 1 else None

  def _duration(media):
    # Match on the numeric pattern itself: "duration=None" contains the word
    # 'duration' but has no number to parse.
    match = duration_re.search(media) if isinstance(media, str) else None
    return float(match.group(1)) if match else None

  def _views(media):
    match = views_re.search(media) if isinstance(media, str) else None
    return int(match.group(1)) if match else None

  # expand=False yields a Series for the single capture group.
  df['image_url'] = df['media'].str.extract(r"fullUrl='(.*?)'", expand=False)
  df['video_url'] = df['media'].apply(_second_variant_url)
  df['thumbnail_url'] = df['media'].str.extract(r"thumbnailUrl='(.*?)'", expand=False)
  df['duration'] = df['media'].apply(_duration)
  df['views'] = df['media'].apply(_views)

  return df
def get_url(row):
  """Return the row's 'image_url' when it is a string, else its 'thumbnail_url'."""
  image_url = row['image_url']
  return image_url if isinstance(image_url, str) else row['thumbnail_url']

def infcorrection(df):
  """Expose 'inferred company' under the identifier-friendly name 'inferred_company'.

  The new column is added to ``df`` in place, and the returned frame no longer
  carries the space-containing column (previously the result of ``drop`` was
  thrown away, so the old column was never removed). If the old column is
  already gone, e.g. because the function was applied before, the frame is
  returned unchanged.
  """
  old_name, new_name = 'inferred company', 'inferred_company'
  if old_name not in df.columns:
    return df
  df[new_name] = df[old_name]
  return df.drop(columns=[old_name])

import math
def _scale_by_max(values):
  """Replace missing entries with 0 and divide by the column maximum.

  When the maximum is 0 or undefined (empty / all-missing column) the
  zero-filled values are returned unscaled instead of producing NaN or inf.
  """
  filled = pd.to_numeric(values, errors='coerce').fillna(0).astype(float)
  peak = filled.max()
  if pd.isna(peak) or peak == 0:
    return filled
  return filled / peak


def get_views_duration(df):
  """Normalise the 'views' and 'duration' columns by their maximum.

  Missing values become 0. Durations keep their fractional part; they were
  previously truncated to whole numbers before being divided by the
  untruncated maximum, so the longest clip did not map to exactly 1.
  The frame is modified in place and returned.
  """
  df['views'] = _scale_by_max(df['views'])
  df['duration'] = _scale_by_max(df['duration'])
  return df

from datetime import datetime

# Assuming you have a DataFrame named df with a 'datetime' column
# Replace 'your_data.csv' with the actual file path or URL of your data
# df = pd.read_csv('your_data.csv')

# Convert 'datetime' column to datetime format if not already


# Function to create sine and cosine embeddings for time and date
def create_time_embeddings(df, column_name):
    """Add sine/cosine encodings of time-of-day, elapsed years and weekday.

    Parameters
    ----------
    df : DataFrame
        Must contain ``column_name`` as a datetime column. If a 'day' column
        (weekday number) exists it is used for the weekly encoding; otherwise
        the weekday is derived from ``column_name`` so the function can be
        called on its own.
    column_name : str
        Name of the datetime column.

    Returns
    -------
    DataFrame
        The same frame with 'day_sin', 'day_cos', 'year_sin', 'year_cos',
        'week_sin' and 'week_cos' added in place.

    Notes
    -----
    The yearly phase is measured from the earliest timestamp in ``df``, not
    from 1 January, so two frames with different earliest timestamps get
    different phases.
    """
    seconds_in_day = 24 * 60 * 60
    seconds_in_year = 365.25 * 24 * 60 * 60

    timestamps = df[column_name]

    # Fractions of a day / of a year, used as phase for the encodings
    day_fraction = (timestamps.dt.hour * 3600 + timestamps.dt.minute * 60 + timestamps.dt.second) / seconds_in_day
    years_elapsed = (timestamps - timestamps.min()).dt.total_seconds() / seconds_in_year

    # Prefer the precomputed weekday column; fall back to the timestamps.
    weekday = df['day'] if 'day' in df.columns else timestamps.dt.weekday

    df['day_sin'] = np.sin(2 * np.pi * day_fraction)
    df['day_cos'] = np.cos(2 * np.pi * day_fraction)
    df['year_sin'] = np.sin(2 * np.pi * years_elapsed)
    df['year_cos'] = np.cos(2 * np.pi * years_elapsed)

    df['week_sin'] = np.sin((weekday * 2 * np.pi) / 7)
    df['week_cos'] = np.cos((weekday * 2 * np.pi) / 7)
    return df

# Apply the function to create embeddings for 'datetime' column
# df = create_time_embeddings(df, 'datetime')

# from datetime import datetime
def uptime(date, ref='2023-11-21 00:00:00'):
  """Seconds elapsed between ``date`` and a reference moment.

  Parameters
  ----------
  date : object with a ``timestamp()`` method (e.g. pandas Timestamp)
      The posting time.
  ref : str or datetime-like, optional
      Reference moment, parsed with ``pd.to_datetime``. Defaults to the
      original fixed value '2023-11-21 00:00:00'.

  Returns
  -------
  float
      ``ref - date`` in seconds; negative when ``date`` is after ``ref``.
  """
  reference = pd.to_datetime(ref)
  return reference.timestamp() - date.timestamp()
def get_uptime(df):
  """Add an 'uptime' column by applying ``uptime`` to each 'datetime' value.

  The frame is modified in place and returned.
  """
  elapsed = df['datetime'].apply(uptime)
  df['uptime'] = elapsed
  return df

def normalize_uptime(df):
  """Scale the 'uptime' column by its maximum, with missing values set to 0.

  When the maximum is 0 or undefined (empty / all-missing column) the
  zero-filled values are stored unscaled rather than NaN or inf.
  The frame is modified in place and returned.
  """
  seconds = pd.to_numeric(df['uptime'], errors='coerce').fillna(0).astype(float)
  peak = seconds.max()
  if not (pd.isna(peak) or peak == 0):
    seconds = seconds / peak
  df['uptime'] = seconds
  return df

def getweekday(date):
  """Return ``date.weekday()`` for the given date-like object."""
  weekday_number = date.weekday()
  return weekday_number

def get_weekday_df(df):
  """Parse 'date' into a 'datetime' column and add its weekday as 'day'.

  The frame is modified in place and returned.
  """
  parsed = pd.to_datetime(df['date'])
  df['datetime'] = parsed
  df['day'] = df['datetime'].apply(getweekday)
  return df

In [None]:
def master_eda(df):
  """Run the full preprocessing pipeline and return the modelling columns.

  Steps, in order: drop rows without 'content', compute token counts and clean
  the text, strip emoji, parse the media column, pick one image URL per row,
  rename the company column, normalise views/duration, derive datetime and
  weekday, add cyclical time encodings, and compute and normalise uptime.

  Parameters
  ----------
  df : DataFrame
      Raw frame as loaded from the spreadsheet.

  Returns
  -------
  DataFrame
      A frame restricted to the columns used downstream.
  """
  # The helpers below assign new columns on the frame they receive. Working on
  # an explicit copy of the filtered rows avoids pandas' SettingWithCopyWarning.
  df = df.dropna(subset=['content']).copy()
  df = get_counts(df)
  df['content'] = df['content'].apply(remove_emojis)
  df = get_media_urls(df)
  df['url'] = df.apply(get_url,axis = 1)
  df = infcorrection(df)
  df = get_views_duration(df)
  # get_weekday_df must precede the time embeddings: it creates 'datetime'.
  df = get_weekday_df(df)
  df = create_time_embeddings(df,'datetime')
  df = get_uptime(df)
  df = normalize_uptime(df)
  return df[['id','likes', 'content', 'username', 'mention_count', 'hyperlink_count', 'hashtag_count',
       'word_count', 'duration',
       'views', 'url', 'inferred_company', 'day_sin', 'day_cos',
       'year_sin', 'year_cos', 'uptime', 'week_sin', 'week_cos']]







In [None]:
# Replace the raw frame with its preprocessed version.
# NOTE: `df` is rebound here, so this cell cannot be re-run without first
# re-running the cell that loads the spreadsheet (master_eda reads raw columns
# such as 'media' and 'date' that its result no longer contains).
df = master_eda(df)

In [None]:
# Preview the first preprocessed rows.
df.head()

Unnamed: 0,id,likes,content,username,mention_count,hyperlink_count,hashtag_count,word_count,duration,views,url,inferred_company,day_sin,day_cos,year_sin,year_cos,uptime,week_sin,week_cos
0,1,1,"Spend your weekend morning with a Ham, Egg, an...",TimHortonsPH,0,1,1,29,0.0,0.0,https://pbs.twimg.com/media/Eo8N3JLVoAAlDJT?fo...,tim hortons,0.203642,0.979045,-0.332817,0.942992,0.499521,-0.974928,-0.222521
1,2,2750,Watch rapper freestyle for over an HOUR,IndyMusic,1,2,0,10,0.0,0.0,https://pbs.twimg.com/media/Dg7mW-VX0AE_hMn?fo...,independent,0.483537,-0.875324,0.03803,-0.999277,0.916086,-0.974928,-0.222521
2,3,57,Canadian Armenian community demands ban on mil...,CBCCanada,0,2,0,14,0.0,0.0,https://pbs.twimg.com/media/EjG2s4aXgAMNM1o?fo...,cbc,-0.892061,0.451916,-0.999612,-0.027844,0.533571,0.781831,0.62349
3,4,152,"1st in Europe to be devastated by COVID-19, It...",MKWilliamsRome,0,2,0,22,0.0,0.0,https://pbs.twimg.com/media/EjPaVniX0AAaWLJ?fo...,williams,0.086504,-0.996252,-1.0,0.000735,0.532799,0.433884,-0.900969
4,5,41,Congratulations to Pauletha Butts of ! She was...,BGISD,0,1,1,26,0.0,0.0,https://pbs.twimg.com/media/Dp4L0cSUcAAh9JG?fo...,independent,-0.611412,-0.791312,-0.954202,0.299164,0.864372,-0.433884,-0.900969


Normalize the data

#Importing the encoder for text embeddings

In [None]:
# Load the Universal Sentence Encoder from TensorFlow Hub.
# `embed` is kept at module level: RegressionGenerator.__getitem__ calls it to
# turn each batch of post texts into embeddings.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

#Creating a Regression Generator

In [None]:
import random, glob, os
import numpy as np
from tqdm.notebook import tqdm
import typing
# `Sequence` lives in different modules depending on the installed Keras
# version. Catch only ImportError so that unrelated failures are not hidden.
try:
    from keras.utils import Sequence #   sequence =  keras.utils.Sequence
except ImportError:
    from keras.utils.all_utils import Sequence

# ==================================================
class RegressionGenerator(Sequence):
    '''
    Dataset generator for Regression.

    Each item is ``([text_embeddings, images, numeric_features], targets)``:
      * text_embeddings  - encoder output for the batch's 'content' strings
      * images           - images downloaded from 'url', resized to
                           (height, width) and scaled to [0, 1]
      * numeric_features - the ``OTHER_FEATURES`` columns as float32
      * targets          - the matching slice of ``df_Y`` as a float array

    Rows whose image cannot be fetched or decoded get an all-zero image, so a
    single bad URL does not abort training.
    '''
    # Numeric columns packed, in this order, into the third model input.
    OTHER_FEATURES = ['mention_count', 'hashtag_count', 'day_sin', 'day_cos',
                      'week_sin', 'week_cos', 'year_sin', 'year_cos', 'uptime']
    # Seconds to wait for an image download before using a blank image.
    REQUEST_TIMEOUT = 10

    def __init__(self, df_X : pd.DataFrame, df_Y : pd.DataFrame, width : int = 224, height : int = 224,
                 channels : int = 3, embedding_dim : int = 512, batch_size : int = 32,
                #  encoder : Model = embed,
                 features : typing.List = ['content', 'mention_count', 'hashtag_count', 'url','day_sin',
       'day_cos', 'week_sin', 'week_cos', 'year_sin', 'year_cos','uptime'],
                 encoder = None) :
        """
        It is expected that the dataframe has already been split; the same
        generator class serves training and validation.

        Parameters
        ----------
        df_X : feature frame; must contain every name in ``features``.
        df_Y : targets aligned row-for-row with ``df_X``.
        width, height, channels : shape of the image input.
        embedding_dim : size of the text embedding (stored, not used here).
        batch_size : rows per batch.
        features : columns of ``df_X`` to keep.
        encoder : optional callable mapping a list of strings to embeddings;
            when omitted, the module-level ``embed`` is used at batch time.
        """
        super().__init__()
        self.width, self.height, self.channels = width, height, channels  # Adjust as needed
        self.embedding_dim = embedding_dim  # Adjust as needed
        self.features = list(features)  # private copy: never alias the shared default list
        self.x_df = df_X[self.features]
        self.y_df = df_Y
        self.batch_size = batch_size
        self.encoder = encoder
    # ==================================
    def __getitem__(self, index):
        """Build batch number ``index`` (rows selected by position)."""
        start = index * self.batch_size
        stop = start + self.batch_size
        batch_x = self.x_df.iloc[start:stop]
        batch_y = self.y_df.iloc[start:stop].to_numpy(dtype=float)

        texts = batch_x['content'].tolist()
        images = np.array([self.read_data(url) for url in batch_x['url']], dtype=np.float32)
        # Explicit float32: a mixed-dtype row would otherwise yield object arrays.
        others = batch_x[self.OTHER_FEATURES].to_numpy(dtype=np.float32)

        encoder = self.encoder if self.encoder is not None else embed
        contents = tf.convert_to_tensor(encoder(texts))
        images = tf.convert_to_tensor(images)
        others = tf.convert_to_tensor(others)

        return [contents, images, others ], batch_y


    def __len__(self):
      # Floor division: a trailing partial batch is not served.
      return len(self.x_df)//self.batch_size

    def read_data (self, url) :
        '''
        Return the np.array of image @url, resized to (height, width) and
        scaled to [0, 1]. An all-zero image of the same shape is returned when
        the URL is missing, the download fails or times out, the status is not
        200, or the payload cannot be decoded.
        '''
        blank = np.zeros((self.height, self.width, self.channels))

        # Rows without any media carry NaN/None instead of a URL.
        if not isinstance(url, str) or not url:
            return blank

        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return blank

        if response.status_code != 200 or not response.content:
            return blank

        # Convert the image content to a NumPy array
        image_array = np.asarray(bytearray(response.content), dtype=np.uint8)

        # Decode the NumPy array to an OpenCV image (None when not a valid image)
        # NOTE(review): OpenCV decodes to BGR channel order - confirm this is
        # what the image model expects.
        image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
        if image is None:
            return blank

        # Resize the image to the desired dimensions
        image = cv2.resize(image, (self.width, self.height))

        # Normalize pixel values to the range [0, 1]
        return image / 255.0



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state=42 fixes the shuffle.
# The whole frame (including 'likes') is passed as X; the feature columns are
# selected later when the generators are built.
X_train, X_test, y_train, y_test = train_test_split(df, df['likes'], test_size=0.2, random_state=42)

In [None]:
# Columns handed to the generators: the text, the image URL and the numeric features.
train_features = ['content', 'mention_count', 'hashtag_count', 'url', 'day_sin','day_cos', 'week_sin', 'week_cos', 'year_sin', 'year_cos','uptime']

In [None]:
# Batch generators for the training and held-out splits (default batch size and image size).
training_gen = RegressionGenerator(X_train[train_features], y_train)
val_gen = RegressionGenerator(X_test[train_features], y_test)

In [None]:
from keras.applications.vgg19 import VGG19, preprocess_input
from keras.preprocessing import image
from keras.models import Model
import numpy as np
import requests

# Load VGG19 model pre-trained on ImageNet data
base_model = VGG19(weights='imagenet')
# Freeze every layer so that only the layers added on top are trained.
for layer in base_model.layers:
    layer.trainable = False

# x = Dense(1024,activation = 'relu')(base_model.get_layer('fc2').output)
# Remove the last classification layer: the feature extractor ends at the 'fc2' layer.
vgg19 = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels.h5


In [None]:
# Print the layer table of the truncated VGG19 feature extractor.
vgg19.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [None]:
# def text_processor(embed_dim : int = 512, params = [(128, 'relu')]):
#   input_tensor = Input(shape = (embed_dim, ))
#   x = input_tensor
#   for dim, activ in params:
#     x = Dense(dim, activation = activ)(x)
#   output_tensor = x
#   model = tf.keras.Model(input_tensor, output_tensor)
#   return model


In [None]:
def regression_model(embed_dim : int = 512, img_dim : typing.Tuple = (224,224,3), params = [(256, 'relu'),(128,'relu'), (1, 'relu')]):
  """Assemble the three-input regression network.

  The output of the module-level ``vgg19`` feature extractor is concatenated
  with a text-embedding input of width ``embed_dim`` and a 9-wide numeric
  input, then passed through one Dense layer per ``(units, activation)`` pair
  in ``params``. Model inputs are ordered [text, image, numeric].

  ``img_dim`` is accepted but not used; the image shape comes from ``vgg19``.
  NOTE(review): the default stack ends in a 1-unit 'relu' layer - confirm a
  ReLU output is intended for the regression target.
  """
  image_branch = vgg19
  text_in = Input(shape = (embed_dim,))
  numeric_in = Input(shape = (9, ))
  hidden = tf.keras.layers.concatenate([image_branch.output, text_in, numeric_in])

  for units, activation in params:
    hidden = Dense(units, activation = activation)(hidden)

  model = tf.keras.Model([text_in, image_branch.input, numeric_in], hidden)
  print(image_branch.input)
  return model


In [None]:
# Build the regression network with its default layer stack.
reg  = regression_model()

KerasTensor(type_spec=TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name='input_1'), name='input_1', description="created by layer 'input_1'")


In [None]:
# Train with Adam (learning rate 5e-3) on mean squared error; mean absolute error is reported as a metric.
reg.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-3), loss='mean_squared_error', metrics=['mae'])

In [None]:
# Print the layer table of the combined regression model.
reg.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 block1_conv1 (Conv2D)       (None, 224, 224, 64)         1792      ['input_1[0][0]']             
                                                                                                  
 block1_conv2 (Conv2D)       (None, 224, 224, 64)         36928     ['block1_conv1[0][0]']        
                                                                                                  
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)         0         ['block1_conv2[0][0]']        
                                                                                            

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint_path = 'model_checkpoint.h5'  # Path to save the checkpoint
checkpoint = ModelCheckpoint(checkpoint_path,
                             monitor='val_loss',  # Monitor validation loss
                             save_best_only=False,  # False: save on every checkpoint, not only when val_loss improves
                             mode='min',  # Mode to determine best model (minimize loss)
                             verbose=1)  # Verbosity level

In [None]:
# Train for 10 epochs on the training generator, validating on val_gen and
# writing checkpoints through the callback defined above.
history = reg.fit(training_gen, validation_data = val_gen, epochs = 10,callbacks = [checkpoint])

Epoch 1/10
   3/7500 [..............................] - ETA: 66:55:13 - loss: 7090832.5000 - mae: 778.2292  

KeyboardInterrupt: ignored

In [None]:
# Reload the model saved by the checkpoint callback.
# NOTE(review): this absolute path only matches the relative 'model_checkpoint.h5'
# used by the callback when the working directory is /content, and the file
# presumably exists only once training has written a checkpoint - confirm.
model_path = '/content/model_checkpoint.h5'
model = tf.keras.models.load_model(model_path)

OSError: ignored

In [None]:
# Predict on the training inputs. The model expects [text, image, numeric]
# batches, which the generator supplies; `y_train` holds the labels and is not
# a valid model input.
prediction = model.predict(training_gen)