# Data Preparation

In [1]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import one_hot 
from keras.callbacks import EarlyStopping
from nltk.tokenize import word_tokenize
from keras.utils import pad_sequences
from keras_tuner import HyperModel
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from keras.models import Model
from numpy.random import seed
from tensorflow import keras
from keras import layers
import tensorflow as tf
import keras_tuner as kt
import pandas as pd 
import numpy as np
import datetime
import re


# Length every encoded sequence is padded to (used as `maxlen` in onehot_padding
# and handed to MyHyperModel as its max_sequence_length argument).
MAX_WORD_LENGTH = 23
# Seed passed to the TensorFlow/Keras and NumPy seeding calls further down.
SEED = 12

## Embedding layer prep

In [2]:
# Path of the input CSV, relative to the notebook's working directory.
ORIGINAL_DATA = "Merged_data.csv"

# Raw dataset. Downstream cells treat the last column as the target and read it
# by the name 'CTR'; 'Url' and 'Location' columns get dedicated handling.
df = pd.read_csv(ORIGINAL_DATA,delimiter=",")
def preprocess_word_t(word_list):
    """Tokenize a string after blanking out every non-letter character.

    Each character outside ``a-z`` / ``A-Z`` is replaced by a single space,
    and the cleaned text is then split with NLTK's ``word_tokenize``.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', word_list)
    tokens = word_tokenize(letters_only)
    return tokens
def uniq_val(clmn):
    """Return the sorted unique items found across all bags in ``clmn``.

    ``clmn`` is an iterable of iterables (e.g. a column of token lists); the
    bags are flattened and passed to ``np.unique``.
    """
    flattened = []
    for bag in clmn:
        flattened.extend(bag)
    return np.unique(flattened)

# Fix the random state up front so repeated runs of the notebook are comparable.
tf.keras.utils.set_random_seed(SEED)
# `seed` here is numpy.random.seed (imported at the top of the notebook).
seed(SEED)



### Feature extraction


In [3]:
def num_stripper(txt):
    """Replace every character that is not an ASCII letter with a space.

    Despite the name, digits, punctuation and whitespace are all affected;
    the length of the string is preserved.
    """
    non_letter = re.compile(r'[^a-zA-Z]')
    return non_letter.sub(' ', txt)


def pre_processing (clmn):
    """Tokenize a text column, drop English stop words and index its vocabulary.

    Parameters
    ----------
    clmn : pandas.Series
        Column of strings; each value is cleaned with ``num_stripper`` and
        split with NLTK's ``word_tokenize``.

    Returns
    -------
    list
        ``[procc_str, inv_dict]`` where ``procc_str`` is the column of token
        lists (stop words removed, original casing kept) and ``inv_dict`` maps
        each distinct token to an integer index starting at 1, assigned in the
        sorted order produced by ``np.unique``.
    """
    # Load the stop-word list once and keep it as a set: the original looked it
    # up again for every single token and then scanned it linearly.
    stop_words = set(stopwords.words('english'))

    def _tokens_without_stop_words(txt):
        # The comparison is case-insensitive, but the kept token is unchanged.
        return [item for item in word_tokenize(num_stripper(txt)) if item.lower() not in stop_words]

    procc_str = clmn.apply(_tokens_without_stop_words)
    # Sorted array of every distinct token in the column (the vocabulary).
    vocabulary = np.unique([wrd for bag_wrd in procc_str for wrd in bag_wrd])
    # Token -> index lookup; indices start at 1.
    inv_dict = dict(zip(vocabulary, range(1, vocabulary.size + 1)))
    return [procc_str, inv_dict]

def onehot_padding(data,invs_dict):
    """Integer-encode every column of ``data`` and pad the rows to a fixed length.

    ``invs_dict[column][item]`` supplies the integer for each item. For the
    ``'Url'`` column each cell is looked up as a whole and wrapped in a
    one-element list; for every other column each cell is iterated and its
    items are looked up individually.

    Returns a list with one padded array per column (in column order), each
    padded at the end to ``MAX_WORD_LENGTH`` entries.
    """
    def _encode_column(col_name):
        rows = data[col_name]
        if col_name == 'Url':
            # URLs are kept as single keys, so each row is a one-item sequence.
            return [[invs_dict[col_name][url]] for url in rows]
        return [[invs_dict[col_name][item] for item in row] for row in rows]

    # Encode all columns first, then pad them, mirroring the two passes of the
    # original implementation.
    encoded_columns = [_encode_column(col_name) for col_name in data]
    return [
        pad_sequences(encoded, maxlen=MAX_WORD_LENGTH, padding='post')
        for encoded in encoded_columns
    ]
    

#### Removing stop words and counting embedding dimension

In [4]:
def df_conversion (df):
    """Convert the feature columns of ``df`` to token lists and build their vocabularies.

    Every column except the last one is processed. ``df`` is modified in place
    and also returned.

    * ``'Url'``: values are left untouched; each distinct value gets an index.
    * ``'Location'``: each value is split on commas (a trailing comma is not a
      split point) into a list; distinct items are indexed in sorted order.
    * any other column: handled by ``pre_processing``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column is left alone (the caller treats it as the target).

    Returns
    -------
    tuple
        ``(df, w_dict)`` where ``w_dict[column]`` maps each distinct item of
        that column to an integer index starting at 1.
    """
    w_dict = {}
    for clmn in df.columns[:-1]:
        if clmn not in ['Url' ,'Location' ]:
            df[clmn],w_dict[clmn] = pre_processing (df[clmn])
        elif clmn == 'Location':
            locl_list = [re.split(r',(?=.)', item)  for item in df[clmn]]
            df[clmn] = locl_list
            # Sort the distinct items: plain set iteration order for strings
            # changes between interpreter runs, which would silently reshuffle
            # the indices every time the kernel is restarted.
            unique_items = sorted(set(cnty for county_lst in locl_list for cnty in county_lst))
            w_dict[clmn] = dict(zip(unique_items, range(1,len(unique_items)+1)))
        else:
            url_uniq = df[clmn].unique()
            w_dict[clmn] = dict(zip(url_uniq, range(1,len(url_uniq)+1)))
    return df , w_dict


In [5]:
def data_splitting(df, test_size=0.15, val_size=0.25, random_state=42):
    """Split ``df`` into train, validation and test features/targets.

    Features are every column except the last; the target is the ``'CTR'``
    column. The test set is carved out first, then the validation set is taken
    from what remains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the features and a ``'CTR'`` column.
    test_size : float, default 0.15
        Share of all rows held out for testing.
    val_size : float, default 0.25
        Share of the remaining (non-test) rows held out for validation.
    random_state : int, default 42
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_val, y_train, y_test)``.
        ``y_train`` appears twice; the 7-item shape is kept because existing
        callers unpack exactly seven values.
    """
    X = df.iloc[: ,:-1]
    # Target
    y = df['CTR']
    # First cut: hold out the test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    # Second cut: take the validation rows out of the remaining training rows.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state)
    # Report the size of each split.
    print(f'The training dataset shape {len(X_train)} and validation shape {len(X_val)} .')
    print(f'The testing dataset has {len(X_test)} records.')
    return X_train, X_test ,X_val, y_train, y_val ,y_train, y_test

In [6]:
# Convert the feature columns and collect the per-column vocabularies.
# NOTE: df_conversion modifies `df` in place, so this cell cannot simply be
# re-run without reloading the CSV first.
df ,wrd_dict =  df_conversion(df)
X = df.iloc[: ,:-1]
# Target
y = df['CTR']

# Train / validation / test split. data_splitting returns y_train twice, hence
# the repeated name in the unpacking.
X_train, X_test,X_val,y_train, y_val ,y_train, y_test = data_splitting(df)
# Report rows and columns separately (the original printed the whole shape
# tuple in both places).
print(f'The training dataset has {X_train.shape[0]} records and {X_train.shape[1]} columns.')
print(f'The testing dataset has {len(X_test)} records.')


The training dataset shape 25 and validation shape 9 .
The testing dataset has 6 records.
The training dataset has (25, 13) records and (25, 13) columns.
The testing dataset has 6 records.


#### One hot encoding and padding

In [7]:
# Replace the train/validation frames with their encoded form: a list holding
# one padded integer array per feature column.
# NOTE: the names are rebound, so re-running this cell requires re-running the
# split cell first. X_test is not encoded in the cells visible here.
X_train = onehot_padding(X_train,wrd_dict)
X_val = onehot_padding(X_val,wrd_dict)


## Model making

### Model Tuning

In [8]:
class MyHyperModel(HyperModel):
    """KerasTuner hypermodel with one embedding branch per feature column.

    Each branch is Input -> Embedding -> Dropout. The branches are concatenated,
    flattened and fed through a tunable stack of Dense/Dropout layers, a batch
    normalisation layer and a single linear output unit. The model is compiled
    with Adam and mean-squared-error loss, reporting MAE.

    Parameters
    ----------
    max_vocab_size : dict
        Maps a feature name to its vocabulary size. Iterated in order and
        paired positionally with ``num_inputs``.
    max_sequence_length : int
        Stored on the instance; ``build`` does not read it (the input length is
        taken from the data in ``num_inputs``).
    num_inputs : sequence of array-like
        One encoded array per feature; only the size of its last axis is used,
        as the input length of that branch.
    """

    def __init__(self, max_vocab_size, max_sequence_length, num_inputs):
        # Initialise the KerasTuner base class before adding our own state.
        super().__init__()
        self.max_vocab_size = max_vocab_size
        self.max_sequence_length = max_sequence_length
        self.num_inputs = num_inputs

    def build(self, hp):
        """Build and compile one candidate model from the hyperparameters in ``hp``.

        Tuned values: the embedding dropout rate (shared by all branches), the
        number of hidden layers (1-3), the width of each hidden layer, the
        hidden dropout rate (shared by all hidden layers) and the Adam
        learning rate.
        """
        inputs = []
        branches = []
        # One embedding branch per feature column.
        for name, column in zip(self.max_vocab_size, self.num_inputs):
            vocab_size = self.max_vocab_size[name]
            # Half the vocabulary size, capped at 45 dimensions.
            embed_dim = int(min(np.ceil(vocab_size / 2), 45))
            # Pass the shape as a tuple; a bare int is not accepted by every
            # Keras version.
            seq_len = int(np.array(column).shape[-1])
            inp = layers.Input(shape=(seq_len,))
            # +1 because vocabulary indices start at 1.
            out = layers.Embedding(vocab_size + 1, embed_dim)(inp)
            out = layers.Dropout(hp.Float("intial drop", min_value=0.1, max_value=0.9))(out)
            inputs.append(inp)
            branches.append(out)

        # Concatenate the embedding branches and flatten them for the dense stack.
        x = layers.Concatenate()(branches)
        x = layers.Flatten()(x)

        for i in range(hp.Int('num_of_layers', 1, 3)):
            # Width of each hidden layer is tuned independently.
            x = layers.Dense(
                units=hp.Int("units_" + str(i), min_value=32, max_value=512, step=16),
                activation='relu')(x)
            x = layers.Dropout(hp.Float("layer dropping", min_value=0.1, max_value=0.9))(x)
        x = layers.BatchNormalization()(x)

        # Single linear unit: the model predicts one continuous value.
        y = layers.Dense(1, activation="linear")(x)
        model = Model(inputs=inputs, outputs=y)

        # Use the sampled learning rate; the original sampled it but then
        # compiled with a fixed value, so the hyperparameter had no effect.
        hp_learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')

        model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                      loss='mean_squared_error',
                      metrics=['mae'])

        return model

### Early stopper

In [9]:
# Stop a trial once the validation loss has gone 50 epochs without improving.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)
# Timestamped TensorBoard log directory so separate runs do not overwrite each other.
log_dir = "logs/fit/dropout_rate" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# NOTE(review): the tuner search call visible in this notebook passes only
# `stop_early`, so this callback is currently unused - confirm whether that is intended.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

### Model builder

In [10]:
# Vocabulary size of every feature column, keyed by column name.
embedding_sizes = {name: len(vocab) for name, vocab in wrd_dict.items()}
# Hypermodel that builds one embedding branch per encoded training array.
mode_build = MyHyperModel(
    embedding_sizes,
    MAX_WORD_LENGTH,
    X_train,
)

### RandomSearch

In [11]:
# Random search over the hypermodel's search space, ranked by validation loss.
# The seed is passed explicitly so the sampled trials do not depend on how much
# global random state was consumed before this cell.
# overwrite=True discards results left over from a previous run of the same project.
rand_tuner = kt.RandomSearch(
    mode_build,
    objective='val_loss',
    max_trials=20,
    seed=SEED,
    overwrite=True)

In [None]:
# Run the search; each trial trains for up to 565 epochs unless early stopping
# ends it sooner.
rand_tuner.search(
    X_train,
    y_train,
    epochs=565,
    validation_data=(X_val, y_val),
    callbacks=[stop_early],
    batch_size=32,
)


## Result printing

In [None]:
# results_summary() prints the report itself and returns None, so embedding it
# in an f-string printed the report first and then "Random search resultsNone".
print('Random search results')
rand_tuner.results_summary()