In [1]:
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag, word_tokenize
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
import pickle
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM,
    GRU, Dense, Dropout, Concatenate
)
from tensorflow.keras.optimizers import Adam
import time
import os

In [2]:
# Fetch the NLTK resources needed by the preprocessing helpers defined below
# (word_tokenize, pos_tag, WordNetLemmatizer and the stop-word list).
# quiet=True keeps the download log out of the cell output; the value of the
# last call is what the notebook displays for this cell.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

True

In [None]:
def nltk_pos_to_wordnet(tag):
    """Map a POS tag string onto the corresponding WordNet POS constant.

    Only the leading letter of the tag is inspected: 'J' -> wordnet.ADJ,
    'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.
    Returns None for every other tag.
    """
    # (tag prefix, attribute name on `wordnet`) pairs, checked in order.
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None

# Lazily filled cache so the stop-word list and the lemmatizer are created once,
# not on every call (clean_and_lemmatize runs twice per page).
_LEMMATIZATION_RESOURCES = {}


def _lemmatization_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not _LEMMATIZATION_RESOURCES:
        _LEMMATIZATION_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _LEMMATIZATION_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _LEMMATIZATION_RESOURCES['stop_words'], _LEMMATIZATION_RESOURCES['lemmatizer']


def clean_and_lemmatize(text):
    """Normalise raw text into a space-separated string of lemmas.

    Steps: lower-case the text; blank out URLs, @mentions, #hashtags and every
    character that is not a-z or whitespace; tokenize; drop English stop words
    and tokens of two characters or fewer; POS-tag the remainder and lemmatize
    each token with its WordNet POS when one is available.

    Args:
        text: The raw string to clean.

    Returns:
        The lemmas joined by single spaces, or "" when nothing survives cleaning.
    """
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|@\S+|#\S+|[^a-z\s]', ' ', text)

    # Nothing but whitespace left: skip the tokenizer/tagger entirely.
    if not text.strip():
        return ""

    stop_words, lemmatizer = _lemmatization_resources()
    tokens = [w for w in word_tokenize(text) if w not in stop_words and len(w) > 2]

    lemmatized = []
    for word, tag in pos_tag(tokens):
        wn_tag = nltk_pos_to_wordnet(tag)
        # Without a WordNet POS, fall back to the lemmatizer's default POS.
        lemma = lemmatizer.lemmatize(word, pos=wn_tag) if wn_tag else lemmatizer.lemmatize(word)
        lemmatized.append(lemma)

    return " ".join(lemmatized)

def word_parse(soup):
    """Extract the title and body text from a parsed page and clean both.

    The text of the first <title> and first <body> element found via
    soup.find is taken (empty string when the element is missing) and passed
    through clean_and_lemmatize.

    Returns:
        A (parsed_title, parsed_content) tuple of cleaned strings.
    """
    def element_text(tag_name):
        # Text of the first matching element, pieces joined by single spaces.
        element = soup.find(tag_name)
        if element:
            return element.get_text(separator=' ', strip=True)
        return ""

    raw_title = element_text('title')
    raw_body = element_text('body')

    return clean_and_lemmatize(raw_title), clean_and_lemmatize(raw_body)

In [4]:
def decorate_message(message : str):
    """Print `message` framed above and below by a row of asterisks of the same length."""
    border = '*' * len(message)
    print(border, message, border, sep='\n')

In [5]:
dom_folder = './doms/'
output_folder = './parse/'

# Create the cache directory up front; otherwise the write at the end of the loop
# fails with FileNotFoundError only after a whole file has been parsed.
os.makedirs(output_folder, exist_ok=True)

for i in range(8):
    input_filename = f'dom_data{i}.pkl'
    output_filename = f'parse{i}.pkl'
    output_path = output_folder+output_filename

    # An existing output file marks this input as already processed.
    if os.path.exists(output_path):
        print(f"{output_filename} already computed")
        continue

    decorate_message(f"Parsing and Lemmatization for: {input_filename}")

    beginning = start = time.time()
    print(f"Reading from {input_filename}")
    # NOTE: pickle.load can execute arbitrary code; only read files produced by a trusted step.
    with open(dom_folder+input_filename, 'rb') as dom_file:
        dom = pickle.load(dom_file)
    print(f"Time taken: {time.time() - start:.2f}s")

    # Shallow copy: the row lists are shared with `dom` and are edited in place below.
    output = dom[:]
    total_data = len(output)
    start = time.time()

    for index in range(total_data):
        parsed_title, parsed_content = word_parse(output[index][2])

        # Move the value at position 3 to the end of the row, then store the
        # cleaned title at position 2 and the cleaned body text at position 3.
        output[index].append(output[index][3])
        output[index][2] = parsed_title
        output[index][3] = parsed_content

        # Report every 1000 rows, or whenever more than 30s passed since the last report.
        end = time.time()
        if (index%1000 == 0 and index) or end - start > 30:
            print(f"Current Progress: {index}/{total_data}, Time taken: {end - start:.2f}")
            start = time.time()

    print(f"Writing to {output_filename}")
    start = time.time()
    # Write under a temporary name and rename once complete, so an interrupted run
    # never leaves a truncated file that the existence check above would accept.
    temp_path = output_path + '.tmp'
    with open(temp_path, 'wb') as output_file:
        pickle.dump(output, output_file)
    os.replace(temp_path, output_path)
    print(f"Time taken: {time.time() - start:.2f}")
    print(f"Total time taken: {time.time() - beginning:.2f}")
    

parse0.pkl already computed
parse1.pkl already computed
parse2.pkl already computed
parse3.pkl already computed
parse4.pkl already computed
parse5.pkl already computed
parse6.pkl already computed
parse7.pkl already computed


In [6]:
# Collect the cleaned title (position 2) and cleaned content (position 3) of every
# row from the eight cached parse files, preserving file and row order.
page_titles, page_contents = [], []

for i in range(8):
    with open(f'./parse/parse{i}.pkl', 'rb') as parse_file:
        parse_list = pickle.load(parse_file)

    for item in parse_list:
        page_titles += [item[2]]
        page_contents += [item[3]]


In [None]:
def prepare_inputs(page_titles, page_contents, max_title_len=10, max_content_len=100, vocab_size=10000):
    """Tokenize titles and contents with one shared tokenizer and pad them.

    A single Tokenizer (num_words=vocab_size, out-of-vocabulary token "<OOV>")
    is fitted on the titles followed by the contents.  Each collection is then
    converted to integer sequences and padded/truncated at the end to its
    fixed length.

    Returns:
        (tokenizer, title_pad, content_pad)
    """
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(page_titles + page_contents)

    def to_fixed_length(texts, length):
        # Integer-encode, then pad/truncate on the right to `length`.
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=length, padding='post', truncating='post')

    title_pad = to_fixed_length(page_titles, max_title_len)
    content_pad = to_fixed_length(page_contents, max_content_len)

    return tokenizer, title_pad, content_pad

In [8]:
# Fit the shared tokenizer and build the padded title/content arrays using the
# defaults of prepare_inputs (title length 10, content length 100, vocab 10000).
tokenizer, title_pad, content_pad = prepare_inputs(page_titles, page_contents)

In [9]:
input_data_path = './input_data/'

# Create the artifact directory if it is missing; on a fresh checkout the
# open(..., 'wb') below would otherwise raise FileNotFoundError.
os.makedirs(input_data_path, exist_ok=True)

# Persist the fitted tokenizer once; an existing file is left untouched.
if not os.path.exists(input_data_path+'tokenizer.pkl'):
    with open(input_data_path+'tokenizer.pkl', 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

In [10]:
def load_glove_embeddings(glove_path="glove.6B.100d.txt"):
    """Read a whitespace-separated embedding text file into a dictionary.

    Each non-blank line is split on whitespace: the first field is the word and
    the remaining fields are parsed as float32 coefficients.

    Args:
        glove_path: Path of the UTF-8 encoded embedding file.

    Returns:
        dict mapping each word to a 1-D float32 numpy array.
    """
    embeddings_index = {}
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at end of file): indexing
                # values[0] would raise IndexError, so skip it.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print(f"Loaded {len(embeddings_index):,} word vectors from GloVe.")
    return embeddings_index

def build_embedding_matrix(tokenizer, embeddings_index, embedding_dim=100):
    """Assemble the embedding weight matrix for the tokenizer's vocabulary.

    The matrix has one row per entry of tokenizer.word_index plus one extra
    row (index 0).  Row i holds the vector of the word whose index is i when
    that word is present in embeddings_index; all other rows stay zero.

    Returns:
        numpy array of shape (len(word_index) + 1, embedding_dim).
    """
    vocabulary = tokenizer.word_index
    matrix = np.zeros((len(vocabulary) + 1, embedding_dim))

    for token, row in vocabulary.items():
        vector = embeddings_index.get(token)
        if vector is None:
            # No pretrained vector for this word: keep the zero row.
            continue
        matrix[row] = vector

    print("Embedding matrix shape:", matrix.shape)
    return matrix

In [11]:
# Load the pretrained vectors and align them with the tokenizer's vocabulary.
glove_index = load_glove_embeddings(glove_path="glove.6B.100d.txt")
embedding_matrix = build_embedding_matrix(
    tokenizer,
    glove_index,
    embedding_dim=100,
)

# Persist the matrix once; an existing file is left untouched.
embedding_matrix_saved = os.path.exists(input_data_path+'embedding_matrix.pkl')
if not embedding_matrix_saved:
    with open(input_data_path+'embedding_matrix.pkl', 'wb') as embedding_matrix_file:
        pickle.dump(embedding_matrix, embedding_matrix_file)

Loaded 400,000 word vectors from GloVe.
Embedding matrix shape: (747931, 100)


In [12]:
# Reload every parsed row, in the same file order used to build the padded arrays.
input_data = []

for i in range(8):
    input_filename = f'parse{i}.pkl'

    with open('./parse/'+input_filename, 'rb') as parse_file:
        parse_list = pickle.load(parse_file)

    input_data += parse_list

# Replace the cleaned title/content strings with their padded integer sequences.
for index, row in enumerate(input_data):
    row[2] = title_pad[index]
    row[3] = content_pad[index]

# Persist the model-ready rows once; an existing file is left untouched.
input_data_saved = os.path.exists(input_data_path+'input_data.pkl')
if not input_data_saved:
    with open(input_data_path+'input_data.pkl', 'wb') as input_data_file:
        pickle.dump(input_data, input_data_file)

In [13]:
# Target vector: position 4 of every row, as float32.
labels = np.array([row[4] for row in input_data], dtype=np.float32)

In [14]:
def build_bilstm_model(vocab_size, embedding_dim, max_title_len, max_content_len, embedding_matrix):
    """Build and compile the two-input bidirectional-LSTM classifier.

    Title and content sequences go through one shared, non-trainable embedding
    layer initialised from embedding_matrix.  Each branch is encoded by its own
    bidirectional LSTM (64 units for titles, 128 for contents); the encodings
    are concatenated and fed through dropout -> Dense(128, relu) -> dropout ->
    Dense(1, sigmoid).

    Returns:
        A Model compiled with Adam(1e-3), binary cross-entropy and accuracy.
    """
    shared_embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=None,
        trainable=False,
        name="shared_embedding",
    )

    # Two named inputs; the names are the keys used when feeding data.
    title_input = Input(shape=(max_title_len,), name="title_input")
    content_input = Input(shape=(max_content_len,), name="content_input")

    # Same embedding weights for both branches.
    embedded_title = shared_embedding(title_input)
    embedded_content = shared_embedding(content_input)

    # One recurrent encoder per branch.
    title_encoding = Bidirectional(LSTM(64))(embedded_title)
    content_encoding = Bidirectional(LSTM(128))(embedded_content)

    # Classification head on the joined encodings.
    features = Concatenate()([title_encoding, content_encoding])
    features = Dropout(0.5)(features)
    features = Dense(128, activation='relu')(features)
    features = Dropout(0.5)(features)
    probability = Dense(1, activation='sigmoid')(features)

    classifier = Model(inputs=[title_input, content_input], outputs=probability)
    classifier.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return classifier

In [15]:
# One embedding row per word in the tokenizer's index, plus row 0.
vocab_size = len(tokenizer.word_index) + 1
model = build_bilstm_model(
    vocab_size=vocab_size, embedding_dim=100,
    max_title_len=10, max_content_len=100,
    embedding_matrix=embedding_matrix,
)

# Train only when no saved BiLSTM model exists yet.
bilstm_is_cached = os.path.exists('./model/BiLSTM.pkl')
if not bilstm_is_cached:
    model.fit(
        x={"title_input": title_pad, "content_input": content_pad},
        y=labels,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
    )

In [None]:
# Make sure the model directory exists before the first write; on a fresh checkout
# the dump below would otherwise raise FileNotFoundError and discard the training run.
os.makedirs('./model/', exist_ok=True)

if os.path.exists('./model/BiLSTM.pkl'):
    # Reuse the saved model instead of the one built in the previous cell.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('./model/BiLSTM.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
else:
    # First run: store the freshly trained model.
    with open('./model/BiLSTM.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

In [17]:
# Score every page with the current model (one sigmoid output per row).
predicted_labels = model.predict(
    x={"title_input": title_pad, "content_input": content_pad}
)

[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 7ms/step


In [18]:
# One row per page: URL (position 1), rounded model output, stored label (position 4).
predictions_df = pd.DataFrame(dict(
    url=[row[1] for row in input_data],
    predicted_label=[round(score[0]) for score in predicted_labels],
    actual_label=[row[4] for row in input_data],
))

predictions_df.to_csv("./model/content_bilstm_predictions.csv")

In [19]:
def create_shared_embedding(vocab_size, embedding_dim, embedding_matrix):
    """Create the non-trainable embedding layer, named "shared_embedding",
    initialised from embedding_matrix."""
    layer_options = dict(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=None,
        trainable=False,
        name="shared_embedding",
    )
    return Embedding(**layer_options)


In [20]:
def build_lstm_model(vocab_size, embedding_dim, max_title_len, max_content_len, embedding_matrix):
    """Build and compile the two-input unidirectional-LSTM classifier.

    Both inputs share the embedding from create_shared_embedding.  Titles are
    encoded by LSTM(128) and contents by LSTM(256); the encodings are
    concatenated and passed through dropout -> Dense(128, relu) -> dropout ->
    Dense(1, sigmoid).

    Returns:
        A Model compiled with Adam(1e-3), binary cross-entropy and accuracy.
    """
    shared_embedding = create_shared_embedding(vocab_size, embedding_dim, embedding_matrix)

    # Two named inputs; the names are the keys used when feeding data.
    title_input = Input(shape=(max_title_len,), name="title_input")
    content_input = Input(shape=(max_content_len,), name="content_input")

    embedded_title = shared_embedding(title_input)
    embedded_content = shared_embedding(content_input)

    # One recurrent encoder per branch.
    title_encoding = LSTM(128)(embedded_title)
    content_encoding = LSTM(256)(embedded_content)

    # Classification head on the joined encodings.
    features = Concatenate()([title_encoding, content_encoding])
    features = Dropout(0.5)(features)
    features = Dense(128, activation='relu')(features)
    features = Dropout(0.5)(features)
    probability = Dense(1, activation='sigmoid')(features)

    classifier = Model(inputs=[title_input, content_input], outputs=probability)
    classifier.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return classifier


In [21]:
def build_gru_model(vocab_size, embedding_dim, max_title_len, max_content_len, embedding_matrix):
    """Build and compile the two-input unidirectional-GRU classifier.

    Both inputs share the embedding from create_shared_embedding.  Titles are
    encoded by GRU(128) and contents by GRU(256); the encodings are
    concatenated and passed through dropout -> Dense(128, relu) -> dropout ->
    Dense(1, sigmoid).

    Returns:
        A Model compiled with Adam(1e-3), binary cross-entropy and accuracy.
    """
    shared_embedding = create_shared_embedding(vocab_size, embedding_dim, embedding_matrix)

    # Two named inputs; the names are the keys used when feeding data.
    title_input = Input(shape=(max_title_len,), name="title_input")
    content_input = Input(shape=(max_content_len,), name="content_input")

    embedded_title = shared_embedding(title_input)
    embedded_content = shared_embedding(content_input)

    # One recurrent encoder per branch.
    title_encoding = GRU(128)(embedded_title)
    content_encoding = GRU(256)(embedded_content)

    # Classification head on the joined encodings.
    features = Concatenate()([title_encoding, content_encoding])
    features = Dropout(0.5)(features)
    features = Dense(128, activation='relu')(features)
    features = Dropout(0.5)(features)
    probability = Dense(1, activation='sigmoid')(features)

    classifier = Model(inputs=[title_input, content_input], outputs=probability)
    classifier.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return classifier


In [22]:
def build_bigru_model(vocab_size, embedding_dim, max_title_len, max_content_len, embedding_matrix):
    """Build and compile the two-input bidirectional-GRU classifier.

    Both inputs share the embedding from create_shared_embedding.  Titles are
    encoded by Bidirectional(GRU(64)) and contents by Bidirectional(GRU(128));
    the encodings are concatenated and passed through dropout ->
    Dense(128, relu) -> dropout -> Dense(1, sigmoid).

    Returns:
        A Model compiled with Adam(1e-3), binary cross-entropy and accuracy.
    """
    shared_embedding = create_shared_embedding(vocab_size, embedding_dim, embedding_matrix)

    # Two named inputs; the names are the keys used when feeding data.
    title_input = Input(shape=(max_title_len,), name="title_input")
    content_input = Input(shape=(max_content_len,), name="content_input")

    embedded_title = shared_embedding(title_input)
    embedded_content = shared_embedding(content_input)

    # One recurrent encoder per branch.
    title_encoding = Bidirectional(GRU(64))(embedded_title)
    content_encoding = Bidirectional(GRU(128))(embedded_content)

    # Classification head on the joined encodings.
    features = Concatenate()([title_encoding, content_encoding])
    features = Dropout(0.5)(features)
    features = Dense(128, activation='relu')(features)
    features = Dropout(0.5)(features)
    probability = Dense(1, activation='sigmoid')(features)

    classifier = Model(inputs=[title_input, content_input], outputs=probability)
    classifier.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return classifier


In [23]:
model = build_lstm_model(
    vocab_size=vocab_size, embedding_dim=100,
    max_title_len=10, max_content_len=100,
    embedding_matrix=embedding_matrix,
)

# Train only when no saved LSTM model exists yet.
lstm_is_cached = os.path.exists('./model/LSTM.pkl')
if not lstm_is_cached:
    model.fit(
        x={"title_input": title_pad, "content_input": content_pad},
        y=labels,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
    )

Epoch 1/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 28ms/step - accuracy: 0.8785 - loss: 0.2971 - val_accuracy: 0.9188 - val_loss: 0.2081
Epoch 2/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 28ms/step - accuracy: 0.9245 - loss: 0.2010 - val_accuracy: 0.9371 - val_loss: 0.1724
Epoch 3/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 29ms/step - accuracy: 0.9374 - loss: 0.1677 - val_accuracy: 0.9371 - val_loss: 0.1636
Epoch 4/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 35ms/step - accuracy: 0.9460 - loss: 0.1463 - val_accuracy: 0.9430 - val_loss: 0.1588
Epoch 5/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 37ms/step - accuracy: 0.9515 - loss: 0.1312 - val_accuracy: 0.9445 - val_loss: 0.1564
Epoch 6/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 29ms/step - accuracy: 0.9569 - loss: 0.1166 - val_accuracy: 0.9464 - val_loss: 0.1535
Epoc

In [24]:
if os.path.exists('./model/LSTM.pkl'):
    # A saved model exists: use it instead of the one built in the previous cell.
    with open('./model/LSTM.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
else:
    # First run: store the freshly trained model.
    with open('./model/LSTM.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

In [25]:
# Score every page with the LSTM model before exporting. Without this call
# `predicted_labels` still holds the output of whichever model was predicted
# last, so the LSTM CSV would be filled with another model's predictions.
predicted_labels = model.predict({
    "title_input": title_pad,
    "content_input": content_pad
})

# One row per page: URL (position 1), rounded model output, stored label (position 4).
predictions_df = pd.DataFrame({
    "url":[data[1] for data in input_data],
    "predicted_label": [round(data[0]) for data in predicted_labels],
    "actual_label": [data[4] for data in input_data]
})

predictions_df.to_csv("./model/content_lstm_predictions.csv")

In [26]:
model = build_gru_model(
    vocab_size=vocab_size, embedding_dim=100,
    max_title_len=10, max_content_len=100,
    embedding_matrix=embedding_matrix,
)

# Train only when no saved GRU model exists yet.
gru_is_cached = os.path.exists('./model/GRU.pkl')
if not gru_is_cached:
    model.fit(
        x={"title_input": title_pad, "content_input": content_pad},
        y=labels,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
    )

Epoch 1/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 28ms/step - accuracy: 0.8706 - loss: 0.3047 - val_accuracy: 0.9295 - val_loss: 0.1859
Epoch 2/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 29ms/step - accuracy: 0.9349 - loss: 0.1791 - val_accuracy: 0.9405 - val_loss: 0.1618
Epoch 3/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 30ms/step - accuracy: 0.9471 - loss: 0.1462 - val_accuracy: 0.9491 - val_loss: 0.1463
Epoch 4/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 41ms/step - accuracy: 0.9572 - loss: 0.1196 - val_accuracy: 0.9524 - val_loss: 0.1401
Epoch 5/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 41ms/step - accuracy: 0.9660 - loss: 0.0982 - val_accuracy: 0.9579 - val_loss: 0.1367
Epoch 6/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 30ms/step - accuracy: 0.9724 - loss: 0.0803 - val_accuracy: 0.9569 - val_loss: 0.1467
Epoc

In [27]:
if os.path.exists('./model/GRU.pkl'):
    # A saved model exists: use it instead of the one built in the previous cell.
    with open('./model/GRU.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
else:
    # First run: store the freshly trained model.
    with open('./model/GRU.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

In [28]:
# Score every page with the current (GRU) model.
predicted_labels = model.predict(
    x={"title_input": title_pad, "content_input": content_pad}
)

[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 14ms/step


In [29]:
# One row per page: URL (position 1), rounded model output, stored label (position 4).
predictions_df = pd.DataFrame(dict(
    url=[row[1] for row in input_data],
    predicted_label=[round(score[0]) for score in predicted_labels],
    actual_label=[row[4] for row in input_data],
))

predictions_df.to_csv("./model/content_gru_predictions.csv")

In [30]:
model = build_bigru_model(
    vocab_size=vocab_size, embedding_dim=100,
    max_title_len=10, max_content_len=100,
    embedding_matrix=embedding_matrix,
)

# Train only when no saved BiGRU model exists yet.
bigru_is_cached = os.path.exists('./model/BiGRU.pkl')
if not bigru_is_cached:
    model.fit(
        x={"title_input": title_pad, "content_input": content_pad},
        y=labels,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
    )

Epoch 1/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 30ms/step - accuracy: 0.8815 - loss: 0.2798 - val_accuracy: 0.9406 - val_loss: 0.1611
Epoch 2/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 28ms/step - accuracy: 0.9431 - loss: 0.1591 - val_accuracy: 0.9472 - val_loss: 0.1488
Epoch 3/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 30ms/step - accuracy: 0.9539 - loss: 0.1310 - val_accuracy: 0.9549 - val_loss: 0.1351
Epoch 4/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 32ms/step - accuracy: 0.9633 - loss: 0.1043 - val_accuracy: 0.9542 - val_loss: 0.1327
Epoch 5/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 31ms/step - accuracy: 0.9689 - loss: 0.0886 - val_accuracy: 0.9577 - val_loss: 0.1306
Epoch 6/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 32ms/step - accuracy: 0.9730 - loss: 0.0765 - val_accuracy: 0.9592 - val_loss: 0.1339
Epoc

In [31]:
if os.path.exists('./model/BiGRU.pkl'):
    # A saved model exists: use it instead of the one built in the previous cell.
    with open('./model/BiGRU.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
else:
    # First run: store the freshly trained model.
    with open('./model/BiGRU.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

In [32]:
# Score every page with the current (BiGRU) model.
predicted_labels = model.predict(
    x={"title_input": title_pad, "content_input": content_pad}
)

[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 10ms/step


In [33]:
# One row per page: URL (position 1), rounded model output, stored label (position 4).
predictions_df = pd.DataFrame(dict(
    url=[row[1] for row in input_data],
    predicted_label=[round(score[0]) for score in predicted_labels],
    actual_label=[row[4] for row in input_data],
))

predictions_df.to_csv("./model/content_bigru_predictions.csv")