# Import libraries


In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Flatten, concatenate, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dot
from keras.callbacks import ReduceLROnPlateau
from keras.optimizers import RMSprop

from gensim.models import KeyedVectors

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

import unicodedata
import os
import string
from zipfile import ZipFile
import urllib.request

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Preparing data
Download data from Kaggle

In [None]:
# Build ~/.kaggle/kaggle.json from environment variables so that no secret is
# stored in the notebook itself. KAGGLE_USERNAME and KAGGLE_KEY must be set in
# the kernel's environment before this cell runs.
# Any API key that has ever been saved in plain text inside a shared notebook
# must be revoked and regenerated in the Kaggle account settings.
kaggle_username = os.environ.get("KAGGLE_USERNAME")
kaggle_key = os.environ.get("KAGGLE_KEY")
if not kaggle_username or not kaggle_key:
    raise RuntimeError(
        "Set the KAGGLE_USERNAME and KAGGLE_KEY environment variables "
        "before running this cell."
    )

api_token = {"username": kaggle_username, "key": kaggle_key}

# Resolve the directory from the current user's home instead of assuming /root,
# and tolerate the directory already existing so the cell can be re-run.
kaggle_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

kaggle_json_path = os.path.join(kaggle_dir, "kaggle.json")
with open(kaggle_json_path, 'w') as file:
    json.dump(api_token, file)

# Restrict the credentials file to owner read/write only (same as chmod 600).
os.chmod(kaggle_json_path, 0o600)

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi

# API reference: https://technowhisp.com/kaggle-api-python-documentation/
# Kaggle dataset identifier in "<owner>/<dataset>" form.
dataset_slug = 'duyminhnguyentran/csc15105'

# Create the client and authenticate
# (presumably using the kaggle.json written by the previous cell - confirm).
api = KaggleApi()
api.authenticate()

# Fetch the dataset archive and extract it (unzip=True).
api.dataset_download_files(dataset_slug, unzip=True)

Path to data file (change if necessary)

In [None]:
file_path = "Project1_Data.json"

# The records contain Vietnamese text, so decode the file explicitly as UTF-8
# rather than relying on the platform's default encoding.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Tabular view of the raw records; `data` itself is kept for the
# preprocessing step further down.
df = pd.DataFrame(data)

df.head()

Unnamed: 0,id,question,title,text,label
0,u7-1570446247_1,Quang Hải giành được chức vô địch U21 quốc gia...,Nguyễn Quang Hải (sinh 1997),"Năm 2013 , Nguyễn Quang Hải giành chức vô địch...",True
1,u7-1570446247_2,Quang Hải giành được chức vô địch U21 quốc gia...,Nguyễn Quang Hải (sinh 1997),"Sau chức vô địch U-21 quốc gia 2013 , Nguyễn Q...",True
2,u7-1570446247_0,Quang Hải giành được chức vô địch U21 quốc gia...,Nguyễn Quang Hải (sinh 1997),Anh bắt đầu gia nhập lò đào tạo trẻ Hà Nội T&T...,False
3,u7-1570446247_3,Quang Hải giành được chức vô địch U21 quốc gia...,Nguyễn Quang Hải (sinh 1997),"Năm 2011 , Nguyễn Quang Hải mới 14 tuổi được g...",False
4,u7-1570445661_0,Mỗi hiệp bóng đá kéo dài bao lâu,Bóng đá,Một trận đấu bóng đá thông thường có hai hiệp ...,True


NLTK does not ship a Vietnamese stop-word list, so I download one from the GitHub repository https://github.com/stopwords/vietnamese-stopwords/ and save it in NLTK's stop-word directory, so that it can be loaded like any built-in NLTK stop-word list.

In [None]:
# Directory holding NLTK's stop-word lists. It is resolved from the current
# user's home directory rather than a hardcoded /root path; when the notebook
# runs as root with home /root the resulting path is the same as before.
nltk_stopwords_dir = os.path.join(os.path.expanduser("~"), "nltk_data", "corpora", "stopwords")
# Raw text file with one stop-word entry per line.
stopwords_url = "https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords.txt"


# Function to download file from URL
def download_file(url, save_path):
    """Download `url` and store it at `save_path`.

    The parent directory of `save_path` is created when it does not exist yet,
    so the download does not fail just because the target folder is missing.

    Args:
        url: Address of the resource to fetch.
        save_path: Local file path the content is written to.

    Returns:
        None. Success or failure is reported by printing a message; errors are
        caught and printed rather than raised.
    """
    try:
        parent_dir = os.path.dirname(save_path)
        # dirname is '' for a bare file name; makedirs('') would raise.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        urllib.request.urlretrieve(url, save_path)
        print("File downloaded successfully.")
    except Exception as e:
        print("Error downloading file:", e)

def convert_to_nltk_stopwords(txt_file_path, nltk_stopwords_path):
    """Rewrite a plain-text word list as a newline-separated stop-word file.

    Every line of `txt_file_path` is stripped of surrounding whitespace, blank
    lines are discarded, and the remaining entries are written to
    `nltk_stopwords_path` joined by single newlines (no trailing newline).
    Both files are handled as UTF-8.

    Args:
        txt_file_path: Path of the source word list.
        nltk_stopwords_path: Path of the file to create or overwrite.

    Returns:
        None. The outcome is printed; errors are caught and printed.
    """
    try:
        with open(txt_file_path, 'r', encoding='utf-8') as source:
            stripped_lines = (raw_line.strip() for raw_line in source)
            entries = [entry for entry in stripped_lines if entry]

        payload = '\n'.join(entries)
        with open(nltk_stopwords_path, 'w', encoding='utf-8') as target:
            target.write(payload)

        print("Stop words file converted and saved successfully.")
    except Exception as err:
        print("Error converting and saving stop words file:", err)

# Locations inside the NLTK stop-word directory: the raw download and the
# cleaned list named "vietnamese" (the name later passed to stopwords.words).
stopwords_file_path = os.path.join(nltk_stopwords_dir, "vietnamese-stopwords.txt")
nltk_stopwords_path = os.path.join(nltk_stopwords_dir, "vietnamese")

# Fetch the raw list, then write the cleaned copy next to it.
download_file(stopwords_url, stopwords_file_path)
convert_to_nltk_stopwords(stopwords_file_path, nltk_stopwords_path)


File downloaded successfully.
Stop words file converted and saved successfully.


Remove stopwords and punctuation, and convert to lowercase

In [None]:
stop_words = set(stopwords.words('vietnamese'))

# Characters treated as punctuation: the ASCII set from `string.punctuation`.
# Unicode punctuation (curly quotes, ellipsis, dashes, ...) is detected
# through its Unicode category in `_is_punctuation_token`.
_ASCII_PUNCTUATION = frozenset(string.punctuation)


def _is_punctuation_token(token):
    """Return True when every character of `token` is punctuation."""
    return all(
        ch in _ASCII_PUNCTUATION or unicodedata.category(ch).startswith('P')
        for ch in token
    )


def preprocess_text(text):
    """Tokenize `text`, drop stop words and punctuation, and lowercase.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens, lowercased and joined by single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        # NOTE(review): matching is per token, so stop-word entries that
        # contain a space can never match here - confirm whether that is
        # intended for the Vietnamese list.
        if lowered in stop_words:
            continue
        # A per-character test is used because `token in string.punctuation`
        # is a substring test: it only rejects tokens that happen to be a
        # contiguous slice of that constant, so multi-character punctuation
        # tokens such as "..." would slip through.
        if _is_punctuation_token(token):
            continue
        kept_tokens.append(lowered)
    return ' '.join(kept_tokens)

# Fields whose text is cleaned; 'id' and 'label' are copied through unchanged.
text_fields = ('question', 'title', 'text')

# One cleaned record per raw entry, with the same key order as before:
# id, question, title, text, label.
preprocessed_data = [
    {
        'id': record['id'],
        **{field: preprocess_text(record[field]) for field in text_fields},
        'label': record['label'],
    }
    for record in data
]

df = pd.DataFrame(preprocessed_data)

df.head()

Unnamed: 0,id,question,title,text,label
0,u7-1570446247_1,quang hải giành chức vô địch u21 quốc gia bao ...,nguyễn quang hải sinh 1997,2013 nguyễn quang hải giành chức vô địch u21 q...,True
1,u7-1570446247_2,quang hải giành chức vô địch u21 quốc gia bao ...,nguyễn quang hải sinh 1997,chức vô địch u-21 quốc gia 2013 nguyễn quang h...,True
2,u7-1570446247_0,quang hải giành chức vô địch u21 quốc gia bao ...,nguyễn quang hải sinh 1997,bắt đầu gia nhập lò đào trẻ hà nội t t 9 2006,False
3,u7-1570446247_3,quang hải giành chức vô địch u21 quốc gia bao ...,nguyễn quang hải sinh 1997,2011 nguyễn quang hải 14 gọi đội tuyển u-16 vi...,False
4,u7-1570445661_0,hiệp bóng đá kéo bao,bóng đá,trận đấu bóng đá thông hai hiệp hiệp 45 phút t...,True


Split the data into training, validation and test sets in the ratio 0.7 : 0.15 : 0.15

In [None]:
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Step 1: keep `train_ratio` of the rows for training; the rest is held out.
# NOTE(review): the split is row-wise, so rows that share the same question
# can end up in different splits - confirm whether a group-aware split is
# wanted.
train, holdout = train_test_split(df, test_size=1 - train_ratio, random_state=42)

# Step 2: divide the held-out rows between validation and test according to
# their relative ratios.
val, test = train_test_split(
    holdout,
    test_size=test_ratio / (test_ratio + val_ratio),
    random_state=42,
)

# Encode labels: fit the encoder on the training labels only, then reuse the
# same mapping for the validation and test labels.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train['label'])
val_labels, test_labels = (
    label_encoder.transform(split['label']) for split in (val, test)
)

# Perform Word Embedding using Word2Vec

Download pretrained Vietnamese Word2Vec model

In [None]:
!wget https://thiaisotajppub.s3-ap-northeast-1.amazonaws.com/publicfiles/wiki.vi.model.bin.gz
!gzip -d wiki.vi.model.bin.gz

--2024-03-06 16:08:41--  https://thiaisotajppub.s3-ap-northeast-1.amazonaws.com/publicfiles/wiki.vi.model.bin.gz
Resolving thiaisotajppub.s3-ap-northeast-1.amazonaws.com (thiaisotajppub.s3-ap-northeast-1.amazonaws.com)... 3.5.158.194, 52.219.16.59, 52.219.8.43, ...
Connecting to thiaisotajppub.s3-ap-northeast-1.amazonaws.com (thiaisotajppub.s3-ap-northeast-1.amazonaws.com)|3.5.158.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 345967494 (330M) [application/x-gzip]
Saving to: ‘wiki.vi.model.bin.gz’


2024-03-06 16:09:03 (16.0 MB/s) - ‘wiki.vi.model.bin.gz’ saved [345967494/345967494]



In [None]:
# Every text and question sequence is padded or truncated to this many tokens.
max_sequence_length = 100

# Load the pretrained Word2Vec vectors from the binary word2vec-format file.
word2vec_path = 'wiki.vi.model.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

def preprocess_text(text):
    """Split `text` on whitespace and keep only tokens known to Word2Vec.

    Note: this definition replaces the earlier `preprocess_text` (the
    stop-word / punctuation cleaner) from this point of the notebook onward.

    Args:
        text: An already cleaned, space-separated string.

    Returns:
        List of tokens that are present in `word2vec_model`, in original order.
    """
    return [piece for piece in text.split() if piece in word2vec_model]

# Token lists for the passage text of each split (train, val, test).
train_text_sequences, val_text_sequences, test_text_sequences = (
    split['text'].apply(preprocess_text) for split in (train, val, test)
)

# Token lists for the question of each split, in the same split order.
train_question_sequences, val_question_sequences, test_question_sequences = (
    split['question'].apply(preprocess_text) for split in (train, val, test)
)

# Fit the vocabulary on the training split only. Both operands are pandas
# Series of token lists, so `+` concatenates the text and question tokens
# row by row and each row contributes one combined token list.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text_sequences + train_question_sequences)

# Replace the token lists by lists of integer word indices (passage text).
train_text_sequences, val_text_sequences, test_text_sequences = (
    tokenizer.texts_to_sequences(token_lists)
    for token_lists in (train_text_sequences, val_text_sequences, test_text_sequences)
)

# Same conversion for the questions.
train_question_sequences, val_question_sequences, test_question_sequences = (
    tokenizer.texts_to_sequences(token_lists)
    for token_lists in (train_question_sequences, val_question_sequences, test_question_sequences)
)

# Bring every index sequence to exactly `max_sequence_length` entries
# (passage text first, then questions; split order is train, val, test).
train_text_data, val_text_data, test_text_data = (
    pad_sequences(index_lists, maxlen=max_sequence_length)
    for index_lists in (train_text_sequences, val_text_sequences, test_text_sequences)
)

train_question_data, val_question_data, test_question_data = (
    pad_sequences(index_lists, maxlen=max_sequence_length)
    for index_lists in (train_question_sequences, val_question_sequences, test_question_sequences)
)

# Build the embedding matrix: row i holds the Word2Vec vector of the word the
# tokenizer mapped to index i. The extra row (hence the +1) and the rows of
# words without a vector stay all-zero.
word_to_row = tokenizer.word_index
vocab_size = 1 + len(word_to_row)
embedding_dim = word2vec_model.vector_size
embedding_matrix = np.zeros(shape=(vocab_size, embedding_dim))

for token, row in word_to_row.items():
    if token not in word2vec_model:
        continue
    embedding_matrix[row] = word2vec_model[token]


# Build the classification model

In [None]:
# Two integer-sequence inputs of identical length: the passage and the question.
sequence_shape = (max_sequence_length,)
text_input = Input(shape=sequence_shape, name='text_input')
question_input = Input(shape=sequence_shape, name='question_input')

embedding_dim = word2vec_model.vector_size

# Both branches get their own embedding layer, each initialised from the
# pretrained matrix and frozen (trainable=False).
embedding_kwargs = dict(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False,
)
text_embedding_layer = Embedding(**embedding_kwargs)(text_input)
question_embedding_layer = Embedding(**embedding_kwargs)(question_input)

# Collapse each embedded sequence into a single flat vector per sample.
text_flatten_layer1 = Flatten()(text_embedding_layer)
question_flatten_layer1 = Flatten()(question_embedding_layer)

# Define Dense layers
# Each branch is a stack of fully connected ReLU layers narrowing to 128 units.
# No `input_shape` is passed: in the functional API the input size comes from
# the tensor the layer is called on, and the previously stated
# (max_sequence_length,) did not match the flattened embedding input anyway.
text_dense_layer1 = Dense(embedding_dim, activation='relu')(text_flatten_layer1)
text_dense_layer2 = Dense(512, activation='relu')(text_dense_layer1)
text_dense_layer3 = Dense(256, activation='relu')(text_dense_layer2)
text_dense_layer4 = Dense(128, activation='relu')(text_dense_layer3)

question_dense_layer1 = Dense(embedding_dim, activation='relu')(question_flatten_layer1)
question_dense_layer2 = Dense(512, activation='relu')(question_dense_layer1)
question_dense_layer3 = Dense(256, activation='relu')(question_dense_layer2)
question_dense_layer4 = Dense(128, activation='relu')(question_dense_layer3)

# Normalised dot product of the two 128-unit branch outputs; with
# normalize=True the inputs are L2-normalised first, so the result is their
# cosine similarity.
dot_layer = Dot(axes=1, normalize=True)([text_dense_layer4, question_dense_layer4])

# Output layer
# The model is trained with binary cross-entropy, which expects a probability
# in [0, 1]. A sigmoid provides that; the former ReLU could emit exactly 0 or
# values above 1, which makes the loss ill-defined.
output_layer = Dense(1, activation='sigmoid')(dot_layer)

# Define model
model = Model(inputs=[text_input, question_input], outputs=output_layer)

# Compile model
# Halve the learning rate when the validation loss stalls. `min_lr` must be
# below the optimizer's initial rate (1e-4), otherwise no reduction can ever
# take place, and the patience is kept shorter than the early-stopping
# patience so that a reduction is tried before training is stopped.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)
optimizer = RMSprop(learning_rate=0.0001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Early stopping to prevent overfitting; the weights of the best epoch
# (lowest validation loss) are restored at the end.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model. Both callbacks are passed to fit(); the learning-rate
# scheduler was previously created but never used.
history = model.fit({'text_input': train_text_data, 'question_input': train_question_data},
                    train_labels,
                    epochs=10,
                    batch_size=32,
                    validation_data=({'text_input': val_text_data, 'question_input': val_question_data}, val_labels),
                    callbacks=[early_stopping, learning_rate_reduction])

# Score the trained model on the held-out test split and report both metrics.
test_inputs = {'text_input': test_text_data, 'question_input': test_question_data}
loss, accuracy = model.evaluate(test_inputs, test_labels)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Test Loss: 0.6007127165794373
Test Accuracy: 0.6952521204948425
