In [None]:
import numpy as np
import pandas as pd
import re
import string

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from gensim.models import KeyedVectors

!pip install pyvi
from pyvi import ViTokenizer



Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Load pretrained word vectors from a word2vec-format *text* file (binary=False).
# NOTE(review): the file name suggests the fastText Common Crawl Vietnamese
# vectors (300 dimensions) — confirm the source of "cc.vi.300.vec".
print("Đang load word embedding tiếng Việt...")
file_path = "cc.vi.300.vec" 
model_emb = KeyedVectors.load_word2vec_format(file_path, binary=False)

# Length of every word vector; the rest of the notebook uses it to size the
# zero vectors created for out-of-vocabulary tokens and for padding.
embedding_dim = model_emb.vector_size



Đang load word embedding tiếng Việt...


In [None]:
def get_vector(word):
    """Look up the embedding of ``word`` in the loaded ``model_emb`` vectors.

    Returns:
        The stored vector when ``word`` is a key of ``model_emb``; otherwise
        an all-zero float32 vector of length ``embedding_dim``.
    """
    in_vocabulary = word in model_emb.key_to_index
    return model_emb[word] if in_vocabulary else np.zeros(embedding_dim, dtype=np.float32)

def preprocess_vi(sentence):
    """Normalise a sentence and split it into tokens.

    The text is lower-cased and stripped of surrounding whitespace, passed
    through ``ViTokenizer.tokenize``, and the resulting string is split on
    whitespace.

    Returns:
        list[str]: the tokens, in order.
    """
    normalized = sentence.lower().strip()
    segmented = ViTokenizer.tokenize(normalized)
    return segmented.split()

def cosine_sim(a, b):
    """Cosine similarity between two 1-D vectors.

    Returns:
        float: ``a . b / (|a| * |b|)``, or ``0.0`` when either vector has a
        norm below ``1e-9`` (which avoids dividing by ~zero).
    """
    eps = 1e-9
    len_a, len_b = np.linalg.norm(a), np.linalg.norm(b)
    degenerate = len_a < eps or len_b < eps
    return 0.0 if degenerate else float((a @ b) / (len_a * len_b))

def find_best_match(vec_s, list_vec_t):
    """Return the vector in ``list_vec_t`` most similar to ``vec_s``.

    Similarity is measured with ``cosine_sim``. A candidate is only selected
    when its similarity is strictly greater than the best seen so far
    (starting from -1.0), so ties keep the earliest candidate. When no
    candidate is selected (e.g. ``list_vec_t`` is empty) a zero vector shaped
    like ``vec_s`` is returned.
    """
    fallback = np.zeros_like(vec_s)
    candidates = list(list_vec_t)

    winner_idx = None
    top_score = -1.0
    for idx, candidate in enumerate(candidates):
        score = cosine_sim(vec_s, candidate)
        if score > top_score:
            winner_idx, top_score = idx, score

    return fallback if winner_idx is None else candidates[winner_idx]

def linear_decompose(s_i, s_i_hat):
    """Split ``s_i`` into a "similar" and a "dissimilar" component.

    The mixing weight is the cosine similarity between ``s_i`` and its match
    ``s_i_hat``; the two parts are ``weight * s_i`` and ``(1 - weight) * s_i``,
    so they always add back up to ``s_i``.

    Returns:
        tuple: ``(s_plus, s_minus)``.
    """
    weight = cosine_sim(s_i, s_i_hat)
    return weight * s_i, (1 - weight) * s_i


class SimilarityDataset(Dataset):
    """Dataset over a DataFrame of sentence pairs with a similarity score.

    The frame must provide the columns ``sentence1``, ``sentence2`` and
    ``similarity``. Items are returned as raw text; turning them into tensors
    is left to the DataLoader's collate function.
    """

    def __init__(self, df):
        super().__init__()
        # Renumber rows from 0 so positional lookups work on any slice/shuffle.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(sentence1, sentence2, similarity)`` as ``(str, str, float)``."""
        record = self.df.iloc[idx]
        first, second = (str(record[column]) for column in ("sentence1", "sentence2"))
        return first, second, float(record["similarity"])

def collate_fn(batch, max_len=20):
    """Collate ``(sentence1, sentence2, label)`` samples into model tensors.

    For every pair, each token vector of sentence1 is matched with its most
    similar token vector of sentence2 (``find_best_match``) and split by
    ``linear_decompose`` into a "similar" and a "dissimilar" part. Both parts
    are truncated / zero-padded to ``max_len`` rows and become the two input
    channels.

    Args:
        batch: iterable of ``(sent1, sent2, label)`` tuples.
        max_len: number of token rows kept per sentence. Defaults to 20, the
            value that was previously hard-coded, so ``DataLoader`` can keep
            calling ``collate_fn(batch)``.

    Returns:
        tuple: ``(input_tensor, labels_tensor)`` of dtype float32 with shapes
        ``(B, 2, max_len, embedding_dim)`` and ``(B, 1)``.
    """
    batch_similar = []
    batch_dissimilar = []
    batch_labels = []

    for sent1, sent2, label in batch:
        vecs_s = [get_vector(tok) for tok in preprocess_vi(sent1)]
        vecs_t = [get_vector(tok) for tok in preprocess_vi(sent2)]

        # Start from all-zero matrices: rows that are never written are the padding.
        plus_array = np.zeros((max_len, embedding_dim), dtype=np.float32)
        minus_array = np.zeros((max_len, embedding_dim), dtype=np.float32)

        # Tokens past max_len would be truncated anyway, so do not decompose them.
        for row, s_i in enumerate(vecs_s[:max_len]):
            s_i_hat = find_best_match(s_i, vecs_t)
            plus_array[row], minus_array[row] = linear_decompose(s_i, s_i_hat)

        batch_similar.append(plus_array)
        batch_dissimilar.append(minus_array)
        batch_labels.append(label)

    # Build one contiguous ndarray first: torch.tensor() on a *list* of
    # ndarrays is very slow and makes PyTorch emit a UserWarning.
    sim_tensor = torch.from_numpy(np.asarray(batch_similar, dtype=np.float32))
    dis_tensor = torch.from_numpy(np.asarray(batch_dissimilar, dtype=np.float32))

    # Stack the two channels: (B, 2, max_len, emb_dim)
    input_tensor = torch.stack([sim_tensor, dis_tensor], dim=1)
    labels_tensor = torch.tensor(batch_labels, dtype=torch.float).view(-1, 1)

    return input_tensor, labels_tensor


class TwoChannelCNN(nn.Module):
    """Small CNN scoring a two-channel sentence-pair representation.

    Pipeline: 2-D convolution over the two input channels -> ReLU -> global
    max pooling -> linear layer -> sigmoid, giving one score in [0, 1] per
    sample.

    Args:
        emb_dim: accepted for interface compatibility; no layer uses it.
        num_filters: number of convolution output channels.
        kernel_size: side length of the square convolution kernel.
    """

    def __init__(self, emb_dim, num_filters=64, kernel_size=3):
        super().__init__()
        self.conv = nn.Conv2d(
            2,
            num_filters,
            (kernel_size, kernel_size),
            padding=(1, 1),
        )
        # Global max over both spatial axes -> one value per filter.
        self.pool = nn.AdaptiveMaxPool2d((1, 1))
        self.fc = nn.Linear(num_filters, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map ``x`` of shape (B, 2, max_len, emb_dim) to scores of shape (B, 1)."""
        activated = torch.relu(self.conv(x))      # (B, num_filters, H, W)
        pooled = self.pool(activated)[..., 0, 0]  # (B, num_filters)
        return self.sigmoid(self.fc(pooled))      # (B, 1), values in [0, 1]

# 6) LOAD THE DATA & TRAIN

if __name__ == "__main__":
    # Training data: ';'-separated CSV using ',' as the decimal mark, with the
    # columns sentence1, sentence2, similarity.
    # Other file names kept from the original notes:
    #   filtered_output.csv, normalized_converted_data.csv, data_main.csv
    df_all = pd.read_csv("data_train_main.csv", sep=";", decimal=",")

    # Seed torch so weight initialisation and DataLoader shuffling are
    # repeatable across runs (the pandas shuffle below is already seeded).
    torch.manual_seed(42)

    # Shuffle, then split 80% train / 20% validation.
    df_shuf = df_all.sample(frac=1, random_state=42).reset_index(drop=True)
    train_size = int(0.8 * len(df_shuf))
    df_train = df_shuf[:train_size]
    df_val   = df_shuf[train_size:]

    train_ds = SimilarityDataset(df_train)
    val_ds   = SimilarityDataset(df_val)

    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader   = DataLoader(val_ds, batch_size=4, shuffle=False, collate_fn=collate_fn)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Training device:", device)

    model = TwoChannelCNN(emb_dim=embedding_dim, num_filters=64, kernel_size=3).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    def eval_loss(dloader):
        """Return the mean per-batch loss over ``dloader`` with gradients disabled."""
        model.eval()
        losses = []
        with torch.no_grad():
            for xb, yb in dloader:
                xb = xb.to(device)
                yb = yb.to(device)
                pred = model(xb)
                loss = criterion(pred, yb)
                losses.append(loss.item())
        # An empty loader has nothing to average; report NaN instead of
        # triggering numpy's "mean of empty slice" warning.
        return float(np.mean(losses)) if losses else float("nan")

    num_epochs = 10  # tunable
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        val_l = eval_loss(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"TrainLoss={np.mean(train_losses):.4f}, "
              f"ValLoss={val_l:.4f}")

    # Save the weights. The path lives in one variable so the confirmation
    # message always names the file that was actually written (it used to
    # report "model_weights_vi.pt" while saving "model_weights_vi_6.pt").
    weights_path = "model_weights_vi_6.pt"
    torch.save(model.state_dict(), weights_path)
    print(f"Đã train xong. Mô hình được lưu vào {weights_path}")

In [None]:
import pandas as pd
import torch
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Similarity cut-off handed to evaluate_model by the evaluation script in this
# cell: scores >= threshold are counted as the positive class.
threshold = 0.7

def predict_similarity(model, sentence1, sentence2, max_len=20):
    """Predict the similarity score of one sentence pair.

    The pair is turned into the two-channel input the model was trained on:
    every token vector of ``sentence1`` is matched with its most similar token
    vector of ``sentence2`` and decomposed into a "similar" and a "dissimilar"
    part; each channel is truncated / zero-padded to ``max_len`` rows.

    Args:
        model: network taking a ``(1, 2, max_len, embedding_dim)`` float
            tensor and returning a single-element tensor.
        sentence1: first sentence (str).
        sentence2: second sentence (str).
        max_len: number of token rows per channel.

    Returns:
        float: the model output for the pair.

    Note:
        The model is switched to eval mode and left in it.
    """
    # 1) Preprocess + 2) embed
    vecs_s = [get_vector(w) for w in preprocess_vi(sentence1)]
    vecs_t = [get_vector(w) for w in preprocess_vi(sentence2)]

    # 3) Matching + decomposition, written straight into zero-initialised
    #    (max_len, emb_dim) matrices; untouched rows are the padding.
    plus_array = np.zeros((max_len, embedding_dim), dtype=np.float32)
    minus_array = np.zeros((max_len, embedding_dim), dtype=np.float32)
    # 4) Truncation: tokens past max_len are never used, so skip them.
    for row, s_i in enumerate(vecs_s[:max_len]):
        s_i_hat = find_best_match(s_i, vecs_t)
        plus_array[row], minus_array[row] = linear_decompose(s_i, s_i_hat)

    # 5) Two channels plus a batch axis => shape (1, 2, max_len, emb_dim).
    #    Converting one ndarray avoids the slow torch.tensor(list-of-ndarray) path.
    channels = np.stack([plus_array, minus_array], axis=0)
    input_tensor = torch.from_numpy(channels).unsqueeze(0)

    # The input must live on the same device as the model; otherwise a model
    # moved to CUDA fails with a device-mismatch RuntimeError.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_tensor = input_tensor.to(first_param.device)

    # 6) Forward pass -> similarity
    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
    # output shape (1, 1)
    return float(output.item())

def predict_labels(similarity_scores, threshold=0.5):
    """Binarise similarity scores.

    Returns:
        list[int]: ``1`` for every score that is >= ``threshold``, else ``0``,
        in the order of ``similarity_scores``.
    """
    labels = []
    for score in similarity_scores:
        labels.append(int(bool(score >= threshold)))
    return labels

def evaluate_model(model, ground_truth_file, threshold=0.5):
    """Score ``model`` on a labelled CSV as a binary classifier.

    The file is read as a ';'-separated CSV with ',' as the decimal mark and
    must contain the columns ``sentence1``, ``sentence2`` and ``similarity``.
    Both the ground-truth similarities and the predicted ones are binarised
    with the same ``threshold`` before the metrics are computed.

    Args:
        model: model passed on to ``predict_similarity``.
        ground_truth_file: path of the CSV file.
        threshold: cut-off applied to true and predicted similarities.

    Returns:
        tuple: ``(precision, recall, f1)``.
    """
    data = pd.read_csv(ground_truth_file, delimiter=';', decimal=',')

    # Cast every sentence to str, exactly as SimilarityDataset does for the
    # training data: an empty or numeric cell is parsed as a float and would
    # otherwise crash in preprocess_vi at `.lower()`.
    sentence1 = [str(text) for text in data['sentence1']]
    sentence2 = [str(text) for text in data['sentence2']]
    true_similarity = data['similarity'].tolist()

    predicted_similarity = [
        predict_similarity(model, sent_a, sent_b)
        for sent_a, sent_b in zip(sentence1, sentence2)
    ]

    true_labels = predict_labels(true_similarity, threshold)
    predicted_labels = predict_labels(predicted_similarity, threshold)

    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    return precision, recall, f1

if __name__ == "__main__":
    # Evaluation entry point: rebuild the network, load a saved checkpoint and
    # report precision / recall / F1 on the ground-truth CSV.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # The architecture arguments must match those used when the checkpoint was
    # trained, otherwise load_state_dict rejects the weights.
    # NOTE(review): this loads "model_weights_vi_5.pt" while the training cell
    # writes "model_weights_vi_6.pt" — confirm which checkpoint is intended.
    model = TwoChannelCNN(emb_dim=embedding_dim, num_filters=64, kernel_size=3).to(device)
    model.load_state_dict(torch.load("model_weights_vi_5.pt", map_location=device))

    ground_truth_file = "normalized_converted_data.csv"  

    # `threshold` is the module-level value defined at the top of this cell.
    precision, recall, f1 = evaluate_model(model, ground_truth_file, threshold)

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Measure: {f1:.4f}")

    # Results recorded by hand for earlier checkpoints, all at threshold = 0.7:
    # threshold = 0.7
    # model_weights_vi Precision: 0.8135 Recall: 0.2836 F1-Measure: 0.4206
    # model_weights_vi_1 Precision: 0.7036 Recall: 0.7582 F1-Measure: 0.7299
    # model_weights_vi_2 Precision: 0.7317 Recall: 0.6526 F1-Measure: 0.6899
    # model_weights_vi_3 Precision: 0.6122 Recall: 0.9713 F1-Measure: 0.7510
    # model_weights_vi_4 Precision: 0.6845 Recall: 0.8729 F1-Measure: 0.7673
    # model_weights_vi_5 Precision: 0.6882 Recall: 0.8800 F1-Measure: 0.7724
    # model_weights_vi_6 Precision: 0.7903 Recall: 0.4824 F1-Measure: 0.5991
    # model_weights_vi_7 Precision: 0.7218 Recall: 0.7611 F1-Measure: 0.7409
    # model_weights_vi_5k Precision: 0.6996 Recall: 0.5189 F1-Measure: 0.5959


    
