In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt


In [4]:
# Training
# Load the dataset. Later statements read the "Column1"/"Column1_Tag" and
# "Column2"/"Column2_Tag" columns from this frame.
# NOTE: the name `data` is rebound further down when the survey CSV is loaded,
# so re-run this line before re-running any cell that needs the training frame.
data = pd.read_csv("matched_data.csv")  # Replace with your actual file path

# Preprocess the data
class PoemDataset(Dataset):
    """Tagging dataset that pairs a tokenized text with a fixed-length label vector.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of label sequences (one sequence of ints
            per text).
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")``; it must return a mapping
            with "input_ids" and "attention_mask" tensors that carry a leading
            batch dimension of size 1.
        max_len: Length every returned tensor is padded or truncated to.
        pad_label: Value written into label positions past the end of the
            label sequence. Defaults to 0, which is the historical behavior.
            Passing -100 together with ``nn.CrossEntropyLoss`` (whose default
            ``ignore_index`` is -100) keeps padding out of the loss.
    """

    def __init__(self, texts, labels, tokenizer, max_len, pad_label=0):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad_label = pad_label

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for one sample.

        All three tensors have length ``max_len`` so the default collate
        function can stack them into a batch.
        """
        text = self.texts[index]
        label = self.labels[index]

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Drop the batch dimension of size 1 added by return_tensors="pt".
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Truncate first: without this, a label sequence longer than max_len
        # would stay longer than max_len (a negative repeat count yields an
        # empty list) and the batch could not be stacked.
        # NOTE(review): label i is stored at sequence position i. If the
        # tokenizer prepends a special token, labels are offset by one relative
        # to the characters; predict_tags_batch slices predictions with the
        # same layout, so the alignment is intentionally left unchanged here.
        tags = list(label)[: self.max_len]
        tags = tags + [self.pad_label] * (self.max_len - len(tags))
        label = torch.tensor(tags, dtype=torch.long)

        return input_ids, attention_mask, label


# Prepare data
MAX_LEN = 128  # Adjust as needed
texts = data["Column1"].tolist()
# Each tag value is rendered as a string and expanded into one int per character.
labels = [
    [int(digit) for digit in str(tag)]
    for tag in data["Column1_Tag"].tolist()
]

# Hold out 10% of the samples for validation (fixed seed for a repeatable split)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=42,
)

# Load the pretrained tokenizer and encoder
tokenizer = AutoTokenizer.from_pretrained("SIKU-BERT/sikubert")
sikubert = AutoModel.from_pretrained("SIKU-BERT/sikubert")

# Wrap both splits in datasets that pad/truncate to MAX_LEN
train_dataset = PoemDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=MAX_LEN
)
val_dataset = PoemDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=MAX_LEN
)

# Only the training loader shuffles
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

# Define the model
class WordSegmentationModel(nn.Module):
    """Per-position two-class classifier stacked on a pretrained encoder.

    Args:
        sikubert: Encoder module exposing ``config.hidden_size`` and returning
            an object with a ``last_hidden_state`` attribute when called with
            ``input_ids`` and ``attention_mask`` keyword arguments.
    """

    def __init__(self, sikubert):
        super().__init__()
        self.sikubert = sikubert
        self.dropout = nn.Dropout(p=0.3)
        # Two output units: binary classification for every sequence position.
        hidden_dim = sikubert.config.hidden_size
        self.classifier = nn.Linear(in_features=hidden_dim, out_features=2)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits computed from the encoder's last hidden state."""
        encoded = self.sikubert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = self.dropout(encoded.last_hidden_state)
        return self.classifier(hidden_states)


# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the tagger around the pretrained encoder and move it to the chosen device.
model = WordSegmentationModel(sikubert)
model = model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5):
    """Train ``model`` and print the mean training and validation loss per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is reshaped to ``(-1, 2)`` before the loss is computed.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        val_loader: Iterable of batches with the same layout, used without
            gradient tracking.
        criterion: Loss callable taking ``(logits, targets)``.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Number of passes over ``train_loader``.

    Tensors are moved to the module-level ``device`` before use. Nothing is
    returned; progress is reported through ``print`` and ``tqdm``.
    """
    for epoch_idx in range(num_epochs):
        epoch_no = epoch_idx + 1

        # ---- training pass ----
        model.train()
        running_train_loss = 0
        for batch in tqdm(train_loader):
            ids, mask, targets = (tensor.to(device) for tensor in batch)
            optimizer.zero_grad()
            logits = model(ids, mask)
            # Flatten positions so each one is scored as an independent sample.
            batch_loss = criterion(logits.view(-1, 2), targets.view(-1))
            batch_loss.backward()
            optimizer.step()
            running_train_loss += batch_loss.item()

        print(f"Epoch {epoch_no}, Train Loss: {running_train_loss / len(train_loader)}")

        # ---- validation pass ----
        model.eval()
        running_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                ids, mask, targets = (tensor.to(device) for tensor in batch)
                logits = model(ids, mask)
                batch_loss = criterion(logits.view(-1, 2), targets.view(-1))
                running_val_loss += batch_loss.item()

        print(f"Epoch {epoch_no}, Val Loss: {running_val_loss / len(val_loader)}")


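# Train the model (left commented out: the next cell loads a previously saved checkpoint instead)
# train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5)
# Save the model (this pickles the whole module object, not just a state_dict)
# torch.save(model, "word_segmentation_model_full.pth")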

In [5]:
# Load the saved model
# The checkpoint was written with torch.save(model, ...), i.e. the whole pickled
# module rather than a state_dict, so:
#   * weights_only=False must be passed explicitly on PyTorch releases where
#     torch.load defaults to weights-only loading;
#   * the WordSegmentationModel class must already be defined in this kernel;
#   * SECURITY: unpickling can execute arbitrary code. Only load checkpoints
#     you created yourself or otherwise trust.
# map_location remaps tensors saved on another device, so the file also loads
# on a machine without a GPU.
model = torch.load(
    "word_segmentation_model_full.pth",
    map_location=device,
    weights_only=False,
)
model.to(device)  # keep the model on the same device the input batches are sent to
model.eval()  # Set to evaluation mode
print("Model loaded successfully!")


  model = torch.load("word_segmentation_model_full.pth")


Model loaded successfully!


In [4]:
# Test

# Define the testing function
def test_model(model, test_loader):
    """Run ``model`` over ``test_loader`` and collect flat predictions and labels.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` whose
            output is arg-maxed over dimension 2.
        test_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        tuple: ``(all_preds, all_labels)``, two flat lists with one entry per
        sequence position of every sample. Nothing is filtered out here, so
        padded positions are included.

    Tensors are moved to the module-level ``device`` before the forward pass.
    """
    model.eval()
    collected_preds, collected_labels = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            ids, mask, targets = (tensor.to(device) for tensor in batch)

            logits = model(ids, mask)
            # Highest-scoring class for every position
            predicted = torch.argmax(logits, dim=2)

            # Flatten the per-batch arrays into the running lists
            collected_preds.extend(predicted.cpu().numpy().ravel())
            collected_labels.extend(targets.cpu().numpy().ravel())

    return collected_preds, collected_labels


# Prepare the test dataset and loader
test_texts = data["Column2"].tolist()  # Replace with actual test texts
# Each tag value is rendered as a string and expanded into one int per character.
test_labels = [
    [int(digit) for digit in str(tag)]
    for tag in data["Column2_Tag"].tolist()
]

test_dataset = PoemDataset(
    texts=test_texts, labels=test_labels, tokenizer=tokenizer, max_len=MAX_LEN
)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)

# # Run the testing function
# preds, labels = test_model(model, test_loader)

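# # Filter out padding tokens (this assumes label -1 marks padding).
# # NOTE: PoemDataset pads labels with 0, which is also a real class id, not -1,
# # so as written this filter keeps every position, padding included.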
# valid_preds = [p for p, l in zip(preds, labels) if l != -1]
# valid_labels = [l for l in labels if l != -1]

# # Evaluate with classification metrics
# print(classification_report(valid_labels, valid_preds, target_names=["Continuation", "Beginning"]))


In [6]:
# Define the function to predict tags
def predict_tags_batch(texts, model, tokenizer, max_len=128, batch_size=32):
    """
    Predict 01 labels for multiple Chinese texts in batch mode.

    Args:
        texts (list): List of input Chinese texts.
        model: Trained word segmentation model.
        tokenizer: SikuBERT tokenizer.
        max_len (int): Maximum sequence length for padding/truncation.
        batch_size (int): Number of texts to process in one batch.

    Returns:
        list of list of int: Predicted tags (01 labels) for each text. Each
        list has ``min(len(text), max_len)`` entries, so an empty text yields
        an empty list. The first tag of every non-empty text is forced to 1.
    """
    # Nothing to predict; also avoids handing an empty batch to the tokenizer.
    if len(texts) == 0:
        return []

    model.eval()

    # Send the inputs to wherever the model's weights live instead of relying
    # on a global `device`, which may differ from the device of a loaded model.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    # Tokenize all texts
    encodings = tokenizer(
        texts,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]

    predicted_tags = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            stop = start + batch_size

            # Get batch data
            batch_input_ids = input_ids[start:stop].to(run_device)
            batch_attention_mask = attention_mask[start:stop].to(run_device)

            # Predict
            outputs = model(batch_input_ids, batch_attention_mask)
            batch_predictions = torch.argmax(outputs, dim=2).cpu().numpy()

            # Convert predictions to list of lists (remove padding).
            # NOTE(review): the slice keeps positions 0..len(text)-1, i.e. it
            # assumes one position per character in the same layout that
            # PoemDataset uses for its labels. Confirm this against the
            # tokenizer's output.
            for offset, prediction in enumerate(batch_predictions):
                text_length = len(texts[start + offset])
                pred = prediction[:text_length].tolist()

                # The first position is always tagged 1; the emptiness check
                # keeps an empty text from raising IndexError.
                if pred and pred[0] == 0:
                    pred[0] = 1
                predicted_tags.append(pred)

    return predicted_tags


In [7]:
# Application of the word segmentation model and cosine similarity comparison

# load the selected raw couplets texts
# NOTE: this rebinds `data`, which earlier cells use for the training CSV.
data = pd.read_csv('filtered_survey_results.csv')
# The couplets are stored as column headers, starting at the fourth column.
text = data.columns[3:].to_list()


def _split_couplet(couplet):
    """Split one couplet string into its ``(upper, lower)`` halves.

    A full-width comma is tried first. If it does not produce exactly two
    parts, the string is split on whitespace instead. ``str.split()`` with no
    argument treats any run of whitespace, including a full-width space, as
    one separator, so repeated or leading spaces no longer produce empty
    halves.

    Raises:
        ValueError: If two non-empty halves cannot be found.
    """
    halves = couplet.split('，')
    if len(halves) != 2:
        halves = couplet.split()
    if len(halves) < 2:
        raise ValueError(f"Cannot split couplet into two halves: {couplet!r}")
    upper, lower = halves[0].strip(), halves[1].strip()
    if not upper or not lower:
        raise ValueError(f"Couplet has an empty half: {couplet!r}")
    return upper, lower


# Predict the word segmentation tags
upper_half_text = []
lower_half_text = []
for string in text:
    upper, lower = _split_couplet(string)
    upper_half_text.append(upper)
    lower_half_text.append(lower)
survey_df = pd.DataFrame({'Column1': upper_half_text,
                          'Column2': lower_half_text})
upper_half = predict_tags_batch(upper_half_text, model, tokenizer, batch_size=32)
lower_half = predict_tags_batch(lower_half_text, model, tokenizer, batch_size=32)
survey_df['Column1_Tag'] = upper_half
survey_df['Column2_Tag'] = lower_half

# Compute cosine similarity
def cosine_similarity(row):
    """Return the cosine similarity of the two tag vectors stored in ``row``.

    Args:
        row: Mapping-like object (for example a DataFrame row) with
            "Column1_Tag" and "Column2_Tag" entries, each a sequence of numbers.

    Returns:
        float: ``dot(a, b) / (|a| * |b|)``. NaN is returned when the similarity
        is undefined: the vectors differ in length, are empty, or at least one
        of them has zero norm. Returning NaN rather than raising lets a
        row-wise ``DataFrame.apply`` finish and leaves the bad rows visible.
    """
    array1 = np.asarray(row["Column1_Tag"], dtype=float)
    array2 = np.asarray(row["Column2_Tag"], dtype=float)

    # np.dot would raise on mismatched lengths and abort the whole apply().
    if array1.shape != array2.shape or array1.size == 0:
        return float("nan")

    denominator = np.linalg.norm(array1) * np.linalg.norm(array2)
    # Avoid the 0/0 division (and its RuntimeWarning) for all-zero vectors.
    if denominator == 0:
        return float("nan")

    return np.dot(array1, array2) / denominator
# Score every couplet: apply(axis=1) passes each row of survey_df to
# cosine_similarity and stores the result in a new column.
survey_df["cosine_similarity"] = survey_df.apply(cosine_similarity, axis=1)

# Write the texts, predicted tags and similarity scores to disk. to_csv is called
# with its default arguments, so the DataFrame index is written as well.
survey_df.to_csv('word_segment.csv')