In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer

# Load and preprocess data
def parse_file(file_path):
    """Read a comma-separated label file into a DataFrame.

    Each usable line has the form ``theme,emotions,poetry``. The first two
    fields may hold several labels joined by ``#``; everything after the
    second comma is kept as the poem text, commas included. Lines with
    fewer than three comma-separated fields are skipped.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``pandas.DataFrame`` built from one record per kept line, with the
        keys ``theme`` (list of str), ``emotions`` (list of str) and
        ``poetry`` (str, surrounding whitespace removed).
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # Split at most twice so the poem keeps its own commas intact.
            fields = raw_line.strip().split(',', 2)
            if len(fields) < 3:
                continue
            theme_field, emotion_field, poem = fields
            records.append({
                "theme": theme_field.split('#'),
                "emotions": emotion_field.split('#'),
                "poetry": poem.strip(),
            })
    return pd.DataFrame(records)

# Location of the labelled poetry corpus.
# NOTE(review): absolute, machine-specific path; the notebook only runs as-is on this machine.
file_path = 'C:\\Users\\86152\\Desktop\\Classical Chinese poetry_with_labels.txt'
data = parse_file(file_path)

# Distinct theme / emotion labels that occur anywhere in the corpus
themes = {label for labels in data['theme'] for label in labels}
emotions = {label for labels in data['emotions'] for label in labels}

# One multi-hot encoder per label family, fitted on the per-poem label lists
theme_binarizer = MultiLabelBinarizer().fit(data['theme'])
emotion_binarizer = MultiLabelBinarizer().fit(data['emotions'])


In [2]:
class PoetryDataset(Dataset):
    """Dataset of tokenized poems with multi-hot theme and emotion labels.

    Args:
        data: DataFrame providing the ``poetry``, ``theme`` and ``emotions``
            columns that ``__getitem__`` reads.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``;
            its result must expose ``input_ids`` and ``attention_mask``.
        max_len: Length passed to the tokenizer as ``max_length``.
        theme_binarizer: Object whose ``transform`` turns a list of label
            lists into multi-hot rows for themes.
        emotion_binarizer: Same as ``theme_binarizer``, for emotions.
    """

    def __init__(self, data, tokenizer, max_len, theme_binarizer, emotion_binarizer):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.theme_binarizer = theme_binarizer
        self.emotion_binarizer = emotion_binarizer

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded poem and its label vectors for row ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (tokenizer output
            with the leading batch axis removed) and ``theme_labels`` /
            ``emotion_labels`` (float tensors built from the binarizer rows).
        """
        # Look the row up once instead of once per column.
        row = self.data.iloc[index]
        themes = self.theme_binarizer.transform([row['theme']])[0]
        emotions = self.emotion_binarizer.transform([row['emotions']])[0]

        encoding = self.tokenizer(
            row['poetry'],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops only the batch axis; a bare squeeze() would
            # also drop the sequence axis when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'theme_labels': torch.tensor(themes, dtype=torch.float),
            'emotion_labels': torch.tensor(emotions, dtype=torch.float),
        }


In [3]:
# Sequence length and batch size used by the dataset and loader below
MAX_LEN = 128
BATCH_SIZE = 16

# Tokenizer loaded from a local snapshot directory.
# NOTE(review): absolute, machine-specific path.
tokenizer = BertTokenizer.from_pretrained("C:\\Users\\86152\\.cache\\huggingface\\hub\\models--SIKU-BERT--sikubert\\snapshots\\fc656de2d6bde33919102dd3abe31c843f42226a")

# Wrap the corpus in a Dataset and serve it in shuffled mini-batches
train_dataset = PoetryDataset(
    data=data,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    theme_binarizer=theme_binarizer,
    emotion_binarizer=emotion_binarizer,
)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)


In [4]:
class MultiLabelBERT(torch.nn.Module):
    """BERT encoder feeding two independent linear heads (themes, emotions).

    Args:
        model_name: Identifier or path handed to ``from_pretrained``.
        num_theme_labels: Output width of the theme head.
        num_emotion_labels: Output width of the emotion head.
    """

    def __init__(self, model_name, num_theme_labels, num_emotion_labels):
        super().__init__()
        # Loaded through the sequence-classification wrapper with num_labels=0;
        # forward() only calls the inner encoder (self.bert.bert), so the
        # wrapper's own classifier head is never used.
        self.bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=0)
        hidden_size = self.bert.config.hidden_size
        self.dropout = torch.nn.Dropout(0.3)
        self.theme_classifier = torch.nn.Linear(hidden_size, num_theme_labels)
        self.emotion_classifier = torch.nn.Linear(hidden_size, num_emotion_labels)

    def forward(self, input_ids, attention_mask):
        """Return ``(theme_logits, emotion_logits)`` for a batch of token ids."""
        encoder_out = self.bert.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is used as the sentence representation
        # (presumably the pooled output rather than the raw [CLS] state; confirm
        # against the transformers docs). Both heads share it after dropout.
        features = self.dropout(encoder_out[1])
        return self.theme_classifier(features), self.emotion_classifier(features)


In [5]:
def train(model, dataloader, optimizer, loss_fn, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(theme_logits, emotion_logits)``.
        dataloader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask``, ``theme_labels`` and
            ``emotion_labels`` tensors. It does not need to support ``len()``.
        optimizer: Optimizer stepping the model's parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of per-batch losses (theme loss + emotion loss) divided by the
        number of batches seen, or ``0.0`` when the iterable yields no batch.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        theme_labels = batch['theme_labels'].to(device)
        emotion_labels = batch['emotion_labels'].to(device)

        optimizer.zero_grad()
        theme_logits, emotion_logits = model(input_ids, attention_mask)

        # Both heads contribute equally to the objective.
        theme_loss = loss_fn(theme_logits, theme_labels)
        emotion_loss = loss_fn(emotion_logits, emotion_labels)
        loss = theme_loss + emotion_loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Count batches ourselves: avoids ZeroDivisionError on an empty loader and
    # works for iterables that do not define __len__.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches


In [6]:
from torch.optim import AdamW

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output unit per label known to each binarizer.
# NOTE(review): absolute, machine-specific snapshot path.
model = MultiLabelBERT(
    "C:\\Users\\86152\\.cache\\huggingface\\hub\\models--SIKU-BERT--sikubert\\snapshots\\fc656de2d6bde33919102dd3abe31c843f42226a",
    num_theme_labels=len(theme_binarizer.classes_),
    num_emotion_labels=len(emotion_binarizer.classes_),
)
model.to(device)

# AdamW over all parameters; sigmoid + binary cross-entropy per label
optimizer = AdamW(params=model.parameters(), lr=2e-5)
loss_fn = torch.nn.BCEWithLogitsLoss()

# Fine-tune for a fixed number of passes, reporting the mean loss of each
EPOCHS = 5
for epoch in range(EPOCHS):
    avg_loss = train(
        model=model,
        dataloader=train_dataloader,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device=device,
    )
    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {avg_loss:.4f}")


  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at C:\Users\86152\.cache\huggingface\hub\models--SIKU-BERT--sikubert\snapshots\fc656de2d6bde33919102dd3abe31c843f42226a and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.3645
Epoch 2/5, Loss: 0.2108
Epoch 3/5, Loss: 0.1552
Epoch 4/5, Loss: 0.1167
Epoch 5/5, Loss: 0.0900


In [13]:
def predict(model, tokenizer, text, theme_binarizer, emotion_binarizer, device,
            threshold=0.5, max_len=None):
    """Predict every theme and emotion whose probability exceeds ``threshold``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(theme_logits, emotion_logits)``.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``
            and exposes ``input_ids`` / ``attention_mask``.
        text: Text to classify.
        theme_binarizer: Object whose ``inverse_transform`` maps a boolean
            indicator matrix back to theme labels.
        emotion_binarizer: Same as ``theme_binarizer``, for emotions.
        device: Device the encoded inputs are moved to.
        threshold: Sigmoid probability a label must exceed to be reported.
            Defaults to 0.5.
        max_len: Padding/truncation length for the tokenizer. Defaults to the
            notebook-level ``MAX_LEN`` when left as ``None``.

    Returns:
        ``(themes, emotions)`` as returned by the binarizers'
        ``inverse_transform``; either may be empty when no label passes the
        threshold.
    """
    if max_len is None:
        # Fall back to the global used for training so existing calls behave as before.
        max_len = MAX_LEN

    # Tokenize and move inputs to the same device as the model
    encoding = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_len,
        padding="max_length",
        truncation=True
    ).to(device)

    # Leaves the model in eval mode.
    model.eval()
    with torch.no_grad():
        theme_logits, emotion_logits = model(
            encoding['input_ids'],
            encoding['attention_mask']
        )
        # Independent per-label decision: each label is kept or dropped on its own.
        themes = (torch.sigmoid(theme_logits) > threshold).cpu().numpy()
        emotions = (torch.sigmoid(emotion_logits) > threshold).cpu().numpy()

    return theme_binarizer.inverse_transform(themes), emotion_binarizer.inverse_transform(emotions)


In [14]:
# Classify one complete poem (title, then body) and show the predicted labels
text = "送胡昌世比部转饷辽阳便道归五岭,夜夜旄头照海天，辽阳杀气镇相缠。枕戈谁念三军苦，转饷俄腾万灶烟。塞雪近关看破腊，岭梅迎马报新年。相如拥传多光彩，岂羡乘槎到日边。。"
themes, emotions = predict(
    model=model,
    tokenizer=tokenizer,
    text=text,
    theme_binarizer=theme_binarizer,
    emotion_binarizer=emotion_binarizer,
    device=device,
)
print(f"Themes: {themes}")
print(f"Emotions: {emotions}")

Themes: [('战争',)]
Emotions: [('想家',)]


In [9]:
# Classify the two halves of one couplet separately
sen1, sen2 = '红梅映雪千家瑞', '赤县迎春百业新'
themes1, emotions1 = predict(
    model=model, tokenizer=tokenizer, text=sen1,
    theme_binarizer=theme_binarizer, emotion_binarizer=emotion_binarizer, device=device,
)
themes2, emotions2 = predict(
    model=model, tokenizer=tokenizer, text=sen2,
    theme_binarizer=theme_binarizer, emotion_binarizer=emotion_binarizer, device=device,
)
print(f"Sen1: {themes1}&{emotions1}")
print(f"Sen2: {themes2}&{emotions2}")

Sen1: [('咏物',)]&[()]
Sen2: [()]&[()]


In [15]:
def predict(model, tokenizer, text, theme_binarizer, emotion_binarizer, device,
            max_len=None):
    """Predict the single most likely theme and emotion for ``text``.

    NOTE: this definition reuses the name ``predict`` and therefore replaces
    the threshold-based version defined earlier in the notebook; unlike that
    one it always returns exactly one label per head.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(theme_logits, emotion_logits)``.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``
            and exposes ``input_ids`` / ``attention_mask``.
        text: Text to classify.
        theme_binarizer: Object whose ``classes_`` sequence is indexed by the
            winning theme position.
        emotion_binarizer: Same as ``theme_binarizer``, for emotions.
        device: Device the encoded inputs are moved to.
        max_len: Padding/truncation length for the tokenizer. Defaults to the
            notebook-level ``MAX_LEN`` when left as ``None``.

    Returns:
        ``(themes, emotions)``: two lists holding one label per input row.
    """
    if max_len is None:
        # Fall back to the global used for training so existing calls behave as before.
        max_len = MAX_LEN

    # Tokenize and move inputs to the same device as the model
    encoding = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_len,
        padding="max_length",
        truncation=True
    ).to(device)

    # Leaves the model in eval mode.
    model.eval()
    with torch.no_grad():
        theme_logits, emotion_logits = model(
            encoding['input_ids'],
            encoding['attention_mask']
        )
        # Sigmoid is monotonic, so the arg-max of the logits is also the
        # label with the largest probability.
        theme_index = torch.argmax(theme_logits, dim=-1).cpu().numpy()
        emotion_index = torch.argmax(emotion_logits, dim=-1).cpu().numpy()

    # Map indices back to labels using binarizer's classes
    themes = [theme_binarizer.classes_[idx] for idx in theme_index]
    emotions = [emotion_binarizer.classes_[idx] for idx in emotion_index]

    return themes, emotions


In [17]:
# First lines of the couplets to classify
sen_in = [
    '红梅映雪千家瑞',
    '花明柳媚春光好',
    '水无两点不成冰',
    '庄生梦蝶知无我',
    '秋风送爽花正艳',
    '黄河东去流不息',
    '浪遏飞舟留客住',
    '黄莺日日盼新岁',
]
# Matching second lines, in the same order as sen_in
sen_out = [
    '赤县迎春百业新',
    '大江南北庆丰收',
    '王不出头谁是主',
    '晏子分桃为有他',
    '雨打屋檐人未归',
    '庭前落叶扫无痕',
    '风吹垂柳赋情来',
    '绿柳枝枝辞旧年',
]
# Empty accumulators for the per-line predictions
theme_in, theme_out, emotion_in, emotion_out = [], [], [], []

In [18]:
# Start from empty accumulators every time this cell runs; otherwise a re-run
# appends a second copy of every prediction and the result lists no longer
# line up with sen_in / sen_out.
theme_in, theme_out = [], []
emotion_in, emotion_out = [], []

# The two lists are paired position by position, so they must be equally long.
if len(sen_in) != len(sen_out):
    raise ValueError(
        f"sen_in and sen_out must have the same length, got {len(sen_in)} and {len(sen_out)}"
    )

# Predict theme and emotion for both halves of each couplet
for sen1, sen2 in zip(sen_in, sen_out):
    theme1, emotion1 = predict(model, tokenizer, sen1, theme_binarizer, emotion_binarizer, device)
    theme2, emotion2 = predict(model, tokenizer, sen2, theme_binarizer, emotion_binarizer, device)
    theme_in.append(theme1)
    emotion_in.append(emotion1)
    theme_out.append(theme2)
    emotion_out.append(emotion2)

In [19]:
import pandas as pd
# Collect the couplets and their predictions into one table, one row per couplet.
# The dict is passed inline rather than bound to `data`, so the training
# DataFrame `data` loaded at the top of the notebook is not overwritten and the
# Dataset cell stays re-runnable.
df = pd.DataFrame({
    'in': sen_in,
    'out': sen_out,
    'theme_in': theme_in,
    'theme_out': theme_out,
    'emotion_in': emotion_in,
    'emotion_out': emotion_out,
})

In [20]:
# Show the prediction table (one row per couplet) as the cell output
df

Unnamed: 0,in,out,theme_in,theme_out,emotion_in,emotion_out
0,红梅映雪千家瑞,赤县迎春百业新,[咏物],[怀古],[想家],[喜悦]
1,花明柳媚春光好,大江南北庆丰收,[咏物],[思乡],[想家],[喜悦]
2,水无两点不成冰,王不出头谁是主,[咏物],[战争],[喜悦],[喜悦]
3,庄生梦蝶知无我,晏子分桃为有他,[怀古],[怀古],[喜悦],[失意]
4,秋风送爽花正艳,雨打屋檐人未归,[送别],[田园],[想家],[想家]
5,黄河东去流不息,庭前落叶扫无痕,[思乡],[咏物],[想家],[喜悦]
6,浪遏飞舟留客住,风吹垂柳赋情来,[咏物],[咏物],[喜悦],[喜悦]
7,黄莺日日盼新岁,绿柳枝枝辞旧年,[咏物],[咏物],[喜悦],[想家]


In [21]:
import os
# Build the output path <home>/Desktop/theme_emotion.csv for the current user.
# NOTE(review): `file_path` was also the name of the input corpus path earlier
# in the notebook; after this cell it points at the CSV instead.
desktop = os.path.join(os.path.expanduser("~"), "Desktop")
file_path = os.path.join(desktop, "theme_emotion.csv")

# Save the DataFrame to a CSV file (without the index column)
df.to_csv(file_path, index=False)

In [11]:
# Classify the two halves of one couplet separately
sen1, sen2 = '红梅映雪千家瑞', '赤县迎春百业新'
themes1, emotions1 = predict(
    model=model, tokenizer=tokenizer, text=sen1,
    theme_binarizer=theme_binarizer, emotion_binarizer=emotion_binarizer, device=device,
)
themes2, emotions2 = predict(
    model=model, tokenizer=tokenizer, text=sen2,
    theme_binarizer=theme_binarizer, emotion_binarizer=emotion_binarizer, device=device,
)
print(f"Sen1: {themes1}&{emotions1}")
print(f"Sen2: {themes2}&{emotions2}")

Sen1: ['咏物']&['想家']
Sen2: ['怀古']&['喜悦']


In [12]:
# Classify the two halves of a second couplet separately
sen3 = '黄河东去流不息'
sen4 = '庭前落叶扫无痕'
themes3, emotions3 = predict(model, tokenizer, sen3, theme_binarizer, emotion_binarizer, device)
themes4, emotions4 = predict(model, tokenizer, sen4, theme_binarizer, emotion_binarizer, device)
# Labels match the variables being printed (they previously read "Sen1"/"Sen2").
print(f"Sen3: {themes3}&{emotions3}")
print(f"Sen4: {themes4}&{emotions4}")

Sen1: ['思乡']&['想家']
Sen2: ['咏物']&['喜悦']
