# 数据导入

In [76]:
import pandas as pd

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
solution = pd.read_csv('./data/solution.csv')

# 数据预处理



### 删去中文列

In [77]:
del train['title1_zh']
del train['title2_zh']

train.head()

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label
0,0,0,1,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,3,2,3,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
2,1,2,4,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated
3,2,2,5,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP topped Hong Kong last year? She...,unrelated
4,9,6,7,"""How to discriminate oil from gutter oil by me...",It took 30 years of cooking oil to know that o...,agreed


In [78]:

del test['title1_zh']
del test['title2_zh']

test.head()

Unnamed: 0,id,tid1,tid2,title1_en,title2_en
0,321187,167562,59521,egypt 's presidential election failed to win m...,Lyon! Lyon officials have denied that Felipe F...
1,321190,167564,91315,A message from Saddam Hussein after he was cap...,The Top 10 Americans believe that the Lizard M...
2,321189,167563,167564,Will the United States wage war on Iraq withou...,A message from Saddam Hussein after he was cap...
3,321193,167564,160994,A message from Saddam Hussein after he was cap...,The hanging Saddam is a surrogate? This man's ...
4,321191,167564,15084,A message from Saddam Hussein after he was cap...,Chinese loquat loquat plaster in America? Pure...


### 合并测试集

In [79]:
solution.rename(columns={'Id': 'id'}, inplace=True)
solution.rename(columns={'Expected': 'label_encoded'}, inplace=True)

test_merged = pd.merge(test, solution, on='id')

test_merged.head()

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label_encoded,Weight,Usage
0,321187,167562,59521,egypt 's presidential election failed to win m...,Lyon! Lyon officials have denied that Felipe F...,unrelated,0.0625,Private
1,321190,167564,91315,A message from Saddam Hussein after he was cap...,The Top 10 Americans believe that the Lizard M...,unrelated,0.0625,Public
2,321189,167563,167564,Will the United States wage war on Iraq withou...,A message from Saddam Hussein after he was cap...,unrelated,0.0625,Private
3,321193,167564,160994,A message from Saddam Hussein after he was cap...,The hanging Saddam is a surrogate? This man's ...,unrelated,0.0625,Public
4,321191,167564,15084,A message from Saddam Hussein after he was cap...,Chinese loquat loquat plaster in America? Pure...,unrelated,0.0625,Public


### 文本清理

1. 特殊符号

In [80]:
# 若字符串中全是特殊符号，则删除该行

import re

def is_special(s, threshold=0.4):
    non_alnum_chars = re.findall(r'[^a-zA-Z0-9\s]', s)
    non_alnum_ratio = len(non_alnum_chars) / len(s)
    
    return non_alnum_ratio > threshold

special_1 = train['title1_en'].apply(is_special)
special_2 = train['title2_en'].apply(is_special)
train = train[~special_1 & ~special_2]

special_1 = test_merged['title1_en'].apply(is_special)
special_2 = test_merged['title2_en'].apply(is_special)
test_merged = test_merged[~special_1 & ~special_2]

2. 重复值

In [81]:
# 若字符串中有重复10次以上的单词，词组，或是长串连续字符则删除该行

def is_repeated(s, min_repeats=6):
    char_pattern = r'(.)\1{' + str(min_repeats - 1) + ',}'
    phase_pattern = r'\b(\w+\s?\w*)\b(?:\W+\1\b){' + str(min_repeats - 1) + ',}'
    word_pattern = r'\b(\w+)\b(?:.*?\b\1\b){' + str(min_repeats - 1) + ',}'
    return bool(re.search(r'\b(\w+\s?\w*)\b(?:\W+\1\b){' + str(min_repeats) + ',}', s)) or bool(re.search(char_pattern, s)) or bool(re.search(phase_pattern, s)) or bool(re.search(word_pattern, s))

repeated_1 = train['title1_en'].apply(is_repeated)
repeated_2 = train['title2_en'].apply(is_repeated)
train = train[~repeated_1 & ~repeated_2]

repeated_1 = test_merged['title1_en'].apply(is_repeated)
repeated_2 = test_merged['title2_en'].apply(is_repeated)
test_merged = test_merged[~repeated_1 & ~repeated_2]

3. UNK

In [82]:
# 发现数据中有个别行中存在大量“UNK”，大概是由于使用模型翻译时词典中找不到适配的词汇导致的
# 为了防止这些样本对模型训练产生干扰，我们可以将这些样本所在行删除
# 样本中出现多于8个“UNK”的行将被删除

mask_1 = train['title1_en'].apply(lambda x: x.lower().split().count('unk') > 8)
mask_2 = train['title2_en'].apply(lambda x: x.lower().split().count('unk') > 8)
train = train[~mask_1 & ~mask_2]

mask_1 = test_merged['title1_en'].apply(lambda x: x.lower().split().count('unk') > 8)
mask_2 = test_merged['title2_en'].apply(lambda x: x.lower().split().count('unk') > 8)
test_merged = test_merged[~mask_1 & ~mask_2]

4. 去除标点符号

In [83]:
def remove_punctuation(x):
    x = re.sub(r'[^\w\s]','',x)
    return x

5. 转成小写

In [84]:
def to_lowercase(x):
    return x.lower()

6. 去除停用词

In [93]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

7. 去除多余空格

In [86]:
def remove_extra_spaces(text):
    return re.sub(r'\s+', ' ', text).strip()

8. 整合步骤并应用（步骤4，5，6，7）

In [87]:
def clean_text(text):
    text = remove_punctuation(text)
    text = to_lowercase(text)
    text = remove_stopwords(text)
    text = remove_extra_spaces(text)
    return text

train['title1_en'] = train['title1_en'].apply(clean_text)
train['title2_en'] = train['title2_en'].apply(clean_text)

train.head()

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label
0,0,0,1,two new oldage insurance benefits old people r...,police disprove birds nest congress person get...,unrelated
1,3,2,3,come shenzhen sooner later son also come less ...,shenzhens gdp outstrips hong kong shenzhen sta...,unrelated
3,2,2,5,come shenzhen sooner later son also come less ...,shenzhens gdp topped hong kong last year shenz...,unrelated
4,9,6,7,discriminate oil gutter oil means garlic,took 30 years cooking oil know one piece garli...,agreed
5,4,2,8,come shenzhen sooner later son also come less ...,shenzhens gdp overtakes hong kong bureau stati...,unrelated


In [88]:
test_merged['title1_en'] = test_merged['title1_en'].apply(clean_text)
test_merged['title2_en'] = test_merged['title2_en'].apply(clean_text)

test_merged.head()

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label_encoded,Weight,Usage
0,321187,167562,59521,egypt presidential election failed win million...,lyon lyon officials denied felipe federico joi...,unrelated,0.0625,Private
1,321190,167564,91315,message saddam hussein captured,top 10 americans believe lizard man controls u...,unrelated,0.0625,Public
2,321189,167563,167564,united states wage war iraq without destructio...,message saddam hussein captured,unrelated,0.0625,Private
3,321193,167564,160994,message saddam hussein captured,hanging saddam surrogate mans move destroy dou...,unrelated,0.0625,Public
4,321191,167564,15084,message saddam hussein captured,chinese loquat loquat plaster america pure rumor,unrelated,0.0625,Public


### 标签编码

In [89]:
from sklearn.preprocessing import LabelEncoder

'''
agreed = 0
disagreed = 1
unrelated = 2
'''

label_encoder = LabelEncoder()

train['label_encoded'] = label_encoder.fit_transform(train['label'])

train.head()

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label,label_encoded
0,0,0,1,two new oldage insurance benefits old people r...,police disprove birds nest congress person get...,unrelated,2
1,3,2,3,come shenzhen sooner later son also come less ...,shenzhens gdp outstrips hong kong shenzhen sta...,unrelated,2
3,2,2,5,come shenzhen sooner later son also come less ...,shenzhens gdp topped hong kong last year shenz...,unrelated,2
4,9,6,7,discriminate oil gutter oil means garlic,took 30 years cooking oil know one piece garli...,agreed,0
5,4,2,8,come shenzhen sooner later son also come less ...,shenzhens gdp overtakes hong kong bureau stati...,unrelated,2


In [90]:
solution['label_encoded'] = label_encoder.fit_transform(solution['label_encoded'])

solution.head()

Unnamed: 0,id,label_encoded,Weight,Usage
0,347448,2,0.0625,Public
1,347449,2,0.0625,Private
2,359100,2,0.0625,Public
3,359101,2,0.0625,Private
4,359102,2,0.0625,Private


In [91]:
test_merged['label_encoded'] = label_encoder.fit_transform(test_merged['label_encoded'])

test_merged.head()

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label_encoded,Weight,Usage
0,321187,167562,59521,egypt presidential election failed win million...,lyon lyon officials denied felipe federico joi...,2,0.0625,Private
1,321190,167564,91315,message saddam hussein captured,top 10 americans believe lizard man controls u...,2,0.0625,Public
2,321189,167563,167564,united states wage war iraq without destructio...,message saddam hussein captured,2,0.0625,Private
3,321193,167564,160994,message saddam hussein captured,hanging saddam surrogate mans move destroy dou...,2,0.0625,Public
4,321191,167564,15084,message saddam hussein captured,chinese loquat loquat plaster america pure rumor,2,0.0625,Public


### 已预处理数据导出

In [92]:
train.to_csv('./data/train_cleaned.csv', index=False)
test_merged.to_csv('./data/test_cleaned.csv', index=False)

### 已预处理数据导入

In [60]:
import pandas as pd

train_total = pd.read_csv('./data/train_cleaned.csv')
test_merged_total = pd.read_csv('./data/test_cleaned.csv')

### 随机抽样缩小原数据集

In [61]:
train = train_total.sample(frac=0.1, random_state=42).reset_index(drop=True)
test_merged = test_merged_total.sample(frac=0.1, random_state=42).reset_index(drop=True)

# 文本向量化

### 加载数据集（字典化）

In [62]:
from torch.utils.data import Dataset

class AFQMC(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)
    
    def load_data(self, data_file):
        data_file = data_file.reset_index(drop=True)
        Data = data_file.to_dict(orient='index')
        return Data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if idx not in self.data:
            raise KeyError(f"Key {idx} not found in dataset")
        return self.data[idx]

train_dict = AFQMC(train)
test_merged_dict = AFQMC(test_merged)

print(train_dict[0])
print(test_merged_dict[0])

{'id': 275485, 'tid1': 153996, 'tid2': 154013, 'title1_en': 'longer white hair grows dont dye wash washing make hair look black shiny 3 days', 'title2_en': 'yaos exhusband ling soosus high profile comeback revealed reasons divorce', 'label': 'unrelated', 'label_encoded': 2}
{'id': 379653, 'tid1': 186512, 'tid2': 186513, 'title1_en': 'ship hit meteor rock crashed another world astronauts kept pets aliens', 'title2_en': 'alien ship crashed ancient vikings fell knees thought god coming', 'label_encoded': 2, 'Weight': 0.0625, 'Usage': 'Private'}


In [63]:
# 如果数据集非常巨大，难以一次性加载到内存中，我们也可以继承 IterableDataset 类构建迭代型数据集

# from torch.utils.data import IterableDataset
# import json

# class IterableAFQMC(IterableDataset):
#     def __init__(self, data_file):
#         self.data_file = data_file

#     def __iter__(self):
#         df = self.data_file
#         for _, row in df.iterrows():
#             sample = row.to_dict()
#             yield sample


# try:
#     train_dict = IterableAFQMC(traNin)
#     print(next(iter(train_dict)))
# except Exception as e:
#     print(f"Error: {e}")

### 向量化处理

In [64]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

checkpoint = "google-bert/bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collote_fn(batch_samples):
    try:
        batch_sentence_1, batch_sentence_2 = [], []
        batch_label = []
        batch_weights = []
        for sample in batch_samples:
            batch_sentence_1.append(sample['title1_en'])
            batch_sentence_2.append(sample['title2_en'])
            batch_label.append(int(sample['label_encoded']))
            if 'Weight' in sample:
                batch_weights.append(float(sample['Weight']))
            else:
                batch_weights.append(1.0)

        X = tokenizer(
            batch_sentence_1, 
            batch_sentence_2, 
            padding=True, 
            truncation=True, 
            return_tensors="pt"
        )

        y = torch.tensor(batch_label)
        w = torch.tensor(batch_weights)

        if 'Weight' in sample:
            return X, y, w
        else:
            return X, y
    except Exception as e:
        print(f"Error in collote_fn: {e}")
        raise

train_dataloader = DataLoader(train_dict, batch_size=2, shuffle=True, collate_fn=collote_fn)
valid_dataloader= DataLoader(test_merged_dict, batch_size=2, shuffle=True, collate_fn=collote_fn)


batch_X, batch_y = next(iter(train_dataloader))
print('batch_X shape:', {k: v.shape for k, v in batch_X.items()})
print('batch_y shape:', batch_y.shape)

batch_X shape: {'input_ids': torch.Size([2, 37]), 'token_type_ids': torch.Size([2, 37]), 'attention_mask': torch.Size([2, 37])}
batch_y shape: torch.Size([2])




In [65]:
batch_X1, batch_y1, batch_w1 = next(iter(valid_dataloader))
print('batch_X shape:', {k: v.shape for k, v in batch_X1.items()})
print('batch_y shape:', batch_y1.shape)
print(batch_w1.shape)

batch_X shape: {'input_ids': torch.Size([2, 48]), 'token_type_ids': torch.Size([2, 48]), 'attention_mask': torch.Size([2, 48])}
batch_y shape: torch.Size([2])
torch.Size([2])


# 训练模型

### 构建模型

In [66]:
import torch
import torch.nn as nn
from torch.nn import Transformer

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
print(f'Using {device} device')

class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, nhead, num_encoder_layers, dim_feedforward, num_classes):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(input_dim, dim_feedforward)
        self.transformer = Transformer(
            d_model=dim_feedforward,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=6,
            dim_feedforward=dim_feedforward,
            activation='relu',
            batch_first=True
        )
        self.fc = nn.Linear(dim_feedforward, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, src):
        src_tensor = src['input_ids']
        embedded = self.embedding(src_tensor)
        transformer_output = self.transformer(embedded, embedded)
        pooled_output = transformer_output.mean(dim=1)
        dropout_output = self.dropout(pooled_output)
        logits = self.fc(dropout_output)
        return logits

input_dim = 30000  
nhead = 8
num_encoder_layers = 6
dim_feedforward = 512
num_classes = 3  # 3 classes: agreed, disagreed, unrelated

model = TransformerClassifier(input_dim, nhead, num_encoder_layers, dim_feedforward, num_classes)
print(model)

Using cpu device
TransformerClassifier(
  (embedding): Embedding(30000, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-5): 6 x Transf

### 输出结构

In [67]:
# inputs = {key: value.to(device) for key, value in batch_X.items()}
# outputs = model(inputs)  # 使用GPU时统一设备

outputs = model(batch_X)
print(outputs.shape)

torch.Size([2, 3])


### 训练步骤数

In [68]:
from transformers import get_scheduler
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5, no_deprecation_warning=True)
epochs = 10
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

158190


### 训练，测试准备




#### 训练循环

In [69]:
from tqdm.auto import tqdm

def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f'loss: {0:>7f}')
    finish_step_num = (epoch-1)*len(dataloader)
    
    model.train()
    for step, (X, y) in enumerate(dataloader, start=1):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f'loss: {total_loss/(finish_step_num + step):>7f}')
        progress_bar.update(1)
    return total_loss

#### 测试循环

In [70]:
import numpy as np

def test_loop(dataloader, model, mode='Valid'):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    all_sample_weights = []

    with torch.no_grad():
        for batch in dataloader:
            try:
                inputs = batch[0].to(device)
                labels = batch[1].to(device)
                sample_weights = batch[2].to(device)

                outputs = model(inputs)
                loss_fn = nn.CrossEntropyLoss()
                loss = loss_fn(outputs, labels)

                total_loss += loss.item()

                all_predictions.append(outputs.argmax(dim=1).cpu().numpy())
                all_labels.append(labels.cpu().numpy())
                all_sample_weights.append(sample_weights.cpu().numpy())
            except Exception as e:
                print("Error encountered:", e)
                print("Problematic batch:", batch)
                break                                 

    all_predictions = np.concatenate(all_predictions)
    all_labels = np.concatenate(all_labels)
    all_sample_weights = np.concatenate(all_sample_weights)

    avg_loss = total_loss / len(dataloader)

    return avg_loss, all_predictions, all_labels, all_sample_weights


#### 加权准确率函数

In [71]:
def weighted_categorization_accuracy(predictions, labels, sample_weights):
    correct_predictions = (predictions == labels).astype(int)
    weighted_correct_predictions = correct_predictions * sample_weights
    
    weighted_accuracy = np.sum(weighted_correct_predictions) / np.sum(sample_weights)
    
    return weighted_accuracy

### 导入原最佳模型权重

In [72]:
model.load_state_dict(torch.load(f'./transformer_weights/epoch_3_valid_acc_64.3_transformer_weights.bin'))
model.eval()

TransformerClassifier(
  (embedding): Embedding(30000, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerDecoderLayer

### 训练执行

In [74]:
from transformers import AdamW, get_scheduler

learning_rate = 1e-5
epoch_num = 7

loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

total_loss = 0.
patience = 2
trigger_times = 0
best_acc = 0.
best_loss = float("inf")
try:
    for t in range(epoch_num):
        print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
        total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)

        val_loss, val_predictions, val_labels, val_sample_weights = test_loop(valid_dataloader, model, mode='Valid')

        valid_acc = weighted_categorization_accuracy(val_predictions, val_labels, val_sample_weights)
        print(f"Weighted Categorization Accuracy: {(100*valid_acc):0.2f}%")
        print(f"Validation Loss: {val_loss:>7f}")

        if valid_acc > best_acc:
            best_acc = valid_acc
            print('saving new weights...\n')
            torch.save(model.state_dict(), f'./transformer_weights/epoch_{t+1}_valid_acc_{(100*valid_acc):0.1f}_transformer_weights.bin')
            model.load_state_dict(torch.load(f'./transformer_weights/epoch_{t+1}_valid_acc_{(100*valid_acc):0.1f}_transformer_weights.bin'))

            
        #早停（如果在某个epoch中模型在验证集上的性能不再提升，可以提前停止训练）
        if val_loss < best_loss:
            best_loss = val_loss
            trigger_times = 0
        else:
            trigger_times += 1

        if trigger_times >= patience:
            print("Early stopping!")
            break
except Exception as e:
    print(f"Error encountered: {e}")
    torch.save(model.state_dict(), './transformer_weights/transformer_weights_on_error.bin')

print("Done!")

Epoch 1/7
-------------------------------


  0%|          | 0/15819 [00:00<?, ?it/s]

Weighted Categorization Accuracy: 63.63%
Validation Loss: 0.614431
saving new weights...

Epoch 2/7
-------------------------------


  0%|          | 0/15819 [00:00<?, ?it/s]

Weighted Categorization Accuracy: 64.72%
Validation Loss: 0.619105
saving new weights...

Epoch 3/7
-------------------------------


  0%|          | 0/15819 [00:00<?, ?it/s]

Weighted Categorization Accuracy: 68.87%
Validation Loss: 0.594560
saving new weights...

Epoch 4/7
-------------------------------


  0%|          | 0/15819 [00:00<?, ?it/s]

Weighted Categorization Accuracy: 68.13%
Validation Loss: 0.668118
Epoch 5/7
-------------------------------


  0%|          | 0/15819 [00:00<?, ?it/s]

Weighted Categorization Accuracy: 66.69%
Validation Loss: 0.818699
Early stopping!
Done!


### 加权得分计算

In [75]:
val_loss, val_predictions, val_labels, val_sample_weights = test_loop(valid_dataloader, model, mode='Test')

valid_acc = weighted_categorization_accuracy(val_predictions, val_labels, val_sample_weights)

print(f"Weighted Categorization Accuracy: {(100*valid_acc):0.2f}%")

Weighted Categorization Accuracy: 66.94%
