In [8]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [9]:
# Load the product table and normalise the two free-text columns so that
# missing values become empty strings.
data = pd.read_csv('processed_data.csv')
data['標籤'] = data['標籤'].fillna('').astype(str)
data['屬性'] = data['屬性'].fillna('').astype(str)

# Flatten every comma-separated '屬性' string into one list of trimmed,
# non-empty attribute names (duplicates are kept at this stage).
all_attributes = [
    piece.strip()
    for raw_attributes in data['屬性']
    for piece in raw_attributes.split(',')
    if piece.strip()
]

# De-duplicate and sort to get a stable attribute vocabulary.
unique_attributes = sorted(set(all_attributes))


# Hyperparameters and other tunable settings
class Config:
    """Namespace of tunable settings read by the rest of the notebook.

    All values are plain class attributes; the class is never instantiated.
    `CATEGORY_WEIGHTS` is built from the module-level `unique_attributes`
    list, which must therefore be defined before this class is executed.
    """

    MAX_LEN = 128                  # max token length passed to the training datasets
    BATCH_SIZE = 32                # batch size for both data loaders
    TRAIN_SPLIT = 0.8              # fraction of rows kept for training
    DROPOUT = 0.2                  # passed as `hidden_dropout_prob` to the model
    HIDDEN_SIZE = 768              # not referenced by the visible cells
    EPOCHS = 25                    # upper bound on training epochs
    LEARNING_RATE = 5e-5           # optimizer learning rate
    WEIGHT_DECAY = 0.001           # optimizer weight decay
    WARMUP_RATIO = 0.1             # fraction of total steps used for LR warm-up
    EARLY_STOPPING_PATIENCE = 3    # default patience for EarlyStopping
    MIN_DELTA = 0.001              # default minimum improvement for EarlyStopping

    # Multiplier applied to the embedding similarity of two products that
    # share an attribute. Every known attribute gets the same weight.
    DEFAULT_ATTRIBUTE_WEIGHT = 1.1

    # Built with dict.fromkeys instead of a class-body `for` loop so that the
    # loop variable does not leak into the class as a stray `Config.attr`.
    CATEGORY_WEIGHTS = dict.fromkeys(unique_attributes, DEFAULT_ATTRIBUTE_WEIGHT)

# Load the pretrained 'bert-base-chinese' tokenizer. The same instance is
# passed to the training datasets and used later for vector extraction.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [10]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: indexable sequence of input strings.
        labels: indexable sequence of integer class ids, aligned with `texts`.
        tokenizer: callable tokenizer; it is invoked with the keyword
            arguments shown in `__getitem__` and must return a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_len: length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example `idx` and return it as a dict of tensors.

        Returns:
            dict with keys 'text' (the raw string), 'input_ids' and
            'attention_mask' (1-D tensors), and 'labels' (scalar long tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in
        # transformers in favour of `__call__`, which get_product_vector
        # in this notebook already uses.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            # so the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Device selection
def get_device():
    """Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.

    Returns:
        torch.device for the first available backend in that order.
    """
    if torch.cuda.is_available():
        backend = "cuda"
    else:
        # Older torch builds have no `torch.backends.mps`, hence the hasattr guard.
        mps_ready = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        backend = "mps" if mps_ready else "cpu"
    return torch.device(backend)

# Resolve the device once and report it; GPU details are only printed for CUDA.
device = get_device()
print(f"使用設備: {device}")
if torch.cuda.is_available():
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU名稱: {torch.cuda.get_device_name(0)}")
    print(f"GPU記憶體: {total_gib:.2f} GB")

使用設備: mps


In [11]:
# Encode each distinct '屬性' string as an integer class id and build the
# model input text from the attribute and tag columns.
# NOTE(review): the input text contains the same '屬性' string that is used
# as the classification target — confirm this is intended.
le = LabelEncoder().fit(data['屬性'])
data['label'] = le.transform(data['屬性'])
data['combined_features'] = data['屬性'] + ' ' + data['標籤']

# Hold out a validation split (fixed seed for repeatability).
train_data, val_data = train_test_split(
    data, test_size=1-Config.TRAIN_SPLIT, random_state=42
)

# Wrap both splits in the same dataset type with identical tokenizer settings.
train_dataset, val_dataset = (
    CustomDataset(
        texts=split['combined_features'].to_numpy(),
        labels=split['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=Config.MAX_LEN,
    )
    for split in (train_data, val_data)
)

# Training batches are shuffled and the last partial batch is dropped;
# validation keeps file order and every example.
train_dataloader = DataLoader(
    train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, drop_last=True
)
val_dataloader = DataLoader(
    val_dataset, batch_size=Config.BATCH_SIZE, shuffle=False
)

In [12]:
# Model setup: one output class per distinct encoded label.
num_labels = data['label'].nunique()
model = BertForSequenceClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=num_labels,
    hidden_dropout_prob=Config.DROPOUT
).to(device)

# Optimizer. `transformers.AdamW` is deprecated (and removed in recent
# transformers releases) in favour of the PyTorch implementation, so use
# `torch.optim.AdamW` directly. eps=1e-6 keeps the default of the old
# transformers implementation rather than PyTorch's 1e-8.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=Config.LEARNING_RATE,
    eps=1e-6,
    weight_decay=Config.WEIGHT_DECAY
)

# Learning-rate schedule: linear warm-up for the first WARMUP_RATIO of all
# optimizer steps, then linear decay. The step count assumes all EPOCHS run.
total_steps = len(train_dataloader) * Config.EPOCHS
warmup_steps = int(total_steps * Config.WARMUP_RATIO)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Early stopping
class EarlyStopping:
    """Signal when validation loss has stopped improving.

    Call the instance once per epoch with the validation loss. After
    `patience` calls in a row without an improvement of at least
    `min_delta` over the best loss seen, `should_stop` becomes True.
    """

    def __init__(self, patience=Config.EARLY_STOPPING_PATIENCE, min_delta=Config.MIN_DELTA):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0          # consecutive epochs without improvement
        self.best_loss = None     # lowest loss accepted as an improvement
        self.should_stop = False

    def __call__(self, val_loss):
        """Record one epoch's validation loss and update `should_stop`."""
        # The first observation only establishes the baseline.
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        threshold = self.best_loss - self.min_delta
        if val_loss > threshold:
            # Not enough improvement: count it and stop once patience runs out.
            self.counter += 1
            if self.counter >= self.patience:
                self.should_stop = True
            return

        # Improved by at least min_delta: new baseline, reset the streak.
        self.best_loss = val_loss
        self.counter = 0

early_stopping = EarlyStopping()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Training loop: one training pass and one validation pass per epoch, with
# best-checkpoint saving (by validation accuracy) and early stopping (by
# validation loss).
# NOTE(review): `best_loss` is initialised here but never read in this loop.
best_loss = float('inf')
best_accuracy = 0

for epoch in range(Config.EPOCHS):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    # Training phase
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{Config.EPOCHS} [Train]')
    for batch in progress_bar:
        optimizer.zero_grad()
        # Move every batch entry except the raw 'text' field to the device;
        # the remaining keys are passed to the model as keyword arguments.
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'text'}

        # 'labels' is part of `inputs`, and the loss is read from the output.
        outputs = model(**inputs)
        loss = outputs.loss

        # Running accuracy over the batches seen so far in this epoch.
        predictions = torch.argmax(outputs.logits, dim=1)
        correct_predictions += (predictions == inputs['labels']).sum().item()
        total_predictions += inputs['labels'].size(0)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # The scheduler is stepped once per batch, matching `total_steps`.
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        current_accuracy = correct_predictions / total_predictions
        current_lr = scheduler.get_last_lr()[0]
        progress_bar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'accuracy': f'{current_accuracy:.4f}',
            'lr': f'{current_lr:.2e}'
        })

    # Epoch-level training metrics: mean of per-batch losses and overall accuracy.
    train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_predictions

    # Validation phase
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        progress_bar = tqdm(val_dataloader, desc=f'Epoch {epoch+1}/{Config.EPOCHS} [Validation]')
        for batch in progress_bar:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'text'}
            outputs = model(**inputs)

            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=1)
            val_correct += (predictions == inputs['labels']).sum().item()
            val_total += inputs['labels'].size(0)

    # Mean of per-batch losses (each batch counts equally, whatever its size).
    val_loss = val_loss / len(val_dataloader)
    val_accuracy = val_correct / val_total

    print(f'\nEpoch {epoch+1}:')
    print(f'Training    - Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}')
    print(f'Validation  - Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}')
    # `current_lr` is the value left over from the last training batch.
    print(f'Learning Rate: {current_lr:.2e}')

    # Save the best model so far, judged by validation accuracy.
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_model1000.pt')
        print(f'Saved new best model with accuracy: {val_accuracy:.4f}')

    # Early-stopping check, judged by validation loss.
    early_stopping(val_loss)
    if early_stopping.should_stop:
        print("Early stopping triggered")
        break


[A
[A
Epoch 1/25 [Train]:   2%|▏         | 1/46 [02:28<1:51:06, 148.14s/it, loss=5.6686, accuracy=0.0312, lr=4.35e-07]


KeyboardInterrupt: 

In [9]:
# Load the best checkpoint.
# The training loop saves to the relative path 'best_model1000.pt', so load
# from the same relative path instead of the hard-coded '/content/...'
# location, which only exists when the working directory is /content.
# map_location places the weights on the active device even if the
# checkpoint was written on a different one (e.g. CUDA -> MPS/CPU).
model.load_state_dict(torch.load('best_model1000.pt', map_location=device))
model.eval()

# Product vector extraction
def get_product_vector(text, max_length=512):
    """Embed `text` with the fine-tuned encoder and return its first-token vector.

    The text is tokenized (truncated to `max_length` tokens), run through
    `model.bert` without gradient tracking, and the hidden state at token
    position 0 (the [CLS] position for BERT tokenizers) is returned.

    A single text is encoded per call, so no padding is requested: padding
    one sequence to a fixed 512 tokens only adds masked positions the
    encoder still has to process.

    Args:
        text: the string to embed.
        max_length: truncation limit in tokens (default 512, as before).

    Returns:
        numpy array on CPU with one row per input text — shape
        (1, hidden_size) for a single string.
    """
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        output = model.bert(**encoded_input)
    # Keep only position 0 of the sequence axis for every batch row.
    return output.last_hidden_state[:, 0, :].cpu().numpy()

# Embed every product's combined text (with a progress bar) and stack the
# per-row vectors into a single matrix, one row per product.
print("\n生成商品向量...")
tqdm.pandas()  # enables `progress_apply` on pandas objects
data['向量'] = data['combined_features'].progress_apply(get_product_vector)
product_vectors = np.vstack(data['向量'].tolist())


生成商品向量...


  0%|          | 0/1872 [00:00<?, ?it/s]

In [10]:
# Weighted similarity
def _split_csv_field(value):
    """Split a comma-separated string into a set of trimmed, non-empty items."""
    return {item.strip() for item in value.split(',') if item.strip()}


def calculate_weighted_similarity(idx1, idx2, similarity_matrix):
    """Combine embedding similarity, attribute match and tag overlap into one score.

    score = 0.7 * base_similarity * attribute_factor + 0.3 * tag_jaccard,
    capped at 1.0.

    Args:
        idx1, idx2: positional row indices into the module-level `data` frame.
        similarity_matrix: 2-D array indexed as [idx1, idx2] giving the base
            (embedding) similarity.

    Returns:
        The combined similarity, never greater than 1.0.
    """
    base_similarity = similarity_matrix[idx1, idx2]
    row1 = data.iloc[idx1]
    row2 = data.iloc[idx2]

    # Attribute boost: only when both products carry the same attribute string.
    # CATEGORY_WEIGHTS is keyed by individual attribute names, so a
    # comma-separated attribute string is split before the lookup; the
    # largest weight among its parts is used.
    attribute_factor = 1.0
    if row1['屬性'] == row2['屬性']:
        weights = [
            Config.CATEGORY_WEIGHTS.get(name, 1.0)
            for name in _split_csv_field(row1['屬性'])
        ]
        if weights:
            attribute_factor = max(weights)

    # Tag overlap (Jaccard). Empty fields yield empty sets, so two products
    # without any tags get 0.0 instead of being treated as a perfect match
    # on the empty-string "tag".
    tags1 = _split_csv_field(row1['標籤'])
    tags2 = _split_csv_field(row2['標籤'])
    tag_similarity = 0.0
    if tags1 and tags2:
        tag_similarity = len(tags1 & tags2) / len(tags1 | tags2)

    # Fixed 70/30 blend of (boosted) embedding similarity and tag overlap.
    final_similarity = 0.7 * base_similarity * attribute_factor + 0.3 * tag_similarity

    # The attribute boost can push the blend above 1.0; cap it.
    return min(final_similarity, 1.0)

# Pairwise cosine similarity between all rows of `product_vectors`; the
# result is indexed as [i, j] by the recommendation functions.
print("計算相似度矩陣...")
base_similarity_matrix = cosine_similarity(product_vectors)

def get_recommendations(product_index, top_n=5):
    """Return the `top_n` products most similar to `product_index`.

    Args:
        product_index: positional row index of the query product in `data`.
        top_n: maximum number of recommendations; values <= 0 yield an
            empty list.

    Returns:
        List of (index, score) pairs sorted by descending score. The query
        product itself is never included, so at most len(data) - 1 pairs
        are returned.
    """
    # Without this guard, `[-0:]` would select every product for top_n == 0.
    if top_n <= 0:
        return []

    similarities = np.array([
        calculate_weighted_similarity(product_index, i, base_similarity_matrix)
        for i in range(len(data))
    ], dtype=float)

    # Push the query product to the very bottom of the ranking.
    similarities[product_index] = -np.inf
    ranked = similarities.argsort()[::-1]
    # Drop the query product explicitly so it cannot appear even when
    # top_n >= len(data).
    ranked = ranked[ranked != product_index][:top_n]
    return [(idx, similarities[idx]) for idx in ranked]

def print_recommendations(product_index):
    """Print the query product and then each recommended product with its score.

    Args:
        product_index: positional row index of the query product in `data`.
    """
    def show_fields(row_idx):
        # Name, attribute and tag lines are the same for query and results.
        row = data.iloc[row_idx]
        print(f"名稱: {row['商品名稱']}")
        print(f"屬性: {row['屬性']}")
        print(f"標籤: {row['標籤']}")

    print(f"\n商品資訊:")
    show_fields(product_index)
    print("\n推薦商品:")

    for idx, score in get_recommendations(product_index):
        print(f"\n推薦商品 {idx}:")
        show_fields(idx)
        print(f"相似度: {score:.4f}")

計算相似度矩陣...


In [12]:
# Smoke-test the recommender: print the recommendations for the product at
# positional row 10 of `data`.
test_product_index = 10
print_recommendations(test_product_index)


商品資訊:
名稱: INSIS｜Nordic Romantic Casual Pleated Large Pocket Skirt＃北歐浪漫休閒空氣垂感抽褶拼接大口袋傘裙
屬性: 傘裙
標籤: 拼接,大口袋

推薦商品:

推薦商品 14:
名稱: INSIS｜Nordic Romantic Casual Pleated Large Pocket Skirt＃北歐浪漫休閒空氣垂感抽褶拼接大口袋傘裙
屬性: 傘裙
標籤: 大口袋,拼接
相似度: 1.0000

推薦商品 713:
名稱: No.42｜Sand Row Hollow Jacquard Mosaic Loose Skirt＃沙行鏤空緹花拼接傘襬裙
屬性: 傘裙
標籤: 鏤空,緹花,傘襬,拼接,中長,緹花布
相似度: 0.8058

推薦商品 641:
名稱: INSIS｜2Ways Asymmetric Pleated Half Skirt＃前後兩穿壓褶斜紋垂感傘裙
屬性: 傘裙
標籤: 斜紋,壓褶,前後兩穿,抽褶
相似度: 0.7669

推薦商品 667:
名稱: INSIS｜Ruffled Detail Pleated Half Skirt＃水果酵洗壓褶木耳邊傘裙
屬性: 傘裙
標籤: 壓褶,長絨棉,抽褶
相似度: 0.7662

推薦商品 853:
名稱: Lace Patch Double Layered Tulle Skirt蕾絲拼接黑白網紗裙
屬性: 紗裙
標籤: 拼接,裙,網紗,蕾絲,紗裙
相似度: 0.3997
