In [9]:
# NO
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

In [10]:
# 加载数据
file_path = '../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_Example.csv'

data = pd.read_csv(file_path,low_memory=False,lineterminator="\n")



In [11]:
# 加载BERT tokenizer和模型
model_name = '../bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# 将模型移动到GPU(如果可用)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print("DONE")

DONE


In [12]:
# 使用BERT模型对新闻正文进行向量化
def vectorize_text(texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    embeddings = np.concatenate(embeddings, axis=0)
    return embeddings

# 对新闻正文进行向量化
X = vectorize_text(data['body'].tolist())

# 将孟加拉语类别转换为数字标签
label_map = {label: i for i, label in enumerate(data['category1'].unique())}
y = data['category1'].map(label_map).tolist()



In [13]:
X

array([[ 0.00360673,  0.27257353, -0.1332579 , ...,  0.12291296,
         0.0373899 , -0.08065002],
       [-0.07881407,  0.17497417, -0.2032573 , ...,  0.03470398,
         0.09994052, -0.07378072],
       [-0.04129403,  0.17943518, -0.2638795 , ...,  0.06543663,
         0.08946389, -0.17144516],
       ...,
       [-0.01700522,  0.09620079,  0.02577472, ...,  0.08650915,
         0.11759964, -0.0469637 ],
       [ 0.01752884,  0.04135757, -0.05827464, ...,  0.02767651,
         0.03450951, -0.09344518],
       [-0.04199621,  0.05297507, -0.10083117, ..., -0.06108994,
         0.08365133, -0.07983957]], dtype=float32)

In [14]:
# 将数据划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 创建朴素贝叶斯分类器
nb_classifier = MultinomialNB()

# 训练朴素贝叶斯分类器
nb_classifier.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = nb_classifier.predict(X_test)

# 将数字标签转换回孟加拉语类别
label_map_inv = {i: label for label, i in label_map.items()}
y_test = [label_map_inv[i] for i in y_test]
y_pred = [label_map_inv[i] for i in y_pred]

# 评估模型性能
print("Naive Bayes Classifier:")
print(classification_report(y_test, y_pred))

# ×朴素贝叶斯方法不支持负数，bert向量中存在负数

ValueError: Negative values in data passed to MultinomialNB (input X)