In [20]:
# DONE
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertModel
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np


In [21]:

# Load the data.
# Alternative input kept for reference: the full undersampled dataset (the "_Example" variant below is what is read).
# data = pd.read_csv('../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv')
file_path = '../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_Example.csv'

# low_memory=False parses the file in one pass so column dtypes are inferred consistently;
# lineterminator="\n" makes the parser split rows on "\n" only.
data = pd.read_csv(file_path,low_memory=False,lineterminator="\n")



In [None]:
# Count missing values in the text and label columns before cleaning.
nan_check, nan_check_c = (data[column].isna().sum() for column in ('body', 'category1'))
print(nan_check)
print(nan_check_c)

# Discard rows that lack either a label or a body, then recount to confirm none remain.
data = data.dropna(subset=['category1','body'])
nan_check, nan_check_c = (data[column].isna().sum() for column in ('body', 'category1'))
print(nan_check)
print(nan_check_c)

In [22]:
# # Load the BERT tokenizer
# model_path = '../bert-base-multilingual-cased'
# tokenizer = BertTokenizer.from_pretrained(model_path)

# # Load the fine-tuned BERT model
# model = BertModel.from_pretrained(model_path)
# model.load_state_dict(torch.load('../models_FIX2/bert-base-multilingual-cased_classification_undersampled_new_epoch_20.pth'))

# Path of the pretrained multilingual BERT checkpoint; both the tokenizer and the encoder are read from it.
model_path = '../bert-base-multilingual-cased'
# modelNew_load_path = '../models_FIX2/bert-base-multilingual-cased_classification_undersampled_new_epoch_20.pth'

tokenizer = BertTokenizer.from_pretrained(model_path)
# Plain BertModel is used here; the fine-tuned weights referenced in the commented-out lines are NOT loaded.
model = BertModel.from_pretrained(model_path)

# Disabled alternative: the sequence-classification variant with 9 labels.
# model = BertForSequenceClassification.from_pretrained(model_path, num_labels=9)

# Disabled: restoring the fine-tuned state dict into the model.
# model.load_state_dict(torch.load(modelNew_load_path))



In [23]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# # Vectorize the news body text using the BERT classification model (its logits serve as the features)
# def vectorize_text(texts):
#     inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
#     # inputs = {k: v.to(device) for k, v in inputs.items()}
#     inputs = {k: v for k, v in inputs.items()}

#     with torch.no_grad():
#         outputs = model(**inputs)
#     embeddings = outputs.logits.cpu().numpy()
#     return embeddings
# Vectorize the news body text using the BERT model
def vectorize_text(texts, batch_size=32):
    """Embed texts as the model's hidden state at the first token position.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. Texts are
    tokenized batch by batch (padded to the longest item in the batch,
    truncated to 512 tokens) and passed through the model with gradient
    tracking disabled. For every text the last-layer hidden state at position
    0 (the ``[CLS]`` token for BERT tokenizers) is kept as its vector.

    Args:
        texts: Strings to embed. Any iterable is accepted (list, pandas
            Series, generator); a single bare string is treated as one text.
        batch_size: Number of texts per forward pass. Must be positive.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text, in input order.
        For empty input the result has shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # A bare string would otherwise be sliced into meaningless substrings.
    if isinstance(texts, str):
        texts = [texts]
    # Materialize so that positional slicing works for Series and generators too.
    texts = list(texts)

    # np.concatenate rejects an empty list, so return an explicitly empty matrix.
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors='pt')
        # Inputs must live on the same device as the model.
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Keep only position 0 of every sequence and move it back to host memory.
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(embeddings, axis=0)
# Show the 'body' column (the news body texts that get vectorized) as this cell's output.
data['body']


0      ডিএমপিনিউজঃ ঢাকা মেট্রোপলিটন পুলিশের (ডিএমপি) ...
1      গোয়েন্দা পুলিশ (ডিবি) পরিচয়ে যুবককে তুলে নিয়ে ...
2      দুর্নীতি মামলায় দণ্ডিত হয়ে এক বছর ধরে কারাগারে...
3      জনসম্পৃক্তমূলক কর্মসূচি নিয়ে এবার মাঠে নামছে ব...
4      বিশিষ্ট অর্থনীতিবিদ ড. সেলিম জাহান কানাডা ও যু...
                             ...                        
995    রাজধানীতে একটি রুম থেকে কিশোরীসহ দুইজনের মরদেহ...
996    করোনা নামক বিশ্ব মহামারি এর হাত থেকে মানুষের জ...
997    পেটের অতিরিক্ত চর্বি বা ফ্যাটি লিভার স্বাস্থ্য...
998    বিএনপি মহাসচিব মির্জা ফখরুল ইসলাম আলমগীর বলেছে...
999    বাংলাদেশ প্রকৌশল বিশ্ববিদ্যালয় (বুয়েট)-এর দ্বি...
Name: body, Length: 1000, dtype: object

In [24]:
# Turn every news body into a BERT embedding vector.
X = vectorize_text(data['body'].tolist())

# Give each distinct (Bengali) category name an integer id, numbered in order of first appearance.
label_map = {category: index for index, category in enumerate(data['category1'].unique())}
y = data['category1'].map(label_map).tolist()


In [25]:
# Sanity check of the feature matrix: rows are the embedded texts, columns are the embedding dimensions.
X.shape

(1000, 768)

In [26]:

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Build a 100-tree random forest and fit it on the training portion.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict integer class ids for the held-out samples.
y_pred = rf_classifier.predict(X_test)

# Map the integer ids back to the Bengali category names so the report is readable.
label_map_inv = {index: category for category, index in label_map.items()}
y_test = [label_map_inv[code] for code in y_test]
y_pred = [label_map_inv[code] for code in y_pred]

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    অন্যান্য       0.86      0.67      0.75        18
    অর্থনীতি       0.46      0.45      0.46        29
         আইন       0.58      0.67      0.62        27
    খেলাধুলা       0.81      0.91      0.86        23
     বিজ্ঞান       0.79      0.62      0.70        24
      বিনোদন       0.84      0.76      0.80        21
     রাজনীতি       0.67      0.72      0.69        25
  লাইফস্টাইল       0.79      0.65      0.71        17
      শিক্ষা       0.41      0.56      0.47        16

    accuracy                           0.67       200
   macro avg       0.69      0.67      0.67       200
weighted avg       0.68      0.67      0.67       200

