In [2]:
# DONE
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from lightgbm import LGBMClassifier


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195808
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 768
[LightGBM] [Info] Start training from score -2.162823
[LightGBM] [Info] Start training from score -2.152012
[LightGBM] [Info] Start training from score -2.218704
[LightGBM] [Info] Start training from score -2.109901
[LightGBM] [Info] Start training from score -2.380547
[LightGBM] [Info] Start training from score -2.099644
[LightGBM] [Info] Start training from score -2.195975
[LightGBM] [Info] Start training from score -2.327903
[LightGBM] [Info] Start training from score -2.162823
LightGBM Classifier:
              precision    recall  f1-score   support

    অন্যান্য       0.72      0.72      0.72        18
    অর্থনীতি       0.50      0.38      0.43        29
         আইন       0.50      0.52      0

In [None]:

# 加载数据
file_path = '../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_Example.csv'

data = pd.read_csv(file_path,low_memory=False,lineterminator="\n")

# 加载BERT tokenizer和模型
model_name = '../bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


In [None]:
nan_check = data['body'].isna().sum()
nan_check_c = data['category1'].isna().sum()
print(nan_check)
print(nan_check_c)

data = data.dropna(subset=['category1','body'])
nan_check = data['body'].isna().sum()
nan_check_c = data['category1'].isna().sum()
print(nan_check)
print(nan_check_c)

In [None]:

# 将模型移动到GPU(如果可用)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 使用BERT模型对新闻正文进行向量化
def vectorize_text(texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    embeddings = np.concatenate(embeddings, axis=0)
    return embeddings

# 对新闻正文进行向量化
X = vectorize_text(data['body'].tolist())

# 将孟加拉语类别转换为数字标签
label_map = {label: i for i, label in enumerate(data['category1'].unique())}
y = data['category1'].map(label_map).tolist()

# 将数据划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 创建LightGBM分类器
lgbm_classifier = LGBMClassifier(n_estimators=100, random_state=42)

# 训练LightGBM分类器
lgbm_classifier.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = lgbm_classifier.predict(X_test)

# 将数字标签转换回孟加拉语类别
label_map_inv = {i: label for label, i in label_map.items()}
y_test = [label_map_inv[i] for i in y_test]
y_pred = [label_map_inv[i] for i in y_pred]

# 评估模型性能
print("LightGBM Classifier:")
print(classification_report(y_test, y_pred))