In [13]:
# DONE
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


In [2]:

# Load the dataset.
file_path = '../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'

data = pd.read_csv(file_path,low_memory=False,lineterminator="\n")

# read_csv is told to split rows on "\n" only, so when the file has CRLF line
# endings the "\r" stays glued to the last header and to the last field of every
# row (the preview shows the column as "md5\r"). Strip that stray character so
# the last column can be addressed by its real name and holds clean values.
data.columns = data.columns.str.rstrip("\r")
_last_col = data.columns[-1]
if data[_last_col].dtype == object:
    # Only touch real strings; missing values and non-string cells pass through.
    data[_last_col] = data[_last_col].map(
        lambda value: value.rstrip("\r") if isinstance(value, str) else value
    )

# Load the BERT tokenizer and model.
# model_name is a relative filesystem path — presumably a local copy of the
# multilingual cased BERT checkpoint; confirm the directory exists before running.
model_name = '../bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


In [3]:
# Preview the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,id,website_id,request_url,response_url,category1,category2,title,abstract,body,pub_time,cole_time,images,language_id,md5\r
0,17333224,2281,https://www.jugantor.com/sports/640555/%E0%A6%...,https://www.jugantor.com/sports/640555/%E0%A6%...,খেলাধুলা,,বিশ্বকাপে কোচের যে সিদ্ধান্তে বিস্মিত হন ডি মা...,কাতার বিশ্বকাপের ফাইনালে ফ্রান্সকে নাকাল করে ছ...,কাতার বিশ্বকাপের ফাইনালে ফ্রান্সকে নাকাল করে ছ...,1/2/2023 00:00:00,22/9/2023 10:10:18,"[""https://www.jugantor.com/assets/news_photos/...",1779,15c4e2f5f9cf435a7f1b5256b4b358ab\r
1,17150916,2270,https://dmpnews.org/%e0%a6%9f%e0%a6%bf%e0%a6%a...,https://dmpnews.org/%e0%a6%9f%e0%a6%bf%e0%a6%a...,খেলাধুলা,,টিভিতে আজকের খেলা,ডিএমপি নিউজঃ খেলাধুলা মানুষের জীবনে বিনোদনের এ...,ডিএমপি নিউজঃ খেলাধুলা মানুষের জীবনে বিনোদনের এ...,20/9/2018 16:53:29,22/9/2023 01:28:16,"[""https://dmpnews.org/wp-content/uploads/2018/...",1779,4331bda08631db3cf58e36ed2d18b30c\r
2,17092926,2270,https://dmpnews.org/%e0%a6%aa%e0%a7%8d%e0%a6%b...,https://dmpnews.org/%e0%a6%aa%e0%a7%8d%e0%a6%b...,খেলাধুলা,,প্রত্যাশিত বড় জয় নিয়েই বিশ্বকাপের মূল পর্বে বা...,সাকিব আল হাসানের অনবদ্য অলরাউন্ডার পারফমেন্সে ...,সাকিব আল হাসানের অনবদ্য অলরাউন্ডার পারফমেন্সে ...,21/10/2021 20:39:58,21/9/2023 22:52:51,"[""https://dmpnews.org/wp-content/uploads/2021/...",1779,024170153e9e7a6c9d78a877cb431650\r
3,17152100,2270,https://dmpnews.org/%e0%a6%ac%e0%a6%be%e0%a6%b...,https://dmpnews.org/%e0%a6%ac%e0%a6%be%e0%a6%b...,খেলাধুলা,,বার্সেলোনায় ফিরতে আকুল নেইমার,নিজের পুরনো ক্লাব বার্সেলোনায় ফিরতে আকুল ব্রাজ...,নিজের পুরনো ক্লাব বার্সেলোনায় ফিরতে আকুল ব্রাজ...,13/1/2019 10:12:00,22/9/2023 01:31:27,"[""https://dmpnews.org/wp-content/uploads/2019/...",1779,e8e211b5ff7d2b5556d8675831c87bf0\r
4,18749934,2277,https://www.dhakatimes24.com/2021/02/06/201644,https://www.dhakatimes24.com/2021/02/06/201644,খেলাধুলা,,দিনের প্রথম উইকেট মুশফিক,অধিনায়ক মুমিনুল হক এবং উইকেটকিপার ব্যাটসম্যান ...,অধিনায়ক মুমিনুল হক এবং উইকেটকিপার ব্যাটসম্যান ...,6/2/2021 10:49:00,30/9/2023 13:37:31,"[""https://www.dhakatimes24.com/assets/news_pho...",1779,4e18e2c6d69d7dc1a2d744f882b4e74e\r


In [7]:
# Count missing article bodies and missing top-level categories before cleaning.
nan_check, nan_check_c = (data[col].isna().sum() for col in ('body', 'category1'))
print(nan_check, nan_check_c, sep="\n")

# Drop every row lacking either field, then recount to confirm none remain.
data = data.dropna(subset=['category1','body'])
nan_check, nan_check_c = (data[col].isna().sum() for col in ('body', 'category1'))
print(nan_check, nan_check_c, sep="\n")

16
0
0
0


In [8]:

# Move the model to the GPU when CUDA is available; otherwise keep it on the CPU.
# `device` is reused later so that input tensors are placed on the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Turn article texts into fixed-size vectors with the BERT model.
def vectorize_text(texts, batch_size=32):
    """Embed texts with the notebook-level BERT model, one vector per text.

    Each batch is tokenized (padded to the longest item, truncated to 512
    tokens), run through ``model`` without gradient tracking, and represented
    by the hidden state at token position 0 (the [CLS] position for BERT
    tokenizers).

    Relies on the notebook globals ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Iterable of strings. Any iterable is accepted (list, pandas
            Series, generator); it is materialised into a list first.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A NumPy array of shape ``(len(texts), hidden_size)``. For empty input
        an empty ``(0, hidden_size)`` array is returned instead of raising.
    """
    # Materialise once: list slicing is positional, whereas slicing a pandas
    # Series or calling len() on a generator would not behave as needed here.
    texts = list(texts)
    if not texts:
        # np.concatenate([]) raises ValueError, so answer the empty case directly.
        # float32 is assumed to match the model output dtype — TODO confirm if
        # the model is ever loaded in half precision.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors='pt')
        # Input tensors must live on the same device as the model.
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Keep only the first-token vector and move it to host memory right
        # away so per-batch results do not accumulate on the accelerator.
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(embeddings, axis=0)

# Embed every article body; this is the feature matrix used by all classifiers below.
X = vectorize_text(data['body'].tolist())

# Convert the Bengali category names into integer labels.
# Ids follow the order in which categories first appear in the data, so the
# mapping depends on row order of `data`.
label_map = {label: i for i, label in enumerate(data['category1'].unique())}
y = data['category1'].map(label_map).tolist()


In [9]:

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the XGBoost classifier (100 estimators) and fit it on the training split.
xgb_classifier = XGBClassifier(n_estimators=100, random_state=42)
xgb_classifier.fit(X_train, y_train)

# Predict integer class ids for the held-out split.
y_pred = xgb_classifier.predict(X_test)

# Invert label_map (category name -> id) so both the true and the predicted ids
# can be reported under their Bengali category names.
label_map_inv = dict(zip(label_map.values(), label_map.keys()))
y_test, y_pred = (
    [label_map_inv[class_id] for class_id in class_ids]
    for class_ids in (y_test, y_pred)
)

# Report per-class precision / recall / F1.
print("XGBoost Classifier:", classification_report(y_test, y_pred), sep="\n")

XGBoost Classifier:
              precision    recall  f1-score   support

    অন্যান্য       0.87      0.87      0.87      2449
    অর্থনীতি       0.73      0.76      0.75      2517
         আইন       0.84      0.86      0.85      2474
    খেলাধুলা       0.98      0.95      0.96      2481
     বিজ্ঞান       0.84      0.81      0.82      2390
      বিনোদন       0.91      0.91      0.91      2473
     রাজনীতি       0.82      0.84      0.83      2451
  লাইফস্টাইল       0.87      0.85      0.86      2387
      শিক্ষা       0.81      0.82      0.82      2476

    accuracy                           0.85     22098
   macro avg       0.85      0.85      0.85     22098
weighted avg       0.85      0.85      0.85     22098



In [None]:
# XGBoost: accuracy 0.85 (test_size = 0.2)

In [11]:
# LightGBM:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the LightGBM classifier (100 estimators) and fit it on the training split.
lgbm_classifier = LGBMClassifier(n_estimators=100, random_state=42)
lgbm_classifier.fit(X_train, y_train)

# Predict integer class ids for the held-out split.
y_pred = lgbm_classifier.predict(X_test)

# Invert label_map (category name -> id) so both the true and the predicted ids
# can be reported under their Bengali category names.
label_map_inv = dict(zip(label_map.values(), label_map.keys()))
y_test, y_pred = (
    [label_map_inv[class_id] for class_id in class_ids]
    for class_ids in (y_test, y_pred)
)

# Report per-class precision / recall / F1.
print("LightGBM Classifier:", classification_report(y_test, y_pred), sep="\n")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.209771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 88388, number of used features: 768
[LightGBM] [Info] Start training from score -2.200273
[LightGBM] [Info] Start training from score -2.196908
[LightGBM] [Info] Start training from score -2.198946
[LightGBM] [Info] Start training from score -2.203341
[LightGBM] [Info] Start training from score -2.198946
[LightGBM] [Info] Start training from score -2.199252
[LightGBM] [Info] Start training from score -2.196603
[LightGBM] [Info] Start training from score -2.190414
[LightGBM] [Info] Start training from score -2.190414
LightGBM Classifier:
              precision    recall  f1-score   support

    অন্যান্য       0.88      0.87      0.87      2449
    অর্থনীতি       0.74      0.75      0.74      2517
         আইন       0.84      0.84     

In [None]:
# LightGBM:0.85

In [12]:
# Logistic Regression:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the logistic-regression classifier and fit it on the training split.
# NOTE(review): the saved output of this cell shows the optimizer stopping at the
# iteration limit (max_iter=1000); scikit-learn's message suggests scaling the
# features or raising max_iter — worth confirming before trusting the reported score.
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)

# Predict integer class ids for the held-out split.
y_pred = lr_classifier.predict(X_test)

# Invert label_map (category name -> id) so both the true and the predicted ids
# can be reported under their Bengali category names.
label_map_inv = dict(zip(label_map.values(), label_map.keys()))
y_test, y_pred = (
    [label_map_inv[class_id] for class_id in class_ids]
    for class_ids in (y_test, y_pred)
)

# Report per-class precision / recall / F1.
print("Logistic Regression Classifier:", classification_report(y_test, y_pred), sep="\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Classifier:
              precision    recall  f1-score   support

    অন্যান্য       0.89      0.88      0.89      2449
    অর্থনীতি       0.79      0.79      0.79      2517
         আইন       0.86      0.88      0.87      2474
    খেলাধুলা       0.98      0.97      0.97      2481
     বিজ্ঞান       0.86      0.83      0.85      2390
      বিনোদন       0.92      0.92      0.92      2473
     রাজনীতি       0.86      0.87      0.86      2451
  লাইফস্টাইল       0.86      0.85      0.86      2387
      শিক্ষা       0.84      0.86      0.85      2476

    accuracy                           0.87     22098
   macro avg       0.87      0.87      0.87     22098
weighted avg       0.87      0.87      0.87     22098



In [None]:
# Logistic Regression: accuracy 0.87 (test_size = 0.2)

In [14]:
# SVM:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the support-vector classifier with library defaults and fit it on the training split.
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

# Predict integer class ids for the held-out split.
y_pred = svm_classifier.predict(X_test)

# Invert label_map (category name -> id) so both the true and the predicted ids
# can be reported under their Bengali category names.
label_map_inv = dict(zip(label_map.values(), label_map.keys()))
y_test, y_pred = (
    [label_map_inv[class_id] for class_id in class_ids]
    for class_ids in (y_test, y_pred)
)

# Report per-class precision / recall / F1.
print("Support Vector Machine Classifier:", classification_report(y_test, y_pred), sep="\n")

Support Vector Machine Classifier:
              precision    recall  f1-score   support

    অন্যান্য       0.87      0.85      0.86      2449
    অর্থনীতি       0.73      0.74      0.73      2517
         আইন       0.80      0.85      0.82      2474
    খেলাধুলা       0.98      0.95      0.97      2481
     বিজ্ঞান       0.84      0.78      0.81      2390
      বিনোদন       0.91      0.90      0.91      2473
     রাজনীতি       0.78      0.85      0.81      2451
  লাইফস্টাইল       0.86      0.82      0.84      2387
      শিক্ষা       0.81      0.81      0.81      2476

    accuracy                           0.84     22098
   macro avg       0.84      0.84      0.84     22098
weighted avg       0.84      0.84      0.84     22098



In [None]:
# SVM: accuracy 0.84 (test_size = 0.2)

In [17]:
# Random Forest:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the random-forest classifier (100 trees) and fit it on the training split.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict integer class ids for the held-out split.
y_pred = rf_classifier.predict(X_test)

# Invert label_map (category name -> id) so both the true and the predicted ids
# can be reported under their Bengali category names.
label_map_inv = dict(zip(label_map.values(), label_map.keys()))
y_test, y_pred = (
    [label_map_inv[class_id] for class_id in class_ids]
    for class_ids in (y_test, y_pred)
)

# Report per-class precision / recall / F1 (this cell prints no header line).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    অন্যান্য       0.87      0.81      0.84      2449
    অর্থনীতি       0.67      0.67      0.67      2517
         আইন       0.75      0.79      0.77      2474
    খেলাধুলা       0.97      0.93      0.95      2481
     বিজ্ঞান       0.83      0.74      0.78      2390
      বিনোদন       0.85      0.89      0.87      2473
     রাজনীতি       0.70      0.80      0.74      2451
  লাইফস্টাইল       0.86      0.81      0.83      2387
      শিক্ষা       0.71      0.72      0.72      2476

    accuracy                           0.80     22098
   macro avg       0.80      0.80      0.80     22098
weighted avg       0.80      0.80      0.80     22098



In [None]:
# Random Forest: accuracy 0.80 (test_size = 0.2)