In [9]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import ClusterCentroids
import pandas as pd
import numpy as np
from collections import Counter


# 加载数据集
df = pd.read_csv('./deduplicated_mangoNews_Nums3000p_CategoryMerge_new_10000.csv')  # 假设你有一个CSV文件包含数据
# 假设你的数据集有 'body' 列和 'category1' 列

# 初始化 BERT tokenizer 和模型
Bert_path = './uncased_L-12_H-768_A-12'
tokenizer = BertTokenizer.from_pretrained(Bert_path)
model = BertModel.from_pretrained(Bert_path)

# 对文本进行编码
max_length = 384  # 设定一个合适的最大长度，可以根据你的数据进行调整
encoded_texts = []
for text in df['body']:
    inputs = tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    encoded_text = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    encoded_texts.append(encoded_text)


X = np.vstack(encoded_texts)
print(X)
print(X.shape)

# 对类别标签进行编码
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category1'])
print(y)
print(y.shape)
print(Counter(y))

# 数据平衡处理
target_count = 500
class_counts = np.bincount(y)
print(class_counts)

adasyn = ADASYN(sampling_strategy={cls: target_count for cls, count in enumerate(class_counts) if count < target_count})
cluster_centroids = ClusterCentroids(sampling_strategy={cls: target_count for cls, count in enumerate(class_counts) if count > target_count})

X_resampled, y_resampled = adasyn.fit_resample(X, y)
print(X_resampled.shape)
print(y_resampled.shape)
Counter(y_resampled)
X_resampled, y_resampled = cluster_centroids.fit_resample(X_resampled, y_resampled)
print(X_resampled.shape)
print(y_resampled.shape)
Counter(y_resampled)

# 现在可以继续使用 X_resampled 和 y_resampled 进行模型训练


[[ 0.31978938  0.18256517  0.27844584 ... -0.5047224   0.52765423
  -0.96845144]
 [ 0.26611075  0.15237056  0.28633723 ... -0.6395822   0.56409585
  -1.0826756 ]
 [ 0.32402256  0.2667769   0.37864402 ... -0.6582319   0.50884926
  -1.014228  ]
 ...
 [ 0.27147102  0.04560753  0.2685072  ... -0.43168148  0.6387473
  -1.0556158 ]
 [ 0.36678466  0.17260373  0.38308302 ... -0.58664805  0.5102701
  -1.0430468 ]
 [ 0.2669459   0.12637357  0.39293042 ... -0.508324    0.53113097
  -1.1132029 ]]
(10000, 768)
[0 0 0 ... 0 4 4]
(10000,)
Counter({0: 7589, 2: 923, 1: 709, 6: 217, 5: 199, 3: 194, 4: 169})
[7589  709  923  194  169  199  217]
(11254, 768)
(11254,)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


(3533, 768)
(3533,)


Counter({0: 500, 1: 500, 2: 500, 3: 477, 4: 488, 5: 529, 6: 539})

In [5]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN
X, y = make_classification(n_classes=2, class_sep=2,
                           weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1, n_samples=1000,
                           random_state=10)
print(X.shape)
print(y.shape)
print('Original dataset shape %s' % Counter(y))
ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X, y)
print(X_res.shape)
print(y_res.shape)
print('Resampled dataset shape %s' % Counter(y_res))


(1000, 20)
(1000,)
Original dataset shape Counter({1: 900, 0: 100})
(1804, 20)
(1804,)
Resampled dataset shape Counter({0: 904, 1: 900})


In [1]:
# !pip install augly
import pandas as pd
from sklearn.utils import resample
# from augly.text import augmentation
from augly.text import augmenters as augmentation
import random

# 加载数据集
df = pd.read_csv('./deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv')  # 假设你有一个CSV文件包含数据
# 假设你的数据集有 'body' 列和 'category1' 列
print("!")
# 提取 body 列和 category1 列
df = df[['body', 'category1']]

# 定义类别对应的语料数目
class_counts = {
    'অন্যান্য': 361535,
    'জাতীয়': 343724,
    'আন্তর্জাতিক': 119419,
    'খেলাধুলা': 111943,
    'রাজনীতি': 55988,
    'বিনোদন': 53591,
    'অর্থনীতি': 46574,
    'আইন': 34876,
    'আর্কাইভ': 28279,
    'শিক্ষা': 22374,
    'বিজ্ঞান': 12335,
    'লাইফস্টাইল': 12095
}

# 数据平衡处理
target_count = 30000
for category, count in class_counts.items():
    if count > target_count:
        # 随机欠采样，使语料数量约等于30000
        df_category = df[df['category1'] == category]
        df_downsampled = resample(df_category, replace=False, n_samples=target_count, random_state=42)
        df = pd.concat([df[df['category1'] != category], df_downsampled])
        print(df)
    elif count < target_count:
        # 使用AugLy进行数据增强，使语料数量约等于30000
        df_category = df[df['category1'] == category]
        num_augmentations = target_count - count
        sampled_indices = random.sample(range(len(df_category)), num_augmentations)
        for idx in sampled_indices:
            original_text = df_category.iloc[idx]['body']
            augmented_text = augmentation.replace_similar_unicode(original_text)
            df = df.append({'body': augmented_text, 'category1': category}, ignore_index=True)
        print(df)

# 确保各个类别对应的语料数量大致为30000
class_counts_final = df['category1'].value_counts()
print(class_counts_final)

# 现在可以继续使用处理后的数据进行后续任务


  from .autonotebook import tqdm as notebook_tqdm
  df = pd.read_csv('./deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv')  # 假设你有一个CSV文件包含数据


!
                                                       body   category1
1537      ডরিন হোটেলে স্ট্রিট ফুড মার্কেটভোজনরসিকদের জন্...  লাইফস্টাইল
1538      বাজারে এল অ্যাপলের নতুন ম্যাকবুক প্রোর দুটি মড...  লাইফস্টাইল
1539      উপহার সব সময় যে দামিই হতে হবে, এমন কোনো দিব্যি...  লাইফস্টাইল
1540      ফ্যাশনের আলাপে ঘুরেফিরেই আসে ডাচেস অব কেমব্রিজ...  লাইফস্টাইল
1541      ২৮ মে শেষ হয়ে যাবে ৭৫তম কান চলচ্চিত্র উৎসব। চল...  লাইফস্টাইল
...                                                     ...         ...
21533495                                                NaN    অন্যান্য
9132841                                                 NaN    অন্যান্য
6240778                                                 NaN    অন্যান্য
21014746                                                NaN    অন্যান্য
14414911                                                NaN    অন্যান্য

[35276919 rows x 2 columns]
                                                       body   category1
1537      ডরিন হোটেলে স্ট্রিট ফুড

AttributeError: module 'augly.text.augmenters' has no attribute 'replace_similar_unicode'

In [3]:
# 暂时使用这个！！使用随机欠采样对数据集进行数据平衡，最终数据集为12095*12=145140条 1.2g，改进方向：欠采样（culster）+过采样（adasyn，augly增强）
# update:3.7
# update：3.8
import pandas as pd
from sklearn.utils import resample

# 读取CSV文件
# df = pd.read_csv('./deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv', low_memory=False,lineterminator="\n")
# df = pd.read_csv('./datasets_FIX/FIX_deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv', low_memory=False,lineterminator="\n")
# df = pd.read_csv('./datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv', low_memory=False,lineterminator="\n")
df = pd.read_csv('./datasets_FIX3/FIX3_deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv', low_memory=False,lineterminator="\n")

# datasets_FIX/FIX_deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv

# 统计各类别对应的语料数
category_counts = df['category1'].value_counts().to_dict()

# 找出最小的语料数
min_corpus_count = min(category_counts.values())

# 对每个类别进行欠采样
undersampled_dfs = []
for category, count in category_counts.items():
    # 获取当前类别的所有索引
    indices = df[df['category1'] == category].index
    # 如果当前类别的语料数大于最小语料数，则进行欠采样
    if count > min_corpus_count:
        undersampled_indices = resample(indices, replace=False, n_samples=min_corpus_count, random_state=42)
        undersampled_df = df.loc[undersampled_indices]
        undersampled_dfs.append(undersampled_df)
    else:
        undersampled_dfs.append(df.loc[indices])

# 将欠采样后的DataFrame合并为一个新的DataFrame
undersampled_df = pd.concat(undersampled_dfs)

# 输出欠采样后的类别计数
print("\n欠采样后的类别计数:")
print(undersampled_df['category1'].value_counts())

undersampled_df.head()

# 保存处理后的结果到新的CSV文件
# undersampled_df.to_csv('./datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv', index=False)
undersampled_df.to_csv('./datasets_FIX3/FIX3_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv', index=False)




欠采样后的类别计数:
খেলাধুলা      6431
রাজনীতি       6431
বিনোদন        6431
অর্থনীতি      6431
আইন           6431
শিক্ষা        6431
বিজ্ঞান       6431
লাইফস্টাইল    6431
অন্যান্য      6431
Name: category1, dtype: int64


In [4]:
# 对数据平衡前数据集进行随机抽取145140条用于bert训练后与数据平衡后数据效果进行对比
# update：3.8
import pandas as pd

# 读取CSV文件
# df = pd.read_csv('./deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv', low_memory=False,lineterminator="\n")
# df = pd.read_csv('./datasets_FIX/FIX_deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv', low_memory=False,lineterminator="\n")
# df = pd.read_csv('./datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv', low_memory=False,lineterminator="\n")
df = pd.read_csv('./datasets_FIX3/FIX3_deduplicated_mangoNews_Nums3000p_CategoryMerge_new.csv', low_memory=False,lineterminator="\n")

# 随机采样145140条数据
sampled_df = df.sample(n=57879, random_state=42)

# 保存采样结果到新的CSV文件
# sampled_df.to_csv('./datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_randsample110502.csv', index=False)
sampled_df.to_csv('./datasets_FIX3/FIX3_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_randsample57879.csv', index=False)

# 输出采样后的数据信息
print("采样后的数据信息:")
print(sampled_df.info())


采样后的数据信息:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 57879 entries, 89218 to 274415
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            57879 non-null  int64 
 1   website_id    57879 non-null  int64 
 2   request_url   57879 non-null  object
 3   response_url  57879 non-null  object
 4   category1     57879 non-null  object
 5   category2     933 non-null    object
 6   title         57879 non-null  object
 7   abstract      57860 non-null  object
 8   body          57871 non-null  object
 9   pub_time      57879 non-null  object
 10  cole_time     57879 non-null  object
 11  images        57879 non-null  object
 12  language_id   57879 non-null  int64 
          57879 non-null  object
dtypes: int64(3), object(11)
memory usage: 6.6+ MB
None


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# 划分训练集测试集用于ALbert+seq2seq_attention
# 读取CSV文件
# file_path = './datasets/mangoNews_Example.csv'  # 测试版数据集（规模小5m）
# file_path = './datasets/mangoNews.csv'          # 完整版数据集（13g） 需要分块读
# file_path = './datasets/mangoNews_Example_100000.csv'          # 100000行数据集（2.2g）
# file_path = './datasets/mangoNews_Example_10000.csv'          # 10000行数据集（190m)
# file_path = './datasets/deduplicated_mangoNews_Nums3000p_CategoryMerge.csv'          # 去重后保留类别对应语料数在3000+并合并同义类别的数据集（9.9g) 需要分块读
# file_path = './datasets/deduplicated_mangoNews_Nums3000p_CategoryMerge_100000_1.csv'          # 去重后保留类别对应语料数在3000+并合并同义类别的数据集（576m) 
# file_path = './datasets/deduplicated_mangoNews_Nums3000p_CategoryMerge_990000.csv'          # 去重后保留类别对应语料数在3000+并合并同义类别的数据集（8.1g) 
# file_path = './datasets/deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'          # 去重后保留类别对应语料数在3000+并合并同义类别(新）并随机欠采样数据平衡的数据集（12095*12条 1.2g) 
# file_path = './datasets/deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_Example.csv'          # 去重后保留类别对应语料数在3000+并合并同义类别(新）并随机欠采样数据平衡的数据集（12095*12条 1.2g) 

# file_path = './datasets_FIX/FIX_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'
# file_path = './datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'
file_path = './datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_Example.csv'


data = pd.read_csv(file_path,low_memory=False,lineterminator="\n")

# Select relevant columns
data = data[['body', 'category1']]

category_counts = data['category1'].value_counts()

# 设置显示选项，完整输出结果
pd.set_option('display.max_rows', None)
print("Category Counts:")
print(category_counts.index)
# 恢复默认显示选项
pd.reset_option('display.max_rows')

# 划分训练集和测试集
train_data, test_data = train_test_split(data, test_size=0.15, random_state=42) ## 2.20 test_size:0.2->0.3

# 保存训练集到 train.csv
train_data.to_csv('./datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_train_Example.csv', index=False)
# train_data.to_csv('./datasets/mangoNews_Example_train.csv', index=False)

# 保存测试集到 test.csv
test_data.to_csv('./datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_test_Example.csv', index=False)
# test_data.to_csv('./datasets/mangoNews_Example_test.csv', index=False)


# 统计 'category1' 列中每种类别的个数
category_counts1 = train_data['category1'].value_counts()

# 设置显示选项，完整输出结果
pd.set_option('display.max_rows', None)
print("Category Counts1:")
print(category_counts1)
# 恢复默认显示选项
pd.reset_option('display.max_rows')

category_counts2 = test_data['category1'].value_counts()
print("===================================================")
# 设置显示选项，完整输出结果
pd.set_option('display.max_rows', None)
print("Category Counts2:")
print(category_counts2)
# 恢复默认显示选项
pd.reset_option('display.max_rows')


Category Counts:
Index(['খেলাধুলা', 'আইন', 'রাজনীতি', 'অর্থনীতি', 'লাইফস্টাইল', 'শিক্ষা',
       'অন্যান্য', 'বিজ্ঞান', 'বিনোদন'],
      dtype='object')
Category Counts1:
খেলাধুলা      105
লাইফস্টাইল    102
অর্থনীতি      101
আইন            97
রাজনীতি        97
শিক্ষা         94
অন্যান্য       91
বিজ্ঞান        83
বিনোদন         80
Name: category1, dtype: int64
Category Counts2:
আইন           22
রাজনীতি       21
বিজ্ঞান       19
অর্থনীতি      17
খেলাধুলা      16
বিনোদন        15
শিক্ষা        14
অন্যান্য      14
লাইফস্টাইল    12
Name: category1, dtype: int64


In [1]:
import pandas as pd

# 读取原始数据集
# file_path = './datasets/deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'
file_path = './datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'

data = pd.read_csv(file_path,low_memory=False,lineterminator="\n")

# 随机抽样1000条数据
sampled_data = data.sample(n=1000, random_state=42)

# 将抽样后的数据保存到新的CSV文件中
sampled_data.to_csv('./datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_Example.csv', index=False)
