# TF-IDF

In [None]:
#安装相关依赖库 如果是windows系统，cmd命令框中输入pip安装，参考上述环境配置
#!pip install sklearn
#!pip install pandas
#---------------------------------------------------
#导入库
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# TF-IDF + linear-classifier baseline: build a `text` feature from title and
# abstract, report cross-validation scores, then train on the full training
# set and write a submission file for test set A.
train_df = pd.read_json('./data/train.json', encoding='utf8', lines=True)
testA_df = pd.read_json('./data/testA.json', encoding='utf8', lines=True)


# ---------------- Feature engineering ----------------
def _add_text_feature(df):
    """Strip title/abstract in place and add a lower-cased ``text`` column.

    Missing abstracts are replaced by an empty string. The ``text`` column is
    ``title + '[SEP]' + abstract`` with both parts lower-cased.

    Returns the same DataFrame (modified in place) for convenience.
    """
    df['title'] = df['title'].apply(lambda x: x.strip())
    df['abstract'] = df['abstract'].fillna('').apply(lambda x: x.strip())
    df['text'] = df['title'].str.lower() + '[SEP]' + df['abstract'].str.lower()
    return df


# The same preprocessing is applied to both splits.
train_df = _add_text_feature(train_df)
testA_df = _add_text_feature(testA_df)

# TF-IDF text features, vocabulary capped at 2500 terms.
tfidf = TfidfVectorizer(max_features=2500)

# ---------------- Model training ----------------

train_tfidf = tfidf.fit_transform(train_df['text'])
clf = SGDClassifier()
# Keep and print the CV scores: a bare expression in the middle of a cell is
# never displayed, so the result was previously computed and thrown away.
cv_scores = cross_val_score(clf, train_tfidf, train_df['label_id'], cv=5)
print('5-fold CV scores:', cv_scores, 'mean:', cv_scores.mean())

# Refit a fresh classifier on the whole training set and predict test set A.
test_tfidf = tfidf.transform(testA_df['text'])
clf = SGDClassifier()
clf.fit(train_tfidf, train_df['label_id'])
testA_df['label'] = clf.predict(test_tfidf)

# ---------------- Output ----------------
testA_df[['id', 'label']].to_csv('submit/submit (tf-idf).csv', index=False)

In [None]:
# coding=utf-8
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


# Count word frequencies over the training set and compute each word's TF value
def words_tf(train_path='data/train.tsv', top_k=10000):
    """Return the most frequent whitespace-separated words with their TF values.

    Args:
        train_path: Tab-separated file with a ``text`` column. Defaults to the
            path the original script used.
        top_k: Maximum number of (word, tf) pairs to return.

    Returns:
        A list of ``(word, tf)`` tuples sorted by descending tf, where tf is the
        word's occurrence count divided by the total number of word occurrences
        in the file. Ties keep first-seen order (the sort is stable).
    """
    train_data = pd.read_csv(train_path, sep='\t').astype(str)

    # Corpus-wide occurrence count of every token.
    doc_frequency = defaultdict(int)
    for text in train_data['text']:
        for word in str(text).split():
            doc_frequency[word] += 1

    # The total is loop-invariant: compute it once instead of once per word.
    total = sum(doc_frequency.values())
    word_tf = {word: freq / total for word, freq in doc_frequency.items()}

    ranked = sorted(word_tf.items(), key=lambda x: x[1], reverse=True)
    return ranked[:top_k]


# Turn tokenised documents into vectors weighted by the keyword TF values
def word2vec(keywords_tf, doc_sentence):
    """Vectorise tokenised documents against a keyword/TF table.

    Args:
        keywords_tf: Sequence of ``(word, tf)`` pairs; the position of a word
            in this sequence is its slot in the output vector.
        doc_sentence: Iterable of token lists, one per document.

    Returns:
        One list per document, of length ``len(keywords_tf)``. A slot holds the
        keyword's tf value if the keyword occurs in the document, else ``0``.
    """
    tf_by_word = dict(keywords_tf)
    # word -> slot lookup built once; replaces repeated linear list scans
    # (`in` plus two `.index()` calls per token).
    slot_by_word = {word: slot for slot, word in enumerate(tf_by_word)}
    width = len(keywords_tf)

    docvec_list = []
    for sentence in doc_sentence:
        docvec = [0] * width
        for word in sentence:
            slot = slot_by_word.get(word)
            if slot is not None:
                docvec[slot] = tf_by_word[word]
        docvec_list.append(docvec)
    return docvec_list


# Convert the training and test texts into document vectors
def doc_vec(x_train, x_test):
    """Vectorise the training and test texts with the keyword table from words_tf().

    Each entry of ``x_train`` / ``x_test`` is converted to ``str`` and split on
    whitespace before being handed to ``word2vec``.

    Returns:
        ``(train_docvec_list, test_docvec_list)``.
    """
    keywords_tf = words_tf()  # keyword table built from word frequencies

    def _tokenise(docs):
        # Positional access, so array-like inputs behave as in a plain index loop.
        return [str(docs[pos]).split() for pos in range(len(docs))]

    # Training split
    train_docvec_list = word2vec(keywords_tf, _tokenise(x_train))

    # Test split
    test_docvec_list = word2vec(keywords_tf, _tokenise(x_test))

    return train_docvec_list, test_docvec_list


if __name__ == '__main__':
    # Load the labelled corpus; astype(str) turns every column (labels included) into strings.
    train_data = pd.read_csv('data/train.tsv', sep='\t').astype(str)
    # Hold out 5% of the rows for validation.
    x_train, x_test, y_train, y_test = train_test_split(train_data['text'], train_data['label_id'], test_size=0.05)

    x_train = np.array(x_train)
    x_test = np.array(x_test)
    # Labels are strings after astype(str); convert them back to integers.
    y_train = np.array(y_train.apply(int))
    y_test = np.array(y_test.apply(int))

    x_train, x_test = doc_vec(x_train, x_test)  # vectorise both splits
    x_train, y_train = shuffle(x_train, y_train, random_state=0)  # shuffle the training rows

    # Fit ExtraTreesClassifier to obtain per-feature importances, which
    # SelectFromModel then thresholds to pick features.
    clf_model = ExtraTreesClassifier(n_estimators=250, random_state=0)
    # clf_model=RandomForestClassifier(n_estimators=250,random_state=0)
    clf_model.fit(x_train, y_train)
    # Per-word feature weights; a higher value means a more important feature.
    importances = clf_model.feature_importances_

    # Disabled snippet (was kept inside a string literal). It references
    # `words_list`, which is not defined in this script.
    # # Store each word and its weight in a dict and write it to a file
    # feature_words_dic = {}
    # for i in range(len(words_list)):
    #     feature_words_dic[words_list[i][0]] = importances[i]
    # # Sort the dict by weight, descending
    # words_info_dic_sort = sorted(feature_words_dic.items(), key=lambda x: x[1], reverse=True)
    # # Write the weights of the top 2000 words to a file
    # key_words_importance=dict(words_info_dic_sort[:2000])
    # with open('data/key_words_importance','w') as f:
    #     f.write(str(key_words_importance))

    # Keep the features whose importance is at least 1.5x the mean importance.
    model = SelectFromModel(clf_model, threshold='1.5*mean', prefit=True)
    x_train_new = model.transform(x_train)  # selected features, training split
    x_test_new = model.transform(x_test)  # selected features, validation split

    print(x_train_new.shape)
    print(x_test_new.shape)

    # Wrap the data in LightGBM Dataset objects.
    lgb_train = lgb.Dataset(x_train_new, y_train)
    lgb_val = lgb.Dataset(x_test_new, y_test, reference=lgb_train)

    # LightGBM model parameters.
    params = {'max_depth': 6, 'min_data_in_leaf': 20, 'num_leaves': 35, 'learning_rate': 0.1, 'lambda_l1': 0.1,
              'lambda_l2': 0.2, 'objective': 'multiclass', 'num_class': 36, 'verbose': -1}
    # Number of boosting rounds.
    num_boost_round = 2000
    # Train the LightGBM model.
    # NOTE(review): `verbose_eval` is presumably rejected by LightGBM >= 4.0
    # (replaced by the log_evaluation callback) — confirm against the installed version.
    gbm = lgb.train(params, lgb_train, num_boost_round, verbose_eval=100, valid_sets=lgb_val)

    # Save the model to a file
    # gbm.save_model('data/lightGBM_model')

    # Predict on the validation split.
    result = gbm.predict(x_test_new, num_iteration=gbm.best_iteration)
    y_predict = np.argmax(result, axis=1)  # label with the highest predicted probability

    # Derive the label axis from the labels that actually occur. The previous
    # hard-coded range(25) did not match the 36-class setup and made the
    # DataFrame construction fail unless exactly 25 labels were present.
    label_all = np.unique(np.concatenate((y_test, y_predict)))
    confusion_mat = metrics.confusion_matrix(y_test, y_predict, labels=label_all)
    df = pd.DataFrame(confusion_mat, index=label_all, columns=label_all)

    print('准确率：', metrics.accuracy_score(y_test, y_predict))
    print('confusion_matrix:', df)
    print('分类报告:', metrics.classification_report(y_test, y_predict))

# 5 折投票融合

In [None]:
import numpy as np
import pandas as pd

# Majority vote over the five fold predictions of run 20; the winning label per
# row is written as a submission for test set A.
output_file = 'submit/submit (5fold-base-attention-fgm-labeled-p_tuning16-bert).csv'

# One headerless prediction file per fold (one label per line).
df0 = pd.read_csv('results/20/test_results_0.txt', header=None, names=['label'], encoding='utf8')
df1 = pd.read_csv('results/20/test_results_1.txt', header=None, names=['label'], encoding='utf8')
df2 = pd.read_csv('results/20/test_results_2.txt', header=None, names=['label'], encoding='utf8')
df3 = pd.read_csv('results/20/test_results_3.txt', header=None, names=['label'], encoding='utf8')
df4 = pd.read_csv('results/20/test_results_4.txt', header=None, names=['label'], encoding='utf8')
# Earlier fused submissions; loaded but not part of the vote below.
df5 = pd.read_csv('submit/submit (voting fusion 0.5885).csv', encoding='utf8')
df6 = pd.read_csv('submit/submit (voting fusion 0.5940).csv', encoding='utf8')
# Previously tried as extra voters (disabled): results/18 test_results_5..6,
# results/8 test_results_0..4, results/12 test_results_0..3, results/14 test_results_4.
testA_df = pd.read_json('data/testA.json', encoding='utf8', lines=True)

# count[row, label] = number of folds that predicted `label` for `row`.
count = np.zeros((df0.shape[0], 36), np.int64)
rows = np.arange(df0.shape[0])
for fold_df in (df0, df1, df2, df3, df4):
    fold_labels = fold_df['label'].to_numpy()[:len(rows)]
    count[rows, fold_labels] += 1

# Winning label per row (ties resolve to the smallest label id, as argmax does).
# Built as a fresh frame: the old `df_out['label'].iloc[i] = ...` chained
# assignment silently stops working under pandas Copy-on-Write and also
# overwrote df0, which shared its data with df_out.
df_out = pd.DataFrame({'label': count.argmax(axis=1)}, index=df0.index)

testA_df['label'] = df_out['label']
testA_df[['id', 'label']].to_csv(output_file, index=False)
df_out

In [None]:
import numpy as np
import pandas as pd

# Turn one headerless prediction file into a submission CSV for test set A.
output_file = 'submit/submit (5fold-base-attention-fgm-labeled-3).csv'

df = pd.read_csv(
    'results/test_results_3 (base + attention + fgm + labeled).txt',
    header=None,
    names=['label'],
    encoding='utf8',
)
testA_df = pd.read_json('data/testA.json', encoding='utf8', lines=True)

# Labels are attached by index alignment, then only id + label are written.
testA_df['label'] = df['label']
submission = testA_df[['id', 'label']]
submission.to_csv(output_file, index=None)
df

# 结果相关性计算

In [37]:
import pandas as pd
import os


def _read_label_file(path: str) -> pd.DataFrame:
    """Load a prediction file that exposes a ``label`` column.

    Files ending in ``.txt`` are read as headerless, one label per line; any
    other file is read as a CSV with its own header row.
    """
    # Check the extension rather than searching for 'txt' anywhere in the
    # path, so a CSV under a directory whose name contains "txt" is still
    # read with its header.
    if path.endswith('.txt'):
        return pd.read_csv(path, header=None, names=['label'], encoding='utf8')
    return pd.read_csv(path, encoding='utf8')


def correlation_calculating(file1: str, file2: str) -> list:
    """Measure how often two prediction files agree, row by row.

    Args:
        file1: Path of the first prediction file (``.txt`` or CSV with header).
        file2: Path of the second prediction file.

    Returns:
        ``[result, correlation, same, total, different]`` where ``result`` is a
        DataFrame holding the disagreeing rows of both files side by side,
        ``correlation`` is ``same / total``, and ``total`` is the number of
        rows in ``file1``.
    """
    result1 = _read_label_file(file1)
    result2 = _read_label_file(file2)

    # Computed once and reused for both the counts and the row selection.
    differs = result1['label'] != result2['label']
    total = result1.shape[0]
    different = int(differs.sum())
    same = total - different

    result = pd.concat((result1[differs], result2[differs]), axis=1)
    correlation = same / total

    return [result, correlation, same, total, different]


# correlation_calculating("results/35/test_results_3.txt", "results/36/test_results_0.txt"), correlation_calculating(
#     "results/35/test_results_1.txt", "results/36/test_results_0.txt"), correlation_calculating(
#     "results/35/test_results_3.txt", "results/36/test_results_1.txt"), correlation_calculating(
#     "results/35/test_results_1.txt", "results/36/test_results_1.txt"), correlation_calculating(
#     "results/35/test_results_3.txt", "results/36/test_results_2.txt"), correlation_calculating(
#     "results/35/test_results_1.txt", "results/36/test_results_2.txt"), correlation_calculating(
#     "results/35/test_results_3.txt", "results/36/test_results_3.txt"), correlation_calculating(
#     "results/35/test_results_1.txt", "results/36/test_results_3.txt")

# correlation_calculating("results/43/epoch3/test_results_0.txt",
#                         "results/43/epoch3/test_results_2.txt")
# Compare a run-65 submission CSV with the raw run-66 predictions; the returned
# list (disagreeing rows, agreement ratio, counts) is shown as the cell output.
correlation_calculating("results/65/4/2e-5/ema/simplify/submit (6484_backtrans_sampling24_abstract(simplify)_4epochs_rdrop_bigru_2e-5_ema result_0 6495).csv", "results/66/5/3e-5/test_results_0.txt")

# correlation_calculating("submit/633/submit (voting fusion).csv",
#                         "submit/633/submit (6336oversampling_epoch4 result_4 6337).csv")

[                                     id  label  label
 17     75b887afe727475ec0db30ac5fde3690      5     21
 89     20bbe8f243a6b0e3b0f5e55f24e4d774      1     20
 249    4eff44f01234a4675df14565a1d04436      2      8
 349    ea25b5c6c23052f276e5826d0f0f4d95      3      0
 547    fd7e5da6c37db43c141a925d107177d1      2      8
 ...                                 ...    ...    ...
 20438  c83ab9db480503da0d83acd62650fdce     10      5
 20491  8110e88780ff31f04f8e8c99fdb49f72      2      8
 20554  b12534c71ea2c10bbd326c433e69388b     21     13
 20780  41c127efc9e3d54409b2d33d00e7998b     10      0
 20804  b53a7ab12ef020f2f57e1c7ca41d3b1d     10      2
 
 [204 rows x 3 columns],
 0.9902106626997457,
 20635,
 20839,
 204]

In [50]:
import numpy as np
# Post-processing: wherever the run-66 predictions say label 8 and the base
# submission says something else, overwrite the base label with 8.
df6495 = pd.read_csv("results/65/4/2e-5/ema/simplify/submit (6484_backtrans_sampling24_abstract(simplify)_4epochs_rdrop_bigru_2e-5_ema result_0 6495 post-precessing 6495).csv", sep=',', encoding='utf8')
df_test = pd.read_csv("results/66/5/3e-5/test_results_0.txt", sep=',', encoding='utf8', header=None, names=['label'])

list1 = df6495['label'].to_numpy().tolist()  # base submission labels
list2 = df_test['label'].to_numpy().tolist()  # reference predictions

results = []
count = 0  # number of labels overwritten

for i, base_label in enumerate(list1):
    if list2[i] != 8 or base_label == 8:
        continue
    list1[i] = 8
    count += 1

df6495['label'] = pd.DataFrame(list1, columns=['label'])
df6495.to_csv('results/65/4/2e-5/ema/simplify/submit (6484_backtrans_sampling24_abstract(simplify)_4epochs_rdrop_bigru_2e-5_ema result_0 6495 post-precessing 6495 copy).csv', encoding='utf8', index=False)
count

6

In [39]:
# Agreement between the run 65/6 raw predictions and a run 66/6 submission CSV.
correlation_calculating("results/65/6/test_results_0_0.txt",
                        "results/66/6/submit (6495_backtrans_sampling24_abstract(simplify_title_new)_4epochs_rdrop_bigru_3e-5_ema result_0 6482).csv")

[       label                                id  label
 87         1  7598ac3194abfeb70918e417c876ab54     30
 375        8  d6a5fd1c4901505d570c4c5ef1383cb1      7
 425       29  e243e1f60c2c8985b3c1bdcd72c29276      2
 547        2  fd7e5da6c37db43c141a925d107177d1      8
 706       26  3f9595c0cc1ed04fbc826072373db48c      4
 ...      ...                               ...    ...
 20574      6  229e824079ce9996da06ba89d2e6ca0f     11
 20608     10  248b8390dff2fac2a5e5673a4b4fcace     12
 20683      5  bdc412d72ca3fcc33e9afca11edfcfed     21
 20746      5  d1cfc4aef268ca5c5f07b68af1ba31e7     26
 20789     21  5456e609da3b6319a8e854d3af805440     10
 
 [234 rows x 3 columns],
 0.9887710542732376,
 20605,
 20839,
 234]

In [19]:
# Agreement between the run 66/5 (2e-5) raw predictions and a run 65/1 submission CSV.
correlation_calculating("results/66/5/2e-5/test_results_0.txt",
                        "results/65/1/submit (6484_sampling24(xlnet ratio)_abstract_4epochs_rdrop_bigru result_0 6455).csv")

[       label                                id  label
 3          2  06598dd8f3ab092acf2a55dce8be5621     16
 17        21  75b887afe727475ec0db30ac5fde3690      5
 58        11  df94e92d9f9826a204966ce63f45f716      2
 89        20  20bbe8f243a6b0e3b0f5e55f24e4d774      1
 121        2  4edfc37517d0875864abc42ca36a8ac6     10
 ...      ...                               ...    ...
 20771      4  b78d5075eb27d38d194b32bedf2ca118      0
 20774     33  b8585a055c5568d4725a71fe81799030      2
 20789     21  5456e609da3b6319a8e854d3af805440     10
 20801      9  74f2d642e2c365097756054b4e4d37aa      0
 20827      3  7eff0a22ffbf6cf516cafc88f34f2ebc      0
 
 [616 rows x 3 columns],
 0.970440040309036,
 20223,
 20839,
 616]

In [3]:
# Agreement between the run 66/5 raw predictions and a run 66/3 submission CSV.
correlation_calculating("results/66/5/test_results_0.txt",
                        "results/66/3/submit (6495_backtrans_sampling24_abstract(simplify3)_4epochs_rdrop_bigru_4e-5_ema result_0 6463).csv")

[       label                                id  label
 3          2  06598dd8f3ab092acf2a55dce8be5621      6
 17        21  75b887afe727475ec0db30ac5fde3690      5
 89        20  20bbe8f243a6b0e3b0f5e55f24e4d774      1
 249        8  4eff44f01234a4675df14565a1d04436      2
 349        0  ea25b5c6c23052f276e5826d0f0f4d95      3
 ...      ...                               ...    ...
 20746      5  d1cfc4aef268ca5c5f07b68af1ba31e7     10
 20754      5  4c028ff61b80ab53a9ebafd147a49aa1      7
 20780      0  41c127efc9e3d54409b2d33d00e7998b     10
 20804      2  b53a7ab12ef020f2f57e1c7ca41d3b1d     10
 20809      5  3d1b889207881657a7f0e12b6cf447a5      7
 
 [252 rows x 3 columns],
 0.9879072892173328,
 20587,
 20839,
 252]

In [1]:
import pandas as pd
import numpy as np
import os


def correlation_calculating(file1: str, file2: str) -> list:
    """Return ``[agreement_ratio, matching_rows, total_rows]`` for two label files.

    Both files are read as headerless, one label per line. ``total_rows`` is
    the row count of ``file1``.
    """
    read_options = dict(header=None, names=['label'], encoding='utf8')
    result1 = pd.read_csv(file1, **read_options)
    result2 = pd.read_csv(file2, **read_options)

    agrees = result1['label'] == result2['label']
    total = result1.shape[0]
    same = int(agrees.sum())

    return [same / total, same, total]


def get_files(file_path='./submit') -> list:
    """Recursively collect files whose name contains '.txt' or '.csv1'.

    Returned paths use forward slashes regardless of platform.
    NOTE(review): '.csv1' matches no ordinary '.csv' file; this presumably
    disables CSV collection on purpose — confirm.
    """
    return [
        os.path.join(folder, name).replace('\\', '/')
        for folder, _subdirs, names in os.walk(file_path)
        for name in names
        if '.txt' in name or '.csv1' in name
    ]


def get_correlations(file_path='./submit', threshold=None, exclude_files=None) -> list:
    """Compute the pairwise agreement of every prediction file under ``file_path``.

    Args:
        file_path: Directory scanned with ``get_files``.
        threshold: If given, only pairs whose agreement is >= threshold are kept.
        exclude_files: Optional paths to drop from the scanned file list.

    Returns:
        A list of ``(correlation, file1_tail, file2_tail)`` tuples, where the
        tails are the last 18 characters of each path.
    """
    candidates = get_files(file_path)

    if exclude_files is not None:
        for excluded in exclude_files:
            if excluded in candidates:
                candidates.remove(excluded)

    correlations = []
    # Every unordered pair exactly once: each file against the files after it.
    for position, file1 in enumerate(candidates[:-1]):
        for file2 in candidates[position + 1:]:
            correlation = correlation_calculating(file1, file2)[0]
            keep = threshold is None or correlation >= threshold
            if keep:
                correlations.append((correlation, file1[-18:], file2[-18:]))

    return correlations


def get_elem(elem):
    """Sort key: return the first item of ``elem``."""
    first = elem[0]
    return first


# Rank every pair of prediction files found under ./results/61/ by agreement,
# highest first, and show the result as a table.
# NOTE(review): the exclude_files entry lives under ./submit/, so it presumably
# never matches a file collected from ./results/61/ — confirm it is still needed.
res = get_correlations(file_path='./results/61/', exclude_files=['./submit/submit (tf-idf).csv'])
res.sort(reverse=True, key=get_elem)
pd.DataFrame(data=res, columns=['correlation', 'file1', 'file2'])

Unnamed: 0,correlation,file1,file2
0,0.999232,test_results_0.txt,test_results_3.txt
1,0.999184,test_results_0.txt,test_results_4.txt
2,0.99904,test_results_0.txt,test_results_2.txt
3,0.99904,test_results_2.txt,test_results_3.txt
4,0.99904,test_results_2.txt,test_results_4.txt
5,0.998992,test_results_3.txt,test_results_4.txt
6,0.998896,test_results_0.txt,test_results_1.txt
7,0.998848,test_results_1.txt,test_results_2.txt
8,0.998752,test_results_1.txt,test_results_3.txt
9,0.998704,test_results_1.txt,test_results_4.txt


# 提交结果投票融合

In [None]:
import pandas as pd
import numpy as np
import os


def get_files(file_path='./submit') -> list:
    """Recursively collect files whose name contains '.txt' or '.csv'.

    Returned paths use forward slashes regardless of platform.
    """
    return [
        os.path.join(folder, name).replace('\\', '/')
        for folder, _subdirs, names in os.walk(file_path)
        for name in names
        if '.txt' in name or '.csv' in name
    ]


def voting_fusion(files: list, output_file='./submit/submit (voting fusion).csv', exclude_files=None,
                  test_json='data/testA.json', num_classes=36, matrix_file='voting_matrix.txt'):
    """Fuse several prediction files by per-row majority vote.

    Args:
        files: Paths of the prediction files. ``.txt`` files are read as
            headerless, one label per line; other files as CSV with a ``label``
            column. The list is not modified.
        output_file: Where the fused submission (``id,label``) is written. If
            this path is in ``files`` it is left out of the vote.
        exclude_files: Optional collection of paths to leave out of the vote.
        test_json: JSON-lines file that supplies the ``id`` column.
        num_classes: Number of distinct labels (width of the vote matrix).
        matrix_file: Where the vote-count matrix is dumped, one row per line.

    Returns:
        ``(output_file, count_list)`` where ``count_list[row][label]`` is the
        number of votes, or ``None`` when there is nothing to fuse.
    """
    if files is None or len(files) == 0:
        return None

    # Filter into a new list instead of calling .remove() on the caller's list.
    skipped = {output_file}
    if isinstance(exclude_files, (list, tuple, set)):
        skipped.update(exclude_files)
    voters = [path for path in files if path not in skipped]
    if not voters:
        return None

    print(voters)

    def _read_labels(path):
        # Decide by extension, not by 'txt' appearing anywhere in the path.
        if path.endswith('.txt'):
            return pd.read_csv(path, encoding='utf8', header=None, names=['label'])
        return pd.read_csv(path, encoding='utf8')

    testA_df = pd.read_json(test_json, encoding='utf8', lines=True)

    count = None
    for f in voters:
        print('processing:', f)
        voted = _read_labels(f)['label'].to_numpy()
        if count is None:
            # The first file fixes the number of rows being voted on.
            count = np.zeros((len(voted), num_classes), np.int64)
        count[np.arange(len(voted)), voted] += 1

    count_list = count.tolist()
    with open(matrix_file, mode='w', encoding='utf8') as f:
        f.writelines(str(row) + '\n' for row in count_list)

    # argmax once, after all votes are in; ties go to the smallest label id.
    # Assigning a fresh Series avoids chained `.iloc` assignment, which is a
    # no-op under pandas Copy-on-Write.
    testA_df['label'] = pd.Series(count.argmax(axis=1))
    testA_df[['id', 'label']].to_csv(output_file, index=False)
    print('融合结果已保存：', output_file)
    return output_file, count_list


# Fuse every prediction file found under ./submit/64/ into one voted submission.
# NOTE(review): the exclude_files entry points to ../results/28/, so it presumably
# never matches a path collected from ./submit/64/ — confirm it is still needed.
file_list = get_files(file_path='./submit/64/')
voting_fusion(file_list, output_file='./submit/64/submit (voting fusion).csv',
              exclude_files=['../results/28/submit (6122 result_2 6122).csv'])

In [None]:
# Load three earlier submissions as headerless single-column frames.
# NOTE(review): df2 and df3 read the same file — looks like a copy-paste slip;
# confirm which file df3 was meant to load.
# NOTE(review): these are .csv submissions but are read with header=None and a
# single column name; if they carry an `id,label` header the first row will be
# the header text — verify the file layout.
df1 = pd.read_csv('submit/submit (5fold + nezha-large-wwm + attention + fgm).csv', header=None, names=['label'],
                  encoding='utf8')
df2 = pd.read_csv('submit/submit (5fold-attention).csv', header=None, names=['label'], encoding='utf8')
df3 = pd.read_csv('submit/submit (5fold-attention).csv', header=None, names=['label'], encoding='utf8')

# 高置信度样本（伪标签）

In [None]:
import pandas as pd
import numpy as np

# Pseudo-labels from the 5fold + nezha-large-wwm + attention + fgm model,
# restricted to predictions with probability above 85%.
ids = pd.read_csv("results/59/6/high_confidence_ids.txt", encoding='utf8', header=None,
                  names=['high_confidence_id'])['high_confidence_id'].tolist()
label = pd.read_csv("results/59/6/test_results.txt", encoding='utf8', header=None, names=['label'])
# NOTE(review): relies on a `testA_df` that already has a `text` column from an
# earlier cell; the reload below is disabled.
# testA_df = pd.read_json('./data/testA.json', encoding='utf8', lines=True)
testA_df['label'] = label['label']

# Write "<label>\t<text>" for every high-confidence row, replacing <i>/</i> tags with spaces.
with open('data/testA_labeled_6440.tsv', mode='w', encoding='utf8') as f:
    for row_id in ids:
        # Only label and text are written, so only those are looked up. Distinct
        # names avoid shadowing the builtin `id` and overwriting the `label` frame.
        row_text = str(testA_df.loc[row_id, 'text']).replace('<i>', ' ').replace('</i>', ' ')
        row_label = testA_df.loc[row_id, 'label']
        f.write(str(row_label) + '\t' + row_text + '\n')

testA_df.loc[[ids[0]]]

In [5]:
import pandas as pd
import numpy as np
import re

# Raw string: '\s' in a plain literal is an invalid escape sequence (a
# SyntaxWarning on recent Python versions). The value is unchanged.
# NOTE(review): `pattern` is not used below; the replace call uses its own regex.
pattern = r'\s+|<i>|</i>|<br>'

# Attach the post-processed predictions to test set A and export "label, text" as TSV.
testA_df = pd.read_json('data/testA.json', encoding='utf8', lines=True)
label = pd.read_csv("results/65/4/2e-5/ema/simplify/submit (6484_backtrans_sampling24_abstract(simplify)_4epochs_rdrop_bigru_2e-5_ema result_0 6495 post-precessing 6495).csv", encoding='utf8')
testA_df['label'] = label['label']
testA_df['title'] = testA_df['title'].apply(lambda x: x.strip())
testA_df['abstract'] = testA_df['abstract'].fillna('').apply(lambda x: x.strip())
# text = lower-cased title + '<sep>' + lower-cased abstract, with <i>/</i> tags removed.
testA_df['text'] = testA_df['title'].str.lower() + '<sep>' + testA_df['abstract'].str.lower()
testA_df['text'] = testA_df['text'].replace('<i>|</i>', '', regex=True)
testA_df[['label', 'text']].to_csv('data/test/normal/testA_6495_post.tsv', index=None, sep='\t')

testA_df

Unnamed: 0,id,title,assignee,abstract,label,text
0,003fd481e65ddc070e38ae05002e16e2,一种耐磨、抗粘钢复合涂层、制备方法及应用,安徽马钢表面技术股份有限公司,本发明公开了一种耐磨、抗粘钢复合涂层、制备方法及应用，包括基体和基体上由内到外依次设置的过渡...,23,一种耐磨、抗粘钢复合涂层、制备方法及应用<sep>本发明公开了一种耐磨、抗粘钢复合涂层、制备...
1,549a1cd8228bd10f18395a0625fcc70d,一种用于提高橡胶抗湿滑性的树脂的制备方法及其应用,江苏麒祥高新材料有限公司,本发明公开了一种用于提高橡胶抗湿滑性的树脂的制备方法，第一步：将R树脂和B官能团化合物进行反...,5,一种用于提高橡胶抗湿滑性的树脂的制备方法及其应用<sep>本发明公开了一种用于提高橡胶抗湿滑...
2,f09c4c0332f8966400e06f4def9f1a6d,有机硅改性丙烯酸树脂超亲水防雾涂料及其制作方法,重庆大学,本发明涉及涂料制造领域，本发明公开了一种含有磺酸季铵盐的有机硅改性丙烯酸树脂超亲水低温防雾涂...,5,有机硅改性丙烯酸树脂超亲水防雾涂料及其制作方法<sep>本发明涉及涂料制造领域，本发明公开了...
3,06598dd8f3ab092acf2a55dce8be5621,一种空调系统及其控制方法、控制装置,海尔智家股份有限公司,本发明涉及空调领域，公开了一种空调系统，包括室外机和太阳能供热系统，所述太阳能供热系统包括：...,2,一种空调系统及其控制方法、控制装置<sep>本发明涉及空调领域，公开了一种空调系统，包括室外...
4,e70177ba6a54d08abecd80a60fdd9f52,资源申请、分配方法，UE及网络控制单元,中兴通讯股份有限公司,本发明实施例公开了一种资源申请方法及装置，所述方法包括：向网络控制单元发送低时延业务信息；接...,0,资源申请、分配方法，ue及网络控制单元<sep>本发明实施例公开了一种资源申请方法及装置，所...
...,...,...,...,...,...,...
20834,befab80c8c6cf6f8db5a4ee3b9e22020,由低合金碳钢制成的螺钉和制造该螺钉的方法,伊卓特有限两合公司,本发明涉及一种螺钉，该螺钉具有头部、邻接的保持段和功能端。所述螺钉用于自攻螺钉。所述功能端的...,23,由低合金碳钢制成的螺钉和制造该螺钉的方法<sep>本发明涉及一种螺钉，该螺钉具有头部、邻接的...
20835,b41abe927240b1ab73b1cb0fca2d9970,一种铸造铝合金及其制备方法,中国兵器工业第五九研究所,本发明提供了一种铸造铝合金及其制备方法，铸造铝合金成分包括：Si：7.5～8.5%、Cu：2...,13,一种铸造铝合金及其制备方法<sep>本发明提供了一种铸造铝合金及其制备方法，铸造铝合金成分包...
20836,c0fd3051ce51166e80b9922c97e2f7a4,一种显示面板及生成随机图块坐标的方法,上海天马微电子有限公司,本发明提供一种显示面板，显示面板的图块具有随机排布的特性，能够消除由于周期性排布带来的鬼影问...,10,一种显示面板及生成随机图块坐标的方法<sep>本发明提供一种显示面板，显示面板的图块具有随机...
20837,4c89a2b2bd405456e316a35411297b0f,一种确定机器类通信下行控制信道重复次数的方法及基站,电信科学技术研究院,本发明实施例涉及无线通信技术领域，特别涉及一种确定机器类通信下行控制信道重复次数的方法及基站...,0,一种确定机器类通信下行控制信道重复次数的方法及基站<sep>本发明实施例涉及无线通信技术领域...


In [2]:
import pandas as pd
import numpy as np

# Raw string: '\s' in a plain literal is an invalid escape sequence (a
# SyntaxWarning on recent Python versions). The value is unchanged.
# NOTE(review): `pattern` is not used below; the replace call uses its own regex.
pattern = r'\s+|<i>|</i>|<br>'

# Attach predictions to test set B and export "label, text" as TSV.
# The frame keeps the name `testA_df` although it holds data/testB.json.
testA_df = pd.read_json('data/testB.json', encoding='utf8', lines=True)
label = pd.read_csv("results/B/1(testA6495)/test_results_0.txt", encoding='utf8', header=None, names=['label'])
testA_df['label'] = label['label']
testA_df['title'] = testA_df['title'].apply(lambda x: x.strip())
testA_df['abstract'] = testA_df['abstract'].fillna('').apply(lambda x: x.strip())
# text = lower-cased title + '<sep>' + lower-cased abstract, with <i>/</i> tags removed.
testA_df['text'] = testA_df['title'].str.lower() + '<sep>' + testA_df['abstract'].str.lower()
testA_df['text'] = testA_df['text'].replace('<i>|</i>', '', regex=True)
testA_df[['label', 'text']].to_csv('data/test/normal/testB_6495.tsv', index=None, sep='\t')

testA_df

Unnamed: 0,id,title,assignee,abstract,label,text
0,487fdc38d8b39d35ba18206b835e57e3,建立灵活以太网路径的方法和网络设备,华为技术有限公司,本申请提供了一种建立FlexE路径的方法和网络设备，能够降低节点的控制面的管理复杂度和信令开...,0,建立灵活以太网路径的方法和网络设备<sep>本申请提供了一种建立flexe路径的方法和网络设...
1,32e267e4a7bc6bd000fcd45616e448d7,一种程序代码标记方法及装置,阿里巴巴集团控股有限公司,本申请公开了一种程序代码标记方法及装置，该方法包括：顺序读取程序代码，当读取到程序代码中包含...,4,一种程序代码标记方法及装置<sep>本申请公开了一种程序代码标记方法及装置，该方法包括：顺序...
2,9b808e8060a5e84886e4fb1bf94699d4,一种新型环境保护净化器,李菊红,本实用新型适用于环境保护技术领域，提供了一种新型环境保护净化器，包括机体组件、驱动组件和过滤...,7,一种新型环境保护净化器<sep>本实用新型适用于环境保护技术领域，提供了一种新型环境保护净化...
3,1e5366849744046a289799101f74494c,一种具有屏蔽功能的密闭门铰链,江苏龙腾门业有限公司,本实用新型公开了一种具有屏蔽功能的密闭门铰链，包括门边框，门边框上部通过螺栓固定有两个上铰链...,2,一种具有屏蔽功能的密闭门铰链<sep>本实用新型公开了一种具有屏蔽功能的密闭门铰链，包括门边...
4,7827bbef8b8090da1e47beb46713d086,透明高温蒸煮膜用聚丙烯组合物及其制备方法,中国石油化工股份有限公司,本发明涉及一种透明高温蒸煮膜用聚丙烯组合物及其制备方法，由如下重量份数的原料制成：抗冲共聚聚...,5,透明高温蒸煮膜用聚丙烯组合物及其制备方法<sep>本发明涉及一种透明高温蒸煮膜用聚丙烯组合物...
...,...,...,...,...,...,...
20885,fc494f75e1db775918129eecdf2cef91,一种无人机带状正射影像航测方法及系统,广州中科云图智能科技有限公司,本发明涉及一种无人机带状正射影像航测方法及系统，包括获取航测起始点、拐角点和结束点的坐标位置...,6,一种无人机带状正射影像航测方法及系统<sep>本发明涉及一种无人机带状正射影像航测方法及系统...
20886,c890d6119d344124b21612a6b350ae55,星载双通道多频段可选上变频装置,上海航天测控通信研究所,本发明提供了一种星载双通道多频段可选上变频装置，包括开关选择模块、上变频模块、综合接口模块和...,26,星载双通道多频段可选上变频装置<sep>本发明提供了一种星载双通道多频段可选上变频装置，包括...
20887,56095253b1b9ec1ab9fa91ec16bfff81,船用管道铺设摩擦夹紧装置,伊特里克公司,本发明涉及一种用于摩擦保持投放到海中的离岸海底管道(30)的船用管道铺设摩擦夹紧装置(10；...,33,船用管道铺设摩擦夹紧装置<sep>本发明涉及一种用于摩擦保持投放到海中的离岸海底管道(30)...
20888,a088ab6b6a8d43e90e1c181abc31411b,一种河道污染底泥生态修复方法,山东建筑大学,本发明的河道污染底泥生态修复方法，是在河道岸边设置底泥堆放区，对底泥堆放区的底部及四周进行防...,7,一种河道污染底泥生态修复方法<sep>本发明的河道污染底泥生态修复方法，是在河道岸边设置底泥...


# 数据增强（后处理）

In [3]:
# Keyword strings (kept under the name `labels`) that other cells search for
# inside the abstracts.
labels = [
    '农业', '基因', '养殖', '信息', '材料', '化学', '智能',
    '算法', '环保', '医疗', '电子', '降解', '回收', '光伏',
    '风电', '航空', '航天', '新能源', '环境', '污染', '纳米',
]
print(labels)

['农业', '基因', '养殖', '信息', '材料', '化学', '智能', '算法', '环保', '医疗', '电子', '降解', '回收', '光伏', '风电', '航空', '航天', '新能源', '环境', '污染', '纳米']


In [40]:
import pandas as pd
import numpy as np

# For every keyword, and every pair of keywords, list the sorted training
# label ids of the abstracts that contain it (or both).
train_df = pd.read_json('data/train.json', encoding='utf8', lines=True)
title, abstract = train_df['title'], train_df['abstract']

# One boolean mask per keyword, computed once and reused for all pairs.
# regex=False: keywords are literal text. na=False: a missing abstract counts
# as "no match" instead of breaking the boolean indexing.
keyword_masks = {label: abstract.str.contains(label, regex=False, na=False) for label in labels}

for label in labels:
    print('*' * 20 + label + '*' * 20)
    print(train_df.loc[keyword_masks[label], 'label_id'].sort_values().tolist())

for i in range(len(labels) - 1):
    for j in range(i + 1, len(labels)):
        both = keyword_masks[labels[i]] & keyword_masks[labels[j]]
        results = train_df.loc[both, 'label_id'].sort_values().tolist()
        if len(results) > 0:
            print('*' * 20 + labels[i] + '&' + labels[j] + '*' * 20)
            print(results)

********************农业********************
[1, 2, 7, 11]
********************基因********************
[1, 1, 1, 1, 1, 1, 8, 8, 20, 20, 20, 30]
********************养殖********************
[1, 2, 2, 2, 7]
********************信息********************
[0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9, 10, 10, 11, 11, 15, 15, 15, 19, 19, 24, 24, 26, 27, 27, 27, 32, 32, 32, 32, 34, 35, 35, 35, 35, 35]
********************材料********************
[1, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 7, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 15, 16, 18, 18, 18, 18, 18, 20, 20, 21, 21, 21, 21, 21, 21, 21, 22, 23, 23, 24, 24, 25, 25, 25, 25, 30]
********************化学********************
[1, 2, 5, 5, 5, 7, 8, 10, 13, 13, 13, 14, 16, 18, 20, 20, 22, 23, 23, 23, 23, 23, 23, 23, 25, 

In [41]:
import pandas as pd
import numpy as np

# For every keyword, and every pair of keywords, list the sorted predicted
# labels of the test-set-A abstracts that contain it (or both).
testA_df = pd.read_json('data/testA.json', encoding='utf8', lines=True)
submit_df = pd.read_csv('results/65/4/2e-5/ema/simplify/submit (6484_backtrans_sampling24_abstract(simplify)_4epochs_rdrop_bigru_2e-5_ema result_0 6495).csv', encoding='utf8')

# One boolean mask per keyword, computed once and reused for all pairs.
# regex=False: keywords are literal text. na=False: a missing abstract counts
# as "no match" instead of breaking the boolean indexing.
keyword_masks = {label: testA_df['abstract'].str.contains(label, regex=False, na=False) for label in labels}

for label in labels:
    print('*' * 20 + label + '*' * 20)
    print(submit_df.loc[keyword_masks[label], 'label'].sort_values().tolist())

for i in range(len(labels) - 1):
    for j in range(i + 1, len(labels)):
        both = keyword_masks[labels[i]] & keyword_masks[labels[j]]
        results = submit_df.loc[both, 'label'].sort_values().tolist()
        if len(results) > 0:
            print('*' * 20 + labels[i] + '&' + labels[j] + '*' * 20)
            print(results)

********************农业********************
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 6, 6, 6, 7, 7, 7, 7, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 14, 14, 16, 18, 26, 30]
********************基因********************
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [5]:
import pandas as pd
import numpy as np

# Show the predictions for abstracts that mention both '回收' and '降解'
# but not '污水'.
testA_df = pd.read_json('data/testA.json', encoding='utf8', lines=True)
# submit_df = pd.read_csv('results/65/4/2e-5/ema/simplify/submit (6484_backtrans_sampling24_abstract(simplify)_4epochs_rdrop_bigru_2e-5_ema result_0 6495).csv', encoding='utf8')
submit_df = pd.read_csv('results/66/6/submit (6495_backtrans_sampling24_abstract(simplify_title_new)_4epochs_rdrop_bigru_3e-5_ema result_0).csv', encoding='utf8')

# submit_df[testA_df['abstract'].str.contains('航空') & ~testA_df['abstract'].str.contains('等行业') & ~testA_df['abstract'].str.contains('航天')]

has_first_keyword = testA_df['abstract'].str.contains('回收')
has_second_keyword = testA_df['abstract'].str.contains('降解')
has_excluded_keyword = testA_df['abstract'].str.contains('污水')
submit_df[has_first_keyword & has_second_keyword & ~has_excluded_keyword]

# abstracts = np.asarray(testA_df['abstract']).transpose().tolist()
# count = 0
# for i in range(len(abstracts)):
#     if str(abstracts[i]).__contains__('光伏'):
#         submit_df['label'].iloc[i] = 16
#         count += 1
# submit_df.to_csv('submit/submit (6453 药).csv', encoding='utf8', index=False)
# count

Unnamed: 0,id,label
1379,dfd6fcfe04ac4d9a42fd6a3b806271a7,14
3776,25fce71aa5b5e66c142f66bc7fe32f2b,14
4163,6af1438c9ac2133864da20ae14684261,14
5179,ab70475f4304acb9c664b6bfaf7e9ccc,14
8588,8603f27b4583cb48a1d86ed287d499f3,14
9162,54c4deddb11162c46e131c42c6b816ef,7
10215,0c59e65650e75c9a25caff6520c1e108,14
14970,a060a210e933524b4f60f98a90eb7059,14
15573,145a381a3627a4cff22d3b198fa2b4a9,7
17103,1c3d82c32db277b4d08c94d5bdf26895,7


# 结果输出

In [4]:
import pandas as pd

# Pair the test-set-A ids with the run 66/6 predictions and write the submission CSV.
testA_df = pd.read_json('./data/testA.json', encoding='utf8', lines=True)
predicted = pd.read_csv('results/66/6/test_results_0.txt', encoding='utf8', header=None, names=['label'])
testA_df['label'] = predicted['label']
submission = testA_df[['id', 'label']]
submission.to_csv('results/66/6/submit (6495_backtrans_sampling24_abstract(simplify_title_new)_4epochs_rdrop_bigru_3e-5_ema result_0).csv', index=None)

In [1]:
import pandas as pd

# Pair the test-set-B ids with the predictions and write the submission CSV.
# The frame keeps the name `testA_df` although it holds data/testB.json.
testA_df = pd.read_json('./data/testB.json', encoding='utf8', lines=True)
predicted = pd.read_csv('results/B/5(sampling49+50+2)/test_results_0.txt', encoding='utf8', header=None, names=['label'])
testA_df['label'] = predicted['label']
submission = testA_df[['id', 'label']]
submission.to_csv('results/B/5(sampling49+50+2)/submit (test5_sampling49+50+2).csv', index=None)