# TF-IDF

In [None]:
#安装相关依赖库 如果是windows系统，cmd命令框中输入pip安装，参考上述环境配置
#!pip install sklearn
#!pip install pandas
#---------------------------------------------------
#导入库
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

train_df = pd.read_json('./data/train.json', encoding='utf8', lines=True)
testA_df = pd.read_json('./data/testA.json', encoding='utf8', lines=True)

#----------------特征工程----------------
#将论文的标题与摘要组合为 text 特征
train_df['title'] = train_df['title'].apply(lambda x: x.strip())
train_df['abstract'] = train_df['abstract'].fillna('').apply(lambda x: x.strip())
train_df['text'] = train_df['title'].str.lower() + '[SEP]' + train_df['abstract'].str.lower()

testA_df['title'] = testA_df['title'].apply(lambda x: x.strip())
testA_df['abstract'] = testA_df['abstract'].fillna('').apply(lambda x: x.strip())
testA_df['text'] = testA_df['title'].str.lower() + '[SEP]' + testA_df['abstract'].str.lower()

#使用tfidf算法做文本特征提取
tfidf = TfidfVectorizer(max_features=2500)

#----------------模型训练----------------

train_tfidf = tfidf.fit_transform(train_df['text'])
clf = SGDClassifier()
cross_val_score(clf, train_tfidf, train_df['label_id'], cv=5)

test_tfidf = tfidf.transform(testA_df['text'])
clf = SGDClassifier()
clf.fit(train_tfidf, train_df['label_id'])
testA_df['label'] = clf.predict(test_tfidf)

#----------------结果输出----------------
testA_df[['id', 'label']].to_csv('submit/submit (tf-idf).csv', index=None)

In [None]:
# coding=utf-8
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


# 训练集中词频统计，并计算TF值
def words_tf():
    train_data = pd.read_csv('data/train.tsv', sep='\t').astype(str)
    sentence_list = []
    lenth = len(train_data)
    for i in range(lenth):
        sentence_list.append(str(train_data['text'][i]).split())
    # 总词频统计
    doc_frequency = defaultdict(int)
    for word_list in sentence_list:
        for i in word_list:
            doc_frequency[i] += 1

    # 计算每个词的TF值
    word_tf = {}  # 存储每个词的tf值
    for i in doc_frequency:
        word_tf[i] = doc_frequency[i] / sum(doc_frequency.values())

    words_tf = sorted(word_tf.items(), key=lambda x: x[1], reverse=True)
    return words_tf[:10000]


# 根据词频，将文本转换为向量
def word2vec(keywords_tf, doc_sentence):
    keywords = list(dict(keywords_tf).keys())  # 获取关键词
    tf_weight = list(dict(keywords_tf).values())  # 获取关键词tf值

    docvec_list = []
    for sentence in doc_sentence:
        docvec = [0] * len(keywords_tf)
        for word in sentence:
            if word in keywords:
                docvec[keywords.index(word)] = tf_weight[keywords.index(word)]
        docvec_list.append(docvec)
    return docvec_list


# 将训练集和测试集换为文本向量
def doc_vec(x_train, x_test):
    keywords_tf = words_tf()  # 获取词频关键词

    # 训练集转换为向量
    train_lenth = len(x_train)
    train_data_list = []
    for i in range(train_lenth):
        train_data_list.append(str(x_train[i]).split())
    train_docvec_list = word2vec(keywords_tf, train_data_list)

    # 测试集转换为向量
    test_lenth = len(x_test)
    test_data_list = []
    for i in range(test_lenth):
        test_data_list.append(str(x_test[i]).split())
    test_docvec_list = word2vec(keywords_tf, test_data_list)

    return train_docvec_list, test_docvec_list


if __name__ == '__main__':
    train_data = pd.read_csv('data/train.tsv', sep='\t').astype(str)
    x_train, x_test, y_train, y_test = train_test_split(train_data['text'], train_data['label_id'], test_size=0.05)

    x_train = np.array(x_train)
    x_test = np.array(x_test)
    cw = lambda x: int(x)
    y_train = np.array(y_train.apply(cw))
    y_test = np.array(y_test.apply(cw))

    x_train, x_test = doc_vec(x_train, x_test)  # 训练集和测试集向量化
    x_train, y_train = shuffle(x_train, y_train, random_state=0)  # 打乱顺序

    # 导入SelectFromModel结合ExtraTreesClassifier计算特征重要性，并按重要性阈值选择特征。
    clf_model = ExtraTreesClassifier(n_estimators=250, random_state=0)
    # clf_model=RandomForestClassifier(n_estimators=250,random_state=0)
    clf_model.fit(x_train, y_train)
    # 获取每个词的特征权重,数值越高特征越重要l
    importances = clf_model.feature_importances_

    '''
    # 将词和词的权重存入字典并写入文件
    feature_words_dic = {}
    for i in range(len(words_list)):
        feature_words_dic[words_list[i][0]] = importances[i]
    # 对字典按权重由大到小进行排序
    words_info_dic_sort = sorted(feature_words_dic.items(), key=lambda x: x[1], reverse=True)
    #将前2000个词的权重字典写入文件
    key_words_importance=dict(words_info_dic_sort[:2000])
    with open('data/key_words_importance','w') as f:
        f.write(str(key_words_importance))
    '''

    # 选择特征重要性为1.5倍均值的特征
    model = SelectFromModel(clf_model, threshold='1.5*mean', prefit=True)
    x_train_new = model.transform(x_train)  # 返回训练集所选特征
    x_test_new = model.transform(x_test)  # 返回测试集所选特征

    print(x_train_new.shape)
    print(x_test_new.shape)

    # 创建成lgb特征的数据集格式
    lgb_train = lgb.Dataset(x_train_new, y_train)
    lgb_val = lgb.Dataset(x_test_new, y_test, reference=lgb_train)

    # 构建lightGBM模型
    params = {'max_depth': 6, 'min_data_in_leaf': 20, 'num_leaves': 35, 'learning_rate': 0.1, 'lambda_l1': 0.1,
              'lambda_l2': 0.2, 'objective': 'multiclass', 'num_class': 36, 'verbose': -1}
    # 设置迭代次数，默认为100，通常设置为100+
    num_boost_round = 2000
    # 训练lightGBM模型
    gbm = lgb.train(params, lgb_train, num_boost_round, verbose_eval=100, valid_sets=lgb_val)

    # 保存模型到文件
    # gbm.save_model('data/lightGBM_model')

    # 预测数据集
    result = gbm.predict(x_test_new, num_iteration=gbm.best_iteration)
    y_predict = np.argmax(result, axis=1)  # 获得最大概率对应的标签

    label_all = [i for i in range(25)]
    confusion_mat = metrics.confusion_matrix(y_test, y_predict)
    df = pd.DataFrame(confusion_mat, columns=label_all)
    df.index = label_all

    print('准确率：', metrics.accuracy_score(y_test, y_predict))
    print('confusion_matrix:', df)
    print('分类报告:', metrics.classification_report(y_test, y_predict))

# 5 折投票融合

In [5]:
import numpy as np
import pandas as pd

output_file = 'submit/submit (5fold-base-attention-fgm-labeled-p_tuning16-bert).csv'

df0 = pd.read_csv('results/20/test_results_0.txt', header=None, names=['label'], encoding='utf8')
df1 = pd.read_csv('results/20/test_results_1.txt', header=None, names=['label'], encoding='utf8')
df2 = pd.read_csv('results/20/test_results_2.txt', header=None, names=['label'], encoding='utf8')
df3 = pd.read_csv('results/20/test_results_3.txt', header=None, names=['label'], encoding='utf8')
df4 = pd.read_csv('results/20/test_results_4.txt', header=None, names=['label'], encoding='utf8')
df5 = pd.read_csv('submit/submit (voting fusion 0.5885).csv', encoding='utf8')
df6 = pd.read_csv('submit/submit (voting fusion 0.5940).csv', encoding='utf8')
# df7 = pd.read_csv('results/18/test_results_5.txt', header=None, names=['label'], encoding='utf8')
# df8 = pd.read_csv('results/18/test_results_6.txt', header=None, names=['label'], encoding='utf8')
# df5 = pd.read_csv('results/8/test_results_0.txt', header=None, names=['label'], encoding='utf8')
# df6 = pd.read_csv('results/8/test_results_1.txt', header=None, names=['label'], encoding='utf8')
# df7 = pd.read_csv('results/8/test_results_2.txt', header=None, names=['label'], encoding='utf8')
# df8 = pd.read_csv('results/8/test_results_3.txt', header=None, names=['label'], encoding='utf8')
# df9 = pd.read_csv('results/8/test_results_4.txt', header=None, names=['label'], encoding='utf8')
# df10 = pd.read_csv('results/12/test_results_0.txt', header=None, names=['label'], encoding='utf8')
# df11 = pd.read_csv('results/12/test_results_1.txt', header=None, names=['label'], encoding='utf8')
# df12 = pd.read_csv('results/12/test_results_2.txt', header=None, names=['label'], encoding='utf8')
# df13 = pd.read_csv('results/12/test_results_3.txt', header=None, names=['label'], encoding='utf8')
# df14 = pd.read_csv('results/14/test_results_4.txt', header=None, names=['label'], encoding='utf8')
testA_df = pd.read_json('data/testA.json', encoding='utf8', lines=True)

count = np.zeros((df0.shape[0], 36), np.int64)
df_out = pd.DataFrame(data=df0)
for i in range(df0.shape[0]):
    count[i][df0['label'].iloc[i]] += 1
    count[i][df1['label'].iloc[i]] += 1
    count[i][df2['label'].iloc[i]] += 1
    count[i][df3['label'].iloc[i]] += 1
    count[i][df4['label'].iloc[i]] += 1
    # count[i][df5['label'].iloc[i]] += 1
    # count[i][df6['label'].iloc[i]] += 1
    # count[i][df7['label'].iloc[i]] += 1
    # count[i][df8['label'].iloc[i]] += 1
    # count[i][df9['label'].iloc[i]] += 1
    df_out['label'].iloc[i] = np.argmax(count[i])

testA_df['label'] = df_out['label']
testA_df[['id', 'label']].to_csv(output_file, index=None)
df_out

Unnamed: 0,label
0,23
1,5
2,5
3,16
4,0
...,...
20834,2
20835,13
20836,10
20837,0


In [None]:
import numpy as np
import pandas as pd

output_file = 'submit/submit (5fold-base-attention-fgm-labeled-3).csv'

df = pd.read_csv('results/test_results_3 (base + attention + fgm + labeled).txt', header=None, names=['label'],
                 encoding='utf8')
testA_df = pd.read_json('data/testA.json', encoding='utf8', lines=True)
testA_df['label'] = df['label']
testA_df[['id', 'label']].to_csv(output_file, index=None)
df

# 结果相关性计算

In [1]:
import pandas as pd
import os


def correlation_calculating(file1: str, file2: str) -> list:
    result = []
    if file1.__contains__('txt'):
        result1 = pd.read_csv(file1, header=None, names=['label'], encoding='utf8')
    else:
        result1 = pd.read_csv(file1, encoding='utf8')
    if file2.__contains__('txt'):
        result2 = pd.read_csv(file2, header=None, names=['label'], encoding='utf8')
    else:
        result2 = pd.read_csv(file2, encoding='utf8')
    same, sum = result1[result1['label'] == result2['label']].shape[0], result1.shape[0]
    # result = result2[result1['label'] != result2['label']]
    result = pd.concat((result1[result1['label'] != result2['label']], result2[result1['label'] != result2['label']]), axis=1)
    correlation = same / sum

    return [result, correlation, same, sum, sum - same]


# correlation_calculating("results/35/test_results_3.txt", "results/36/test_results_0.txt"), correlation_calculating(
#     "results/35/test_results_1.txt", "results/36/test_results_0.txt"), correlation_calculating(
#     "results/35/test_results_3.txt", "results/36/test_results_1.txt"), correlation_calculating(
#     "results/35/test_results_1.txt", "results/36/test_results_1.txt"), correlation_calculating(
#     "results/35/test_results_3.txt", "results/36/test_results_2.txt"), correlation_calculating(
#     "results/35/test_results_1.txt", "results/36/test_results_2.txt"), correlation_calculating(
#     "results/35/test_results_3.txt", "results/36/test_results_3.txt"), correlation_calculating(
#     "results/35/test_results_1.txt", "results/36/test_results_3.txt")

# correlation_calculating("results/43/epoch3/test_results_0.txt",
#                         "results/43/epoch3/test_results_2.txt")
correlation_calculating("results/59/4/submit (6440_split_epoch4_lr4e-5 result_4 manually 6440).csv",
                        "results/59/6/test_results_0.txt")

# correlation_calculating("submit/633/submit (voting fusion).csv",
#                         "submit/633/submit (6336oversampling_epoch4 result_4 6337).csv")

[                                     id  label  label
 423    8ec314f4958834efa866983154c942a3      2     33
 892    71122c339f20cd2c76dd573771979af4      8     10
 2048   da4868469de7b4cb6010ee083c917e84     20      5
 3358   1f256a467d438225c825ae30c7696541      1      2
 4161   4d6d6989352fb9e5dacde47b9370c3e5     20      5
 4452   09c81f349f9b8b18bf12a8bb998a31ac     31     13
 7105   c7a0538008b5ead69b0bf487ef37c8e2     20      5
 8193   9cde217da66268a493fa37dc1af9eac8      8     12
 8809   9895615c288b518ef321557f60cb3f25     17     10
 9124   852f9edcfd1596e76ade40fb862f490a      6     34
 10463  56c340e85ba38f75c72e8a087c114117     16      2
 12965  e86fc78f2d8dfc5215054aa4f970724c     20      5
 13151  f10d745f92262bdb009ccc45693a8729     20      5
 13579  d681c2f07781c61d3d3d9acb4aaf2eb4      1     20
 16041  419b9b5f429e66d2c8b40cf384b90f23     11      2
 19048  ac602c21880b8d8449ad30e261db484b     10     21
 20294  fe4c7b794424bdba715d40970a6c866a     18     16,
 0.999184

In [5]:
correlation_calculating("results/59/1/submit (6440_split_epoch4 result_0 manually 6437).csv",
                        "results/59/3/test_results_0.txt")

[                                     id  label  label
 423    8ec314f4958834efa866983154c942a3      2     33
 1372   a91e3c247ab15c33446dee123fc2fbb9     11      2
 1498   4003afee5069a719fac4e0591e4fdf03      6     11
 1840   16f83ac0ed66c561cc74568371a2c996     23     13
 2675   d2bc0409b1daa1f09a17d092b9bebbce      3      4
 2708   81a55ec1d6d68af6d8d31aee21dc2b18      6      4
 2894   24971b118ecba9cabf6d1a63b74619c0     11      2
 3089   5409d32e6037f8afca4e05063b9cfb94     13     18
 3247   a9cbe17620f86adc951519a7e5e45b51     28      2
 3318   87d4d9426d58a6af5c2bb0b92ce9bdeb      9     10
 3794   93eeb93bf899e116c172ca40d6bc5e80      0      4
 4508   c0f7b112baeaacd066c7c7143e80c779     14      7
 6582   fbf90e8d02aa4b78c647f58ec20a055c      9     10
 6667   b58a784d6ce704293ea18a409810131b      2     11
 6748   4a40c302fc436df1ef9cf6ac6c7ae132     14      7
 6925   fd4702d55866c1460fe443ee2125070d     14     18
 7220   9c73bb3e0675a3c0d8fecabd11f85f02     11      2
 7655   98

In [60]:
import pandas as pd
import numpy as np
import os


def correlation_calculating(file1: str, file2: str) -> list:
    result1 = pd.read_csv(file1, header=None, names=['label'], encoding='utf8')
    result2 = pd.read_csv(file2, header=None, names=['label'], encoding='utf8')
    same, sum = result1[result1['label'] == result2['label']].shape[0], result1.shape[0]
    correlation = same / sum

    return [correlation, same, sum]


def get_files(file_path='./submit') -> list:
    file_list = []

    for dirpath, dirnames, filenames in os.walk(file_path):
        for filename in filenames:
            file = os.path.join(dirpath, filename).replace('\\', '/')
            if filename.__contains__('.txt') or filename.__contains__('.csv1'):
                file_list.append(file)
    return file_list


def get_correlations(file_path='./submit', threshold=None, exclude_files=None) -> list:
    file_list = get_files(file_path)
    correlations = []

    if exclude_files is not None:
        for ef in exclude_files:
            if file_list.__contains__(ef):
                file_list.remove(ef)

    length = len(file_list)

    for i in range(length - 1):
        for j in range(i + 1, length):
            file1, file2 = file_list[i], file_list[j]
            correlation = correlation_calculating(file1, file2)[0]
            if threshold is None or correlation >= threshold:
                correlations.append((correlation, file1[-18:], file2[-18:]))
                # print(correlation, file1[16:-4], file2[16:-4], sep="\t")

    return correlations


def get_elem(elem):
    return elem[0]


res = get_correlations(file_path='./results/40/', exclude_files=['./submit/submit (tf-idf).csv'])
res.sort(reverse=True, key=get_elem)
pd.DataFrame(data=res, columns=['correlation', 'file1', 'file2'])

Unnamed: 0,correlation,file1,file2
0,0.994146,test_results_0.txt,test_results_3.txt
1,0.994098,test_results_2.txt,test_results_3.txt
2,0.994098,test_results_3.txt,test_results_4.txt
3,0.993858,test_results_0.txt,test_results_2.txt
4,0.993858,test_results_1.txt,test_results_2.txt
5,0.993858,test_results_2.txt,test_results_4.txt
6,0.993618,test_results_0.txt,test_results_4.txt
7,0.993618,test_results_1.txt,test_results_4.txt
8,0.993474,test_results_0.txt,test_results_1.txt
9,0.99333,test_results_1.txt,test_results_3.txt


# 提交结果投票融合

In [7]:
import pandas as pd
import numpy as np
import os


def get_files(file_path='./submit') -> list:
    file_list = []

    for dirpath, dirnames, filenames in os.walk(file_path):
        for filename in filenames:
            file = os.path.join(dirpath, filename).replace('\\', '/')
            if filename.__contains__('.txt') or filename.__contains__('.csv'):
                file_list.append(file)
    return file_list


def voting_fusion(files: list, output_file='./submit/submit (voting fusion).csv', exclude_files=None):
    if files is None or len(files) == 0:
        return None
    if exclude_files is not None and type(exclude_files) is list:
        for ef in exclude_files:
            if files.__contains__(ef):
                files.remove(ef)
    if files.__contains__(output_file):
        files.remove(output_file)

    print(files)

    df0 = pd.read_csv(files[0], encoding='utf8', header=None, names=['label']) if files[0].__contains__(
        'txt') else pd.read_csv(files[0], encoding='utf8')
    testA_df = pd.read_json('data/testA.json', encoding='utf8', lines=True)
    count = np.zeros((df0.shape[0], 36), np.int64)
    df_out = pd.DataFrame(data=df0)

    for f in files:
        print('processing:', f)
        df = pd.read_csv(f, encoding='utf8', header=None, names=['label']) if f.__contains__('txt') else pd.read_csv(f,
                                                                                                                     encoding='utf8')
        for i in range(df.shape[0]):
            count[i][df['label'].iloc[i]] += 1
            df_out['label'].iloc[i] = np.argmax(count[i])

    count_list = np.asmatrix(count).tolist()
    with open('voting_matrix.txt', mode='w', encoding='utf8') as f:
        for c in count_list:
            f.write(str(c) + '\n')

    testA_df['label'] = df_out['label']
    testA_df[['id', 'label']].to_csv(output_file, index=None)
    print('融合结果已保存：', output_file)
    return output_file, count_list


file_list = get_files(file_path='./submit/633/')
voting_fusion(file_list, output_file='./submit/633/submit (voting fusion).csv',
              exclude_files=['../results/28/submit (6122 result_2 6122).csv',
                             './submit/633/submit (voting fusion).csv'])

['./submit/633/submit (63176_epoch3 result_0 6334).csv', './submit/633/submit (6334oversampling_epoch3 result_0 6336).csv', './submit/633/submit (6336oversampling_epoch4 result_0 6336).csv', './submit/633/submit (6336oversampling_epoch4 result_4 6337).csv']
processing: ./submit/633/submit (63176_epoch3 result_0 6334).csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out['label'].iloc[i] = np.argmax(count[i])


processing: ./submit/633/submit (6334oversampling_epoch3 result_0 6336).csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out['label'].iloc[i] = np.argmax(count[i])


processing: ./submit/633/submit (6336oversampling_epoch4 result_0 6336).csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out['label'].iloc[i] = np.argmax(count[i])


processing: ./submit/633/submit (6336oversampling_epoch4 result_4 6337).csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out['label'].iloc[i] = np.argmax(count[i])


融合结果已保存： ./submit/633/submit (voting fusion).csv


('./submit/633/submit (voting fusion).csv',
 [[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   4,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   0,
   4,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   0,
   4,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   4,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [4,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0

In [None]:
df1 = pd.read_csv('submit/submit (5fold + nezha-large-wwm + attention + fgm).csv', header=None, names=['label'],
                  encoding='utf8')
df2 = pd.read_csv('submit/submit (5fold-attention).csv', header=None, names=['label'], encoding='utf8')
df3 = pd.read_csv('submit/submit (5fold-attention).csv', header=None, names=['label'], encoding='utf8')

# 高置信度样本（伪标签）

In [None]:
import pandas as pd
import numpy as np

# 使用 5fold + nezha-large-wwm + attention + fgm 方法 且 预测概率在85%以上
ids = np.asarray(
    pd.read_csv("results/high_confidence_ids.txt", encoding='utf8', header=None, names=['high_confidence_id'])[
        'high_confidence_id']).transpose().tolist()
label = pd.read_csv("results/test_results.txt", encoding='utf8', header=None, names=['label'])
testA_df['label'] = label['label']

with open('data/testA_labeled.tsv', mode='w', encoding='utf8') as f:
    for i in ids:
        id = testA_df.loc[[i]].id.tolist()[0]
        title = testA_df.loc[[i]].title.tolist()[0]
        assignee = testA_df.loc[[i]].assignee.tolist()[0]
        abstract = testA_df.loc[[i]].abstract.tolist()[0]
        text = str(testA_df.loc[[i]].text.tolist()[0]).replace('<i>', ' ').replace('</i>', ' ')
        label = testA_df.loc[[i]].label.tolist()[0]
        f.write(str(label) + '\t' + text + '\n')

testA_df.loc[[ids[0]]]

In [9]:
import pandas as pd
import numpy as np
import re

pattern = '\s+|<i>|</i>|<br>'

label = pd.read_csv("results/59/4/submit (6440_split_epoch4_lr4e-5 result_4 manually 6440).csv", encoding='utf8')
testA_df['label'] = label['label']
testA_df['title'] = testA_df['title'].apply(lambda x: x.strip())
testA_df['abstract'] = testA_df['abstract'].fillna('').apply(lambda x: x.strip())
testA_df['text'] = testA_df['title'].str.lower() + '<sep>' + testA_df['abstract'].str.lower()
testA_df['text'] = testA_df['text'].replace('<i>|</i>', '', regex=True)
testA_df[['label', 'text']].to_csv('data/test/testA_6440_59_4.tsv', index=None, sep='\t')

testA_df

Unnamed: 0,id,title,assignee,abstract,label,text
0,003fd481e65ddc070e38ae05002e16e2,一种耐磨、抗粘钢复合涂层、制备方法及应用,安徽马钢表面技术股份有限公司,本发明公开了一种耐磨、抗粘钢复合涂层、制备方法及应用，包括基体和基体上由内到外依次设置的过渡...,23,一种耐磨、抗粘钢复合涂层、制备方法及应用<sep>本发明公开了一种耐磨、抗粘钢复合涂层、制备...
1,549a1cd8228bd10f18395a0625fcc70d,一种用于提高橡胶抗湿滑性的树脂的制备方法及其应用,江苏麒祥高新材料有限公司,本发明公开了一种用于提高橡胶抗湿滑性的树脂的制备方法，第一步：将R树脂和B官能团化合物进行反...,5,一种用于提高橡胶抗湿滑性的树脂的制备方法及其应用<sep>本发明公开了一种用于提高橡胶抗湿滑...
2,f09c4c0332f8966400e06f4def9f1a6d,有机硅改性丙烯酸树脂超亲水防雾涂料及其制作方法,重庆大学,本发明涉及涂料制造领域，本发明公开了一种含有磺酸季铵盐的有机硅改性丙烯酸树脂超亲水低温防雾涂...,5,有机硅改性丙烯酸树脂超亲水防雾涂料及其制作方法<sep>本发明涉及涂料制造领域，本发明公开了...
3,06598dd8f3ab092acf2a55dce8be5621,一种空调系统及其控制方法、控制装置,海尔智家股份有限公司,本发明涉及空调领域，公开了一种空调系统，包括室外机和太阳能供热系统，所述太阳能供热系统包括：...,16,一种空调系统及其控制方法、控制装置<sep>本发明涉及空调领域，公开了一种空调系统，包括室外...
4,e70177ba6a54d08abecd80a60fdd9f52,资源申请、分配方法，UE及网络控制单元,中兴通讯股份有限公司,本发明实施例公开了一种资源申请方法及装置，所述方法包括：向网络控制单元发送低时延业务信息；接...,0,资源申请、分配方法，ue及网络控制单元<sep>本发明实施例公开了一种资源申请方法及装置，所...
...,...,...,...,...,...,...
20834,befab80c8c6cf6f8db5a4ee3b9e22020,由低合金碳钢制成的螺钉和制造该螺钉的方法,伊卓特有限两合公司,本发明涉及一种螺钉，该螺钉具有头部、邻接的保持段和功能端。所述螺钉用于自攻螺钉。所述功能端的...,23,由低合金碳钢制成的螺钉和制造该螺钉的方法<sep>本发明涉及一种螺钉，该螺钉具有头部、邻接的...
20835,b41abe927240b1ab73b1cb0fca2d9970,一种铸造铝合金及其制备方法,中国兵器工业第五九研究所,本发明提供了一种铸造铝合金及其制备方法，铸造铝合金成分包括：Si：7.5～8.5%、Cu：2...,13,一种铸造铝合金及其制备方法<sep>本发明提供了一种铸造铝合金及其制备方法，铸造铝合金成分包...
20836,c0fd3051ce51166e80b9922c97e2f7a4,一种显示面板及生成随机图块坐标的方法,上海天马微电子有限公司,本发明提供一种显示面板，显示面板的图块具有随机排布的特性，能够消除由于周期性排布带来的鬼影问...,9,一种显示面板及生成随机图块坐标的方法<sep>本发明提供一种显示面板，显示面板的图块具有随机...
20837,4c89a2b2bd405456e316a35411297b0f,一种确定机器类通信下行控制信道重复次数的方法及基站,电信科学技术研究院,本发明实施例涉及无线通信技术领域，特别涉及一种确定机器类通信下行控制信道重复次数的方法及基站...,0,一种确定机器类通信下行控制信道重复次数的方法及基站<sep>本发明实施例涉及无线通信技术领域...


# 数据增强

In [12]:
import pandas as pd
import numpy as np

testA_df = pd.read_json('data/testA.json', encoding='utf8', lines=True)
submit_df = pd.read_csv('submit/submit (voting fusion 0.5940).csv', encoding='utf8')
abstracts = np.asarray(testA_df['abstract']).transpose().tolist()
count = 0
for i in range(len(abstracts)):
    if str(abstracts[i]).__contains__('光伏'):
        submit_df['label'].iloc[i] = 16
        count += 1
submit_df.to_csv('submit/submit (光伏).csv', encoding='utf8', index=False)
count

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit_df['label'].iloc[i] = 16


119

# 结果输出

In [6]:
import pandas as pd

testA_df = pd.read_json('./data/testA.json', encoding='utf8', lines=True)
testA_df['label'] = pd.read_csv('results/59/5/test_results_0.txt', encoding='utf8', header=None, names=['label'])['label']
testA_df[['id', 'label']].to_csv('results/59/5/submit (6440_59_4_split_epoch3_lr4e-5 result_0).csv', index=None)