In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import jieba
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from ark_nlp.model.tc.bert import Bert
from ark_nlp.model.tc.bert import BertConfig
from ark_nlp.model.tc.bert import Dataset
from ark_nlp.model.tc.bert import Task
from ark_nlp.model.tc.bert import get_default_model_optimizer
from ark_nlp.model.tc.bert import Tokenizer

from sklearn.model_selection import train_test_split

In [None]:
# Relative path to the training data CSV
train_data_path = '../data/source_datasets/train.csv'

### 一、数据读入与处理

#### 1. 数据读入

In [None]:
# Load the training CSV into a DataFrame
train_data_df = pd.read_csv(train_data_path)

In [None]:
def get_text(_df):
    """Build the model input string from one record's title and body.

    Args:
        _df: A single record supporting lookup by the keys ``'name'`` and
            ``'content'`` (e.g. the row Series passed by
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        str: ``'标题：<name>;内容：<content>'``. A field for which
        ``pd.isna`` is true is replaced by the placeholder ``'无'``.
    """
    def _field_or_placeholder(value):
        # Missing values become the placeholder. Anything else is coerced to
        # str so that non-string cells (e.g. a purely numeric title parsed as
        # a number by read_csv) do not raise TypeError on concatenation.
        return '无' if pd.isna(value) else str(value)

    title = _field_or_placeholder(_df['name'])
    content = _field_or_placeholder(_df['content'])

    return '标题：' + title + ';' + '内容：' + content

# Row-wise (axis=1): build the combined title/content string for every training record
train_data_df['text'] = train_data_df.apply(get_text, axis=1)

In [None]:
# Hold out 20% of the rows as a dev set; random_state is fixed so the split is repeatable.
# NOTE(review): train_data_df is rebound to the 80% split, so re-running this cell on its
# own splits the already-reduced frame again.
train_data_df, dev_data_df = train_test_split(train_data_df, test_size=0.2, random_state=42)

In [None]:
# Keep only the two columns passed on to the Dataset wrappers: input text and label
_model_columns = ['text', 'label']

train_data_df = train_data_df.loc[:, _model_columns]
dev_data_df = dev_data_df.loc[:, _model_columns]

In [None]:
# Wrap the train and dev frames in ark_nlp Dataset objects (from ark_nlp.model.tc.bert)
tc_train_dataset = Dataset(train_data_df)
tc_dev_dataset = Dataset(dev_data_df)

#### 2. 词典创建和生成分词器

In [None]:
# Build the tokenizer from the 'nghuyong/ernie-1.0' vocabulary with max_seq_len=100.
# NOTE(review): presumably inputs longer than max_seq_len are truncated — confirm against ark_nlp.
tokenizer = Tokenizer(vocab='nghuyong/ernie-1.0', max_seq_len=100)

#### 3. ID化

In [None]:
# Run the tokenizer-based id conversion on the training set first, then the dev set
for _dataset in (tc_train_dataset, tc_dev_dataset):
    _dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [None]:
# Load the model configuration for 'nghuyong/ernie-1.0' and set num_labels to
# len(tc_train_dataset.cat2id) (presumably the category -> id mapping built by Dataset — confirm).
config = BertConfig.from_pretrained(
    'nghuyong/ernie-1.0',
    num_labels=len(tc_train_dataset.cat2id)
)

#### 2. 模型创建

In [None]:
# Release unused cached GPU memory held by PyTorch before the model is created
torch.cuda.empty_cache()

In [None]:
# Create the classification model via from_pretrained, using the
# 'nghuyong/ernie-1.0' identifier and the config built earlier in the notebook.
dl_module = Bert.from_pretrained(
    'nghuyong/ernie-1.0',
    config=config
)

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [None]:
# 设置运行次数
# Training hyperparameters: number of epochs and batch size.
# NOTE(review): num_epoches is not referenced by the fit call later in this notebook,
# which passes epochs=6 directly — confirm which value is intended.
num_epoches = 10
batch_size = 32

In [None]:
# Build the optimizer returned by ark_nlp's get_default_model_optimizer for this model
optimizer = get_default_model_optimizer(dl_module)

#### 2. 任务创建

In [None]:
# Bundle the model and optimizer into an ark_nlp Task.
# NOTE(review): the third argument 'lsce' presumably names the loss function and
# cuda_device=0 presumably selects the first GPU — confirm against ark_nlp.
model = Task(dl_module, optimizer, 'lsce', cuda_device=0)

#### 3. 训练

In [None]:
# Fit on the training dataset; the dev dataset is passed as the second argument
# (presumably used for per-epoch evaluation — confirm against ark_nlp).
# NOTE(review): epochs is hard-coded to 6 here, while num_epoches (set to 10 in the
# hyperparameter cell) is never used — confirm which value is intended.
model.fit(
    tc_train_dataset,
    tc_dev_dataset,
    lr=2e-5,
    epochs=6,
    batch_size=batch_size,
)

<br>

### 四、预测提交

In [None]:
from tqdm import tqdm
from ark_nlp.model.tc.bert import Predictor

In [None]:
# Build a Predictor from the Task's underlying module, the tokenizer, and the
# training dataset's cat2id (so predictions use the same label set as training).
tc_predictor_instance = Predictor(model.module, tokenizer, tc_train_dataset.cat2id)

In [None]:
# Relative path to the unlabeled test CSV
test_data_path = '../data/source_datasets/testa_nolabel.csv'

In [None]:
# Load the test CSV, then build the combined title/content string for each row
# with the same get_text helper that was applied to the training data.
test_data_df = pd.read_csv(test_data_path)
test_data_df['text'] = test_data_df.apply(get_text, axis=1)

In [None]:
# Predict one label per test record, keeping each id paired with its prediction.
# zip() has no len(), so tqdm is given the row count explicitly; without it the
# progress bar cannot show a total, percentage, or ETA.
_predict_labels = []
for _id, _text in tqdm(zip(test_data_df['id'], test_data_df['text']),
                       total=len(test_data_df)):
    # Element 0 of the predictor's result is stored as the label
    # (presumably the top-ranked category — confirm against ark_nlp).
    _predict_labels.append([_id, tc_predictor_instance.predict_one_sample(_text)[0]])

In [None]:
# Assemble the [id, label] pairs into the two-column submission frame
submit_df = pd.DataFrame(_predict_labels, columns=['id', 'label'])

In [None]:
# Write the submission CSV without the DataFrame index.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first; otherwise a fresh checkout fails only after training has finished.
_submit_path = '../data/output_datasets/submita.csv'
os.makedirs(os.path.dirname(_submit_path), exist_ok=True)
submit_df.to_csv(_submit_path, index=False)