In [None]:
pip install git+https://www.github.com/bojone/bert4keras.git

In [None]:
!pip install wget

In [None]:
!pip install transformers

In [None]:

import json
import numpy as np
import pandas as pd
from random import choice
import re, os
import codecs
from bert4keras.bert import load_pretrained_model, set_gelu
from bert4keras.train import PiecewiseLinearLearningRate
from bert4keras import *
from bert4keras.backend import *
from fastai.text import *
from collections import OrderedDict, defaultdict
import wget

import zipfile

from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import KFold

from transformers import *

from keras.callbacks import Callback
from sklearn.metrics import f1_score

In [None]:
ls ../input/preprocessed-fake-news

In [None]:
class DataLoader():
    """Tokenise one text column of a frame and serve it to a Keras model.

    In training mode the rows are split into 5 folds, separately for
    ``label == 1`` and ``label == 0`` rows so each fold contains both classes,
    and ``loader[i]`` returns the (train, valid) batch generators of fold ``i``.
    In inference mode (``train=False``) only ``get_prediction`` is usable.

    Relies on the module-level name ``tokenizer`` (must provide ``encode``).

    Attributes:
        batch_size: number of rows per generated batch.
        df: shuffled frame with the text column renamed to ``"text"`` and an
            added ``"ids"`` column holding the token-id list of every row.
        train_step / valid_step: (training mode only) batches per epoch for a
            nominal 80% / 20% split of the frame, as plain ``int``.
        folders: list of ``(train_df, valid_df)`` pairs, one per fold; empty
            in inference mode.
    """

    CLS_TOKEN_ID = 101   # id prepended to every sequence (the original hard-coded 101)
    MAX_TOKENS = 383     # tokens kept after the prepended id -> at most 384 ids per row
    N_FOLDS = 5
    SEED = 42

    def __init__(self, df, column="text", batch_size=8, train=True):
        """Shuffle ``df``, tokenise ``column`` and, when ``train``, build the folds.

        Args:
            df: frame with an ``id`` column, the text ``column`` and, when
                ``train`` is true, a ``label`` column compared against 0 and 1.
            column: name of the column holding the raw text.
            batch_size: rows per batch produced by the generators.
            train: build fold splits and step counts when true.
        """
        self.batch_size = batch_size
        # Always defined so that indexing an inference loader fails with a plain
        # IndexError instead of an AttributeError on a missing attribute.
        self.folders = []

        kept_columns = ["id", "label", column] if train else ["id", column]
        df = df[kept_columns].sample(frac=1., random_state=self.SEED).reset_index(drop=True)

        df['ids'] = df[column].map(self.get_ids)
        self.df = df.rename(columns={column: "text"})

        if not train:
            return

        # ceil(0.8 * n / batch) and ceil(0.2 * n / batch) in exact integer
        # arithmetic. The float version was off by one whenever 0.2 * n is not
        # representable (15 * 0.2 == 3.0000000000000004) and produced floats.
        n_rows = len(df)
        denominator = self.N_FOLDS * batch_size
        self.train_step = -(-(self.N_FOLDS - 1) * n_rows // denominator)
        self.valid_step = -(-n_rows // denominator)

        # Keyword arguments: current scikit-learn rejects the positional form.
        kf = KFold(n_splits=self.N_FOLDS, shuffle=True, random_state=self.SEED)

        fake_df, true_df = df[df.label == 1], df[df.label == 0]

        # Split each class on its own, then stitch the k-th splits together.
        fake_splits = list(kf.split(fake_df))
        true_splits = list(kf.split(true_df))

        for fold, ((fake_tr, fake_va), (true_tr, true_va)) in enumerate(zip(fake_splits, true_splits)):
            # pd.concat replaces DataFrame.append (removed in pandas 2.0); the
            # shuffle is seeded per fold so that reruns give the same batches.
            train_df = pd.concat(
                [fake_df.iloc[fake_tr], true_df.iloc[true_tr]], ignore_index=True
            ).sample(frac=1, random_state=self.SEED + fold)
            valid_df = pd.concat(
                [fake_df.iloc[fake_va], true_df.iloc[true_va]], ignore_index=True
            ).sample(frac=1, random_state=self.SEED + fold)
            self.folders.append((train_df, valid_df))

    def get_ids(self, text):
        """Return ``[CLS_TOKEN_ID]`` followed by at most ``MAX_TOKENS`` token ids of ``text``.

        ``add_special_tokens=False`` is passed explicitly: the default of
        ``encode`` differs between transformers releases, and when it is on the
        tokenizer adds its own special tokens, duplicating the manual prefix.
        No end-of-sequence token is appended, as in the original code.
        """
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        return [self.CLS_TOKEN_ID] + token_ids[:self.MAX_TOKENS]

    def __getitem__(self, i):
        """Return ``(train_generator, valid_generator)`` for fold ``i``."""
        train_df, valid_df = self.folders[i]
        return self.data_generator(train_df), self.data_generator(valid_df)

    def data_generator(self, data):
        """Yield ``([ids, segment_ids], labels)`` batches from ``data`` forever.

        Rows are grouped by position into chunks of ``batch_size``; each chunk
        is padded at the end to its own longest sequence and paired with an
        all-zero segment-id array of the same shape.
        """
        while True:
            for _, batch in data.groupby(np.arange(len(data)) // self.batch_size):
                ids = pad_sequences(batch.ids.values, padding='post')
                segment_ids = np.zeros((ids.shape), dtype=np.uint32)
                yield [ids, segment_ids], batch.label.values

    def get_prediction(self, model, col_name):
        """Predict every row of ``self.df`` one at a time (no padding involved).

        Args:
            model: object whose ``predict([ids, segment_ids])`` result is
                indexed as ``[0][0]`` to obtain the stored value.
            col_name: name of the prediction column in the returned frame.

        Returns:
            DataFrame with columns ``id`` and ``col_name``, sorted by ``id``.
        """
        new_df = defaultdict(list)
        for _, row in self.df.iterrows():
            new_df['id'].append(row.id)
            ids = np.array(row.ids)[None]  # add a leading batch axis of size 1
            segment_ids = np.zeros((ids.shape), dtype=np.uint32)
            y_pred = model.predict([ids, segment_ids])
            new_df[col_name].append(y_pred[0][0])
        return pd.DataFrame(new_df).sort_values('id')
        

In [None]:


def build_model(name='albert_base', keep_words=None):
    """Download (if needed) a Chinese ALBERT checkpoint and build a binary classifier.

    The archive for ``name`` is fetched and unpacked into a directory called
    ``name`` unless the expected config file is already present there. The
    pretrained network is then topped with a sigmoid unit fed by position 0 of
    its output and compiled with binary cross-entropy, Adam(1e-5) and accuracy.

    Args:
        name: one of ``'albert_xlarge'``, ``'albert_large'``, ``'albert_base'``.
        keep_words: forwarded unchanged to ``load_pretrained_model``
            (presumably a vocabulary subset to keep - confirm against bert4keras).

    Returns:
        The compiled Keras ``Model``.

    Raises:
        KeyError: if ``name`` is not a known checkpoint name.
    """

    set_gelu('tanh')  # switch the GELU implementation

    albert_links =  OrderedDict(albert_xlarge = "https://storage.googleapis.com/albert_zh/albert_xlarge_zh.zip",
                    albert_large = "https://storage.googleapis.com/albert_zh/albert_large_zh.zip",
                    albert_base  = "https://storage.googleapis.com/albert_zh/albert_base_zh.zip",
                    vocab = "https://github.com/brightmart/albert_zh/blob/master/albert_config/vocab.txt")
    config_paths = OrderedDict(albert_xlarge= "./albert_xlarge/albert_config_xlarge.json",
                               albert_large = "./albert_large/albert_config_large.json",
                               albert_base = "./albert_base/albert_config_base.json")

    # Reject unknown names before touching the filesystem; previously a stray
    # directory was created and only then a KeyError surfaced.
    if name not in config_paths:
        raise KeyError("unknown model name %r; expected one of %s" % (name, list(config_paths)))

    config_path = config_paths[name]

    # Check for the config file rather than the directory: a failed download
    # used to leave an empty directory behind that blocked every later retry.
    if not os.path.isfile(config_path):
        os.makedirs(name, exist_ok=True)
        archive = None
        try:
            # wget.download returns the path actually written, which differs
            # from the requested one when a file of that name already exists.
            archive = wget.download(albert_links[name], name+'.zip')
            with zipfile.ZipFile(archive, 'r') as zip_ref:
                zip_ref.extractall(name)
        finally:
            # Delete the archive only after it has been closed, and also when
            # the download or extraction failed half-way.
            if archive and os.path.exists(archive):
                os.remove(archive)

    model = load_pretrained_model(
        config_path,
        name+'/albert_model.ckpt',
        keep_words=keep_words,
        albert=True
    )

    # Classification head: take position 0 along axis 1, then one sigmoid unit.
    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(1, activation='sigmoid')(output)
    model = Model(model.input, output)

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(1e-5),
        metrics=['accuracy']
    )
    # model.summary()

    return model

In [None]:
# Module-level tokenizer; DataLoader.get_ids looks this name up as a global.
PRETRAINED_TOKENIZER_NAME = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_TOKENIZER_NAME)

In [None]:
# Read the train and test CSVs from the dataset directory, then preview the train frame.
data_path = Path("../input/preprocessed-fake-news")
train_csv = data_path / "train_unstructed_remove_number_data.csv"
test_csv = data_path / "test_unstructed_remove_number_data.csv"

train_df = pd.read_csv(train_csv, encoding="utf8")
test_df = pd.read_csv(test_csv, encoding="utf8")

train_df.head()

In [None]:
# Preview the first rows of the test frame (5 is the pandas default).
test_df.head(n=5)

In [None]:
# Names of text columns (presumably headers of the CSVs - verify; the spelling
# 'last_centence' is kept exactly as written). Not referenced elsewhere in this section.
chinese_text_columns = [
    'text',
    'pre_20_words',
    'first_sentence',
    'last_centence',
    're_translate',
]

# Checkpoint names accepted by build_model; 'albert_xlarge' is left out because it is too large.
models = [
    'albert_large',
    'albert_base',
]

In [None]:
# Training loader builds the 5 folds; the test loader only tokenises for prediction.
train_data_loader = DataLoader(df=train_df, train=True)
test_data_loader = DataLoader(df=test_df, train=False)

In [None]:

# Train one model per (checkpoint, fold) pair and collect its predictions on the
# full train and test frames as one column named "<model_name>_<fold>".
train_result = None
test_result = None

for model_name in models[1:2]:  # only 'albert_base' is run

    for i in range(5):

        # Start every fold from the pretrained weights. Reusing one model
        # across folds meant each fold's validation rows had already been
        # trained on in earlier folds, so the validation metrics were leaked.
        # clear_session releases the previous fold's graph before rebuilding.
        K.clear_session()
        model = build_model(model_name)

        train_g, valid_g = train_data_loader[i]

        model.fit_generator(
            train_g,
            steps_per_epoch=train_data_loader.train_step,
            epochs=2,
            validation_data=valid_g,
            validation_steps=train_data_loader.valid_step
        )

        col_name = f"{model_name}_{i}"

        # `if train_result:` raised "truth value of a DataFrame is ambiguous"
        # from the second fold on, so test against None instead. Only the new
        # prediction column is appended (aligned on the index), which keeps a
        # single `id` column; the old `.drop('id', axis=1)` removed every `id`.
        train_predictions = train_data_loader.get_prediction(model, col_name)
        if train_result is None:
            train_result = train_predictions
        else:
            train_result = pd.concat([train_result, train_predictions[[col_name]]], axis=1)

        test_predictions = test_data_loader.get_prediction(model, col_name)
        if test_result is None:
            test_result = test_predictions
        else:
            test_result = pd.concat([test_result, test_predictions[[col_name]]], axis=1)

In [None]:
# Write both prediction frames to CSV in the working directory, without the index.
for result_frame, csv_name in ((train_result, "train_result.csv"), (test_result, "test_result.csv")):
    result_frame.to_csv(csv_name, index=False)