# Test model

### original file

https://github.com/openai/gpt-2-output-dataset/blob/master/baseline.py

## Before running this notebook

1. Create /output folder
   1. Insert all crawled dataset(csv)
   1. Rename them as same with GPT dataset files.
1. Create /log folder

## Difference between baseline

- Using Word2Vec instead of TF-IDF Vectorizer
- Using Gridsearch to find best parameter

In [None]:
!pip install gensim
!pip install scikit-learn

In [7]:
# import packages

import os
import csv
import json

import numpy as np

from scipy import sparse

from sklearn.model_selection import PredefinedSplit, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression

from gensim.models import Word2Vec

In [8]:
# create tokenizer
# example code from https://github.com/SKT-AI/KoGPT2

from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token="</s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [9]:
SHORT_MODE = True

In [10]:
# load data + preprocessing

def load_data(data_dir, crawled_dir, source):
    path = os.path.join(data_dir, "{}.csv".format(source))
    crawled_path = os.path.join(crawled_dir, "{}.csv".format(source))
    dataset = list(csv.reader(open(path, encoding="utf8")))
    crawled_dataset = list(csv.reader(open(crawled_path, encoding="cp949")))
    n = len(dataset)
    
    length = min(50, n) if SHORT_MODE else n

    texts = []
    labels = [1, 0] * length

    for data in dataset[:length]:
        idx = int(round(float(data[0])))
        tokens = tokenizer.tokenize(data[5])
        texts.append(' '.join(tokens))
        tokens = tokenizer.tokenize(crawled_dataset[idx][4])
        texts.append(' '.join(tokens))
    return texts, labels

In [11]:
# main function

def main(
    data_dir="data/",
    crawl_dir="output/",
    log_dir="log/",
    topics=["culture", "economy", "it_science", "politics", "society", "world"],
    train_test_ratio=0.1,
):
    texts_list, labels_list = [], []
    for topic in topics:
        texts, labels = load_data(data_dir, crawl_dir, topic)
        texts_list.extend(texts)
        labels_list.extend(labels)

    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts_list, labels_list, test_size=train_test_ratio, random_state=42, shuffle=True,
    )
    n_train, n_test = len(texts_train), len(texts_test)
    
    w2v_model = Word2Vec(sentences=texts_list, vector_size=512, window=9, min_count=5, workers=4, sg=0)
    train_features = [w2v_model.wv.get_mean_vector(y) for y in texts_train]
    test_features = [w2v_model.wv.get_mean_vector(y) for y in texts_test]

    model = LogisticRegression(max_iter=100000)
    params = {'C': [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4, 8, 16, 32, 64]}
    split = PredefinedSplit([-1]*n_train+[0]*n_test)
    search = GridSearchCV(model, params, cv=split, refit=False)
    search.fit(sparse.vstack([train_features, test_features]), labels_train + labels_test)
    model = model.set_params(**search.best_params_)
    model.fit(train_features, labels_train)
    test_accuracy = model.score(test_features, labels_test) * 100.0

    result = model.predict(test_features)
    result_proba = model.predict_proba(test_features)
    result_log_proba = model.predict_log_proba(test_features)
    kind = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}
    for res, pred in zip(result, labels_test):
        if res == 1:
            kind["tp" if res == pred else "fp"] += 1
        else:
            kind["tn" if res == pred else "fn"] += 1
    precision = kind["tp"] / (kind["tp"] + kind["fp"])
    recall = kind["tp"] / (kind["tp"] + kind["fn"])

    ce_loss = 0
    for label, value in zip(labels_test, result_log_proba):
        ce_loss -= label * value[1] + (1 - label) * value[0]
    ce_loss /= len(labels_test)
    data = {
        "test_accuracy": test_accuracy,
        "test_precision": precision,
        "test_recall": recall,
        "F_score": 2 * precision * recall / (precision + recall),
        "mse_loss": np.sum(np.array(labels_test) - model.predict_proba(test_features)[:, 1]) ** 2 / len(labels_test),
        "ce_loss": ce_loss,
        "param": search.best_params_,
        "label_and_result": list(zip(labels_test, model.predict(test_features).tolist(), result_proba[:, 1].tolist())),
    }
    print(data)
    json.dump(data, open(os.path.join(log_dir, "result.json"), "w"), indent=4)

In [12]:
# run main function

main()

{'test_accuracy': 86.66666666666667, 'test_precision': 0.8846153846153846, 'test_recall': 0.8214285714285714, 'F_score': 0.8518518518518519, 'mse_loss': 0.009032847198068643, 'ce_loss': 0.3901438404922017, 'param': {'C': 32}, 'label_and_result': [(1, 1, 0.5220390441391777), (0, 0, 0.27780202346772925), (0, 0, 0.19010829523195444), (0, 0, 0.036286407124581324), (0, 0, 0.31283326407949036), (1, 1, 0.8557177160855992), (1, 1, 0.5706967561540156), (0, 0, 0.3858658115295685), (1, 1, 0.8623425795253546), (0, 0, 0.04365645630591507), (0, 0, 0.1710444105191018), (1, 0, 0.2812877871917825), (0, 0, 0.37724226909525094), (1, 1, 0.9243185904469908), (0, 0, 0.106459015791026), (1, 1, 0.6549948466342912), (1, 1, 0.6426966882101939), (1, 1, 0.8084979214455724), (1, 1, 0.8210372259403137), (1, 0, 0.28243457582245884), (1, 1, 0.6029871131871425), (0, 0, 0.1586683598355534), (0, 0, 0.23769169997512996), (1, 1, 0.9168999718960049), (1, 0, 0.4583817905211404), (0, 0, 0.31885253449872386), (1, 1, 0.8747115