# Test model

### original file

https://github.com/openai/gpt-2-output-dataset/blob/master/baseline.py

## Before running this notebook

1. Create /output folder
   1. Insert all crawled dataset(csv)
   1. Rename them as same with GPT dataset files.
1. Create /log folder

## Difference between baseline

- Using Word2Vec instead of TF-IDF Vectorizer
- ~~Using Gridsearch to find best parameter~~

In [37]:
!pip install gensim
!pip install scikit-learn

In [38]:
# import packages

import os
import csv
import json

import numpy as np

from scipy import sparse

from sklearn.model_selection import PredefinedSplit, GridSearchCV, train_test_split, KFold
from sklearn.linear_model import LogisticRegression

from gensim.models import Word2Vec

In [39]:
# create tokenizer
# example code from https://github.com/SKT-AI/KoGPT2

from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token="</s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [40]:
SHORT_MODE = True

In [41]:
# load data + preprocessing

def load_data(data_dir, crawled_dir, source):
    path = os.path.join(data_dir, "{}.csv".format(source))
    crawled_path = os.path.join(crawled_dir, "{}.csv".format(source))
    dataset = list(csv.reader(open(path, encoding="utf8")))
    crawled_dataset = list(csv.reader(open(crawled_path, encoding="cp949")))
    n = len(dataset)
    
    length = min(50, n) if SHORT_MODE else n

    texts = []
    labels = [1, 0] * length

    for data in dataset[:length]:
        idx = int(round(float(data[0])))
        tokens = tokenizer.tokenize(data[5])
        texts.append(' '.join(tokens))
        tokens = tokenizer.tokenize(crawled_dataset[idx][4])
        texts.append(' '.join(tokens))
    return texts, labels

In [42]:
# main function

def main(
    data_dir="data/",
    crawl_dir="output/",
    log_dir="log/",
    topics=["culture", "economy", "it_science", "politics", "society", "world"],
    train_test_ratio=0.1,
):
    texts_list, labels_list = [], []
    for topic in topics:
        texts, labels = load_data(data_dir, crawl_dir, topic)
        texts_list.extend(texts)
        labels_list.extend(labels)

    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts_list, labels_list, test_size=train_test_ratio, random_state=42, shuffle=True,
    )
    n_train, n_test = len(texts_train), len(texts_test)
    
    w2v_model = Word2Vec(sentences=texts_list, vector_size=512, window=9, min_count=5, workers=4, sg=0)
    train_features = [w2v_model.wv.get_mean_vector(y) for y in texts_train]
    test_features = [w2v_model.wv.get_mean_vector(y) for y in texts_test]

    model = LogisticRegression(max_iter=100, penalty=None)
    # params = {'C': [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4, 8, 16, 32, 64]}
    # split = PredefinedSplit([-1]*n_train+[0]*n_test)
    # search = GridSearchCV(model, params, cv=split, refit=False)
    # search.fit(sparse.vstack([train_features, test_features]), labels_train + labels_test)
    # model = model.set_params(**search.best_params_)
    model.fit(train_features, labels_train)
    test_accuracy = model.score(test_features, labels_test) * 100.0

    result = model.predict(test_features)
    result_proba = model.predict_proba(test_features)
    result_log_proba = model.predict_log_proba(test_features)
    kind = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}
    for res, pred in zip(result, labels_test):
        if res == 1:
            kind["tp" if res == pred else "fp"] += 1
        else:
            kind["tn" if res == pred else "fn"] += 1
    precision = kind["tp"] / (kind["tp"] + kind["fp"])
    recall = kind["tp"] / (kind["tp"] + kind["fn"])

    ce_loss = 0
    for label, value in zip(labels_test, result_log_proba):
        ce_loss -= label * value[1] + (1 - label) * value[0]
    ce_loss /= len(labels_test)
    data = {
        "test_accuracy": test_accuracy,
        "test_precision": precision,
        "test_recall": recall,
        "F_score": 2 * precision * recall / (precision + recall),
        "mse_loss": np.sum(np.array(labels_test) - model.predict_proba(test_features)[:, 1]) ** 2 / len(labels_test),
        "ce_loss": ce_loss,
        # "param": search.best_params_,
        "label_and_result": list(zip(labels_test, model.predict(test_features).tolist(), result_proba[:, 1].tolist())),
    }
    print(data)
    json.dump(data, open(os.path.join(log_dir, "result.json"), "w"), indent=4)

In [43]:
# main function

def main_overfit(
    data_dir="data/",
    crawl_dir="output/",
    log_dir="log/",
    topics=["culture", "economy", "it_science", "politics", "society", "world"],
    train_test_ratio=0.1,
):
    texts_list, labels_list = [], []
    for topic in topics:
        texts, labels = load_data(data_dir, crawl_dir, topic)
        texts_list.extend(texts)
        labels_list.extend(labels)

    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts_list, labels_list, test_size=train_test_ratio, random_state=42, shuffle=True,
    )
    
    w2v_model = Word2Vec(sentences=texts_list, vector_size=512, window=9, min_count=5, workers=4, sg=0)
    train_features = [w2v_model.wv.get_mean_vector(y) for y in texts_train]
    test_features = [w2v_model.wv.get_mean_vector(y) for y in texts_test]

    model = LogisticRegression(max_iter=10000, penalty=None)
    model.fit(train_features, labels_train)
    test_accuracy = model.score(test_features, labels_test) * 100.0

    result = model.predict(test_features)
    result_proba = model.predict_proba(test_features)
    result_log_proba = model.predict_log_proba(test_features)
    kind = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}
    for res, pred in zip(result, labels_test):
        if res == 1:
            kind["tp" if res == pred else "fp"] += 1
        else:
            kind["tn" if res == pred else "fn"] += 1
    precision = kind["tp"] / (kind["tp"] + kind["fp"])
    recall = kind["tp"] / (kind["tp"] + kind["fn"])

    ce_loss = 0
    for label, value in zip(labels_test, result_log_proba):
        ce_loss -= label * value[1] + (1 - label) * value[0]
    ce_loss /= len(labels_test)
    data = {
        "test_accuracy": test_accuracy,
        "test_precision": precision,
        "test_recall": recall,
        "F_score": 2 * precision * recall / (precision + recall),
        "mse_loss": np.sum(np.array(labels_test) - model.predict_proba(test_features)[:, 1]) ** 2 / len(labels_test),
        "ce_loss": ce_loss,
        "label_and_result": list(zip(labels_test, model.predict(test_features).tolist(), result_proba[:, 1].tolist())),
    }
    print(data)
    json.dump(data, open(os.path.join(log_dir, "result_overfit.json"), "w"), indent=4)

In [44]:
# main function

def main_kfold(
    data_dir="data/",
    crawl_dir="output/",
    log_dir="log/",
    topics=["culture", "economy", "it_science", "politics", "society", "world"],
):
    texts_list, labels_list = [], []
    for topic in topics:
        texts, labels = load_data(data_dir, crawl_dir, topic)
        texts_list.extend(texts)
        labels_list.extend(labels)
    
    w2v_model = Word2Vec(sentences=texts_list, vector_size=512, window=9, min_count=5, workers=4, sg=0)
    
    
    kf = KFold(n_splits=5)
    
    for idx, (train, test) in enumerate(kf.split(texts_list)):
        texts_train, texts_test = [texts_list[x] for x in train], [texts_list[x] for x in test]
        labels_train, labels_test = [labels_list[x] for x in train], [labels_list[x] for x in test]

        n_train, n_test = len(train), len(test)
        
        train_features = [w2v_model.wv.get_mean_vector(y) for y in texts_train]
        test_features = [w2v_model.wv.get_mean_vector(y) for y in texts_test]

        model = LogisticRegression(max_iter=100, penalty=None)
        params = {'C': [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4, 8, 16, 32, 64]}
        split = PredefinedSplit([-1]*n_train+[0]*n_test)
        search = GridSearchCV(model, params, cv=split, refit=False)
        search.fit(sparse.vstack([train_features, test_features]), labels_train + labels_test)
        model = model.set_params(**search.best_params_)
        model.fit(train_features, labels_train)
        test_accuracy = model.score(test_features, labels_test) * 100.0

        result = model.predict(test_features)
        result_proba = model.predict_proba(test_features)
        result_log_proba = model.predict_log_proba(test_features)
        kind = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}
        for res, pred in zip(result, labels_test):
            if res == 1:
                kind["tp" if res == pred else "fp"] += 1
            else:
                kind["tn" if res == pred else "fn"] += 1
        precision = kind["tp"] / (kind["tp"] + kind["fp"])
        recall = kind["tp"] / (kind["tp"] + kind["fn"])

        ce_loss = 0
        for label, value in zip(labels_test, result_log_proba):
            ce_loss -= label * value[1] + (1 - label) * value[0]
        ce_loss /= len(labels_test)
        data = {
            "test_accuracy": test_accuracy,
            "test_precision": precision,
            "test_recall": recall,
            "F_score": 2 * precision * recall / (precision + recall),
            "mse_loss": np.sum(np.array(labels_test) - model.predict_proba(test_features)[:, 1]) ** 2 / len(labels_test),
            "ce_loss": ce_loss,
            # "param": search.best_params_,
            "label_and_result": list(zip(labels_test, model.predict(test_features).tolist(), result_proba[:, 1].tolist())),
        }
        print(data)
        json.dump(data, open(os.path.join(log_dir, "result{}.json".format(idx)), "w"), indent=4)

In [45]:
# run main function

main(train_test_ratio=0.2)
# main_overfit(train_test_ratio=0.2)
# main_kfold()

{'test_accuracy': 89.16666666666667, 'test_precision': 0.9473684210526315, 'test_recall': 0.84375, 'F_score': 0.8925619834710744, 'mse_loss': 0.35444012352634696, 'ce_loss': 0.3021937717745798, 'label_and_result': [(1, 1, 0.9717779710384378), (0, 0, 0.05764387732338032), (0, 0, 0.05758253763773014), (0, 0, 0.00011709378260378344), (0, 0, 0.10689998625846188), (1, 1, 0.9918503036207272), (1, 0, 0.16363114075239638), (0, 0, 0.11278845696993474), (1, 1, 0.5166689326057543), (0, 0, 0.0016667985830455777), (0, 0, 0.002895797770067001), (1, 0, 0.11169616995427373), (0, 0, 0.030519480357694227), (1, 1, 0.9859243212264455), (0, 0, 0.010549105097666498), (1, 1, 0.9926380361104022), (1, 1, 0.9346488503155645), (1, 1, 0.950851003355753), (1, 1, 0.954515626684332), (1, 1, 0.924166347719938), (1, 1, 0.9912134410603858), (0, 0, 0.0041674234961938976), (0, 0, 0.11495959955237615), (1, 1, 0.9999484508567833), (1, 1, 0.9084537144742204), (0, 0, 0.06987334370455268), (1, 1, 0.9958249815501198), (0, 0, 0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
