In [1]:
from utils.training_io import load_log, load_vectors
import os
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from scipy.sparse import hstack
import json
from tqdm import tqdm

In [2]:
# Silence scikit-learn ConvergenceWarning messages for the rest of the session.
# The grid search below fits LogisticRegression many times with its default
# settings; presumably some of those fits hit the iteration limit — TODO confirm
# that ignoring non-convergence is acceptable for the reported accuracies.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [3]:
# Locations of the artifacts produced by the earlier training run.
results_dir = "imdb_trainsize_experiment"
vectors_fn = "imdb_vectors_full_1e-03_120epoch_p30.jsonl"
log_fn = "imdb_log_full_1e-03_120epoch_p30.txt"

log_path = os.path.join(results_dir, log_fn)
vectors_path = os.path.join(results_dir, vectors_fn)

# Training log first, then the vectors file, which unpacks into
# (features, labels) pairs for the train, dev and test splits.
log = load_log(log_path)
(X_train, y_train,
 X_dev, y_dev,
 X_test, y_test) = load_vectors(vectors_path)

In [4]:
# Read the raw text of every document, one JSON object per line, for each split.
docs_dir = "IMDB_splits"
doc_dirs = ["train.jsonl", "dev.jsonl", "test.jsonl"]
docs_train, docs_dev, docs_test = [], [], []
docs = [docs_train, docs_dev, docs_test]
for split_docs, doc_fn in zip(docs, doc_dirs):
    # JSON text is UTF-8; pass the encoding explicitly so decoding does not
    # depend on the platform's locale default.
    with open(os.path.join(docs_dir, doc_fn), encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            split_docs.append(json.loads(line)["text"])


In [5]:
# Experiment schedule as (train_size, repeat_times) pairs, unzipped into the two
# parallel lists that the experiment loop iterates over.
train_size_list, repeat_times_list = map(list, zip(
    (20, 10), (200, 10), (2000, 10), (20000, 1), (60, 10), (30, 10), (10, 10),
    (20, 20), (200, 20), (2000, 20), (60, 20), (30, 20), (10, 20),
))

# Candidate values of C for the logistic-regression grid search.
Cs = np.logspace(start=-1, stop=9, num=21)
# Scale factors applied to the DV features when they are concatenated with BON.
multipliers = np.logspace(start=0, stop=2, num=9)

random_seed = 2

# Output file names.
result_fn = "imdb_trainsize_results.json"
sampled_inds_fn = "imdb_trainsize_sampled_inds.json"

In [6]:
# Column-oriented result table: one independent list per column; the experiment
# loop appends one entry to every column for each fitted configuration.
res = {
    column: []
    for column in (
        "id", "train_size", "C", "train_acc", "dev_acc", "test_acc", "model", "multiplier",
    )
}
# Maps each run id to the list of training indices sampled for that run.
sampled_inds = dict()

In [7]:
def get_nb_bon(docs_train, docs_dev, docs_test, y_train, bon_vectorizer):
    """Build Naive-Bayes-weighted bag-of-n-gram features for the three splits.

    The vectorizer's ``min_df`` is set to 3 when there are more than 1000
    training documents and to 2 otherwise (the passed-in vectorizer is
    modified in place). It is fitted on ``docs_train`` and then applied to the
    dev and test documents. A BernoulliNB model fitted on the training
    features provides per-feature log-probabilities; every feature column is
    scaled by the absolute difference between the first two rows of
    ``feature_log_prob_``.

    Returns a tuple ``(bon_train, bon_dev, bon_test)`` of CSR sparse matrices.
    """
    min_df = 3 if len(docs_train) > 1000 else 2
    bon_vectorizer.set_params(min_df=min_df)

    train_feats = bon_vectorizer.fit_transform(docs_train)
    dev_feats = bon_vectorizer.transform(docs_dev)
    test_feats = bon_vectorizer.transform(docs_test)

    classifier = BernoulliNB()
    classifier.fit(train_feats, y_train)
    log_probs = classifier.feature_log_prob_
    weights = np.abs(log_probs[0] - log_probs[1])

    # Column-wise scaling; convert back to CSR after the element-wise product.
    return tuple(
        feats.multiply(weights).tocsr()
        for feats in (train_feats, dev_feats, test_feats)
    )


def gridsearch_on_C(model:LogisticRegression, Cs, X_train, X_dev, X_test, y_train, y_dev, y_test):
    """Select the value of C with the highest dev accuracy and evaluate it.

    For every candidate in ``Cs`` the model is refitted on the training data
    and scored on the dev data. The first candidate reaching the highest dev
    score is kept (ties do not replace an earlier winner). The model is then
    refitted with that C and scored on the train and test data.

    Parameters
    ----------
    model : estimator exposing ``set_params(C=...)``, ``fit`` and ``score``.
        It is modified in place and left fitted with the selected C.
    Cs : iterable of candidate values for C.
    X_train, X_dev, X_test : feature matrices for the three splits.
    y_train, y_dev, y_test : matching label vectors.

    Returns
    -------
    (best_C, train_acc, dev_acc, test_acc) where ``dev_acc`` is the dev score
    obtained with ``best_C``.

    Raises
    ------
    ValueError
        If ``Cs`` yields no candidates.
    """
    # Start below any possible score so that the first candidate is always
    # accepted, even when every dev accuracy is 0.
    best_dev_acc = float("-inf")
    best_C = None
    for C in Cs:
        model.set_params(C=C)
        model.fit(X_train, y_train)
        dev_acc = model.score(X_dev, y_dev)
        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            best_C = C
    if best_C is None:
        raise ValueError("Cs must contain at least one candidate value")

    # Refit with the selected C so the reported train/test scores (and the
    # model left to the caller) correspond to the chosen setting.
    model.set_params(C=best_C)
    model.fit(X_train, y_train)
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    # Report the dev score of the selected C, not of the last candidate tried.
    return best_C, train_acc, best_dev_acc, test_acc

def normalize_text(text):
    '''
    Preprocess one document from the original IMDB dataset.

    The text is lower-cased, each of the punctuation marks . " , ( ) ! ? : ;
    is surrounded by a space on both sides, and every "<br />" tag or
    "\\x85" character is replaced by a single space.
    '''
    lowered = text.lower()
    # pad the listed punctuation marks with a space on each side
    padded = re.sub(r'([\.",\(\)\!\?:;])', r' \1 ', lowered)
    # drop the non-informational tag/symbol by turning it into a space
    return re.sub('<br />|\x85', ' ', padded)


In [8]:
# Training-size experiment.
# For every (train_size, repeat_times) pair, repeatedly sample a training subset,
# build the NB-weighted bag-of-n-gram features on it, and tune/evaluate a
# LogisticRegression on three feature sets: "DV" (the loaded vectors), "BON",
# and "DV + BON" (concatenation, with the DV part scaled by each multiplier).
# One row per fitted configuration is appended to `res`; the sampled training
# indices of each run are stored in `sampled_inds` under the run id.
bon_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), preprocessor=normalize_text)
rs = np.random.RandomState(random_seed)
test_id = 0

# Write outputs via explicit paths instead of os.chdir(): if this long-running
# cell raises or is interrupted, the kernel's working directory is unchanged and
# the earlier cells (which use paths relative to the notebook) can still be re-run.
result_path = os.path.join(results_dir, result_fn)
sampled_inds_path = os.path.join(results_dir, sampled_inds_fn)

# The context manager closes the progress bar even when the loop fails.
with tqdm(total=sum(repeat_times_list)) as pbar:
    for train_size, repeat_times in zip(train_size_list, repeat_times_list):
        for _ in range(repeat_times):
            # sample the training set
            if train_size < 20000:
                train_inds, _ = train_test_split(np.arange(len(X_train)), train_size=train_size, random_state=rs, stratify=y_train)
            else:
                # NOTE(review): assumes the full training set has exactly 20000
                # examples, so "sampling" is just a shuffle — confirm against len(X_train).
                train_inds = rs.permutation(20000)
            X_train_sampled = X_train[train_inds]
            y_train_sampled = y_train[train_inds]
            docs_train_sampled = [docs_train[d_i] for d_i in train_inds]
            sampled_inds[test_id] = train_inds.tolist() # save the sampled train inds
            # get bon
            bon_train, bon_dev, bon_test = get_nb_bon(docs_train_sampled, docs_dev, docs_test, y_train_sampled, bon_vectorizer)
            for model_, multipliers_ in zip(["DV", "BON", "DV + BON"], [[1.], [1.], multipliers]):
                for multiplier in multipliers_:
                    if model_=="DV":
                        X_train_m, X_dev_m, X_test_m = X_train_sampled, X_dev, X_test
                    elif model_=="BON":
                        X_train_m, X_dev_m, X_test_m = bon_train, bon_dev, bon_test
                    elif model_=="DV + BON":
                        X_train_m, X_dev_m, X_test_m = hstack((bon_train, X_train_sampled*multiplier), "csr"),\
                            hstack((bon_dev, X_dev*multiplier), "csr"),\
                            hstack((bon_test, X_test*multiplier), "csr")
                    # tune and train the model
                    model = LogisticRegression()
                    best_C, train_acc, dev_acc, test_acc = gridsearch_on_C(model,
                        Cs, X_train_m, X_dev_m, X_test_m, y_train_sampled, y_dev, y_test)

                    # save the result and sampled ids
                    res['id'].append(test_id)
                    res['train_size'].append(train_size)
                    res["C"].append(best_C)
                    res["multiplier"].append(multiplier)
                    res["train_acc"].append(train_acc)
                    res["dev_acc"].append(dev_acc)
                    res["test_acc"].append(test_acc)
                    res["model"].append(model_)

            test_id += 1
            pbar.update()

        # Checkpoint after each training size so partial results survive a crash.
        with open(result_path, 'w') as f:
            json.dump(res, f)
        with open(sampled_inds_path, 'w') as f:
            json.dump(sampled_inds, f)





100%|██████████| 181/181 [2:38:58<00:00, 52.70s/it]  
