In [None]:
# Take a model checkpoint and run it on the test split of a given dataset

In [1]:
import os
import csv
import torch
import numpy as np
from sklearn.manifold import TSNE
import time
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5,
                rc={"lines.linewidth": 2.5})
RS = 123
from sklearn.decomposition import PCA
import pandas as pd
import random

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f022c084c30>

output_list.pt stores the stacked predictions of the test samples in the format of [#samples, #classes]; label_list stores the true labels of all samples in a list of length #samples;

In [2]:
from pandas.core import base

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import time
import torch.nn.functional as F

import sys
#sys.path.append("../") # go to parent dir

from dataset import NumpyDataset, TransformerEnsembleDataset, TrainSamplerMultiClass, TrainSampler, TrainSamplerMultiClassUnit
from models import AggregateFeatEnsemble, DynamicWeightEnsemble, LogisticRegression, BertClassiferHyperparams, SimpleEnsemble, FixedWeightEnsemble
from utils import *  # bad practice, nvm
from contrastive_utils import compute_sim_matrix, compute_target_matrix, contrastive_loss

ckpt_dir = '../exp_data'

[nltk_data] Downloading package stopwords to /home/aibo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/aibo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def load_dataset_dataframe(source):
    print("Loading and processing dataframe")

    dataset_path = "datasets"

    dataset_file_name = {
        "enron": 'full_enron.csv',
        "imdb": 'full_imdb_feat.csv',
        "imdb62": 'full_imdb62.csv',
        "blog": 'full_blog.csv',
        "ccat50": "ccat50-auth-index.csv"
    }

    df = None

    if source == "enron":
        df = pd.read_csv(os.path.join(dataset_path, dataset_file_name[source]))
        df['name'] = df['From'].apply(lambda x: x.split("'")[1].split(".")[0])
        df['name_in_mail'] = df.apply(lambda x: is_name_in_email(x['name'], x['content']), axis=1)
        df = df[df['name_in_mail'] == 0]
        df = df[df['content'].apply(lambda x: '-----' not in str(x))]
        df = df[df['content'].apply(lambda x: "@" not in str(x))]
        df = df[df['content'].apply(lambda x: "From: " not in str(x))]
        df = df[df['content'].apply(lambda x: len(str(x).split()) > 10)]
        df.to_csv(os.path.join(dataset_path, 'full_enron2.csv'))

    elif source == "imdb":
        feat_path = os.path.join(dataset_dir, "full_imdb_feat.csv")
        if os.path.isfile(feat_path):
            df = pd.read_csv(feat_path, index_col=0)
        else:
            # # Parallelize apply on Pandas
            from pandarallel import pandarallel
            pandarallel.initialize()

            df = pd.read_csv(os.path.join(dataset_dir, 'full_imdb.csv'), index_col=0)
            print("drop rows!!!!!!!!!!!!!")
            drop_count = 0
            for index, row in df.iterrows():
                # print(row['content'])
                if len(str(row['content'])) <= 3:
                    df.drop(index, inplace=True)
                    drop_count += 1
            print(f"dropped {drop_count} rows")
            print(df.shape)
            df['content_tfidf'] = df['content'].parallel_apply(lambda x: process(x))
            df[["avg_len", "len_text", "len_words", "num_short_w", "per_digit", "per_cap", "f_a", "f_b", "f_c", "f_d",
                "f_e", "f_f", "f_g", "f_h", "f_i", "f_j", "f_k", "f_l", "f_m", "f_n", "f_o", "f_p", "f_q", "f_r", "f_s",
                "f_t", "f_u", "f_v", "f_w", "f_x", "f_y", "f_z", "f_0", "f_1", "f_2", "f_3", "f_4", "f_5", "f_6", "f_7",
                "f_8", "f_9", "f_e_0", "f_e_1", "f_e_2", "f_e_3", "f_e_4", "f_e_5", "f_e_6", "f_e_7", "f_e_8", "f_e_9",
                "f_e_10", "f_e_11", "richness"]] = df['content'].parallel_apply(lambda x: extract_style(x))
            df.to_csv(feat_path)

    elif source == "imdb62":
        df = pd.read_csv(os.path.join(dataset_path, dataset_file_name[source]))
        df = pd.read_csv(os.path.join(dataset_dir, "full_imdb62.csv"), index_col=0)

    elif source == "blog":
        df = pd.read_csv(os.path.join(dataset_path, dataset_file_name[source]))

    elif source == "ccat50":
        feat_path = os.path.join(dataset_dir, "full_ccat50_feat.csv")
        if os.path.isfile(feat_path):
            df = pd.read_csv(feat_path, index_col=0)
        else:
            df = pd.read_csv(os.path.join(dataset_path, dataset_file_name[source]))
            from pandarallel import pandarallel
            pandarallel.initialize()
            df['content_tfidf'] = df['content'].parallel_apply(lambda x: process(x))
            df[["avg_len", "len_text", "len_words", "num_short_w", "per_digit", "per_cap", "f_a", "f_b", "f_c", "f_d",
                    "f_e", "f_f", "f_g", "f_h", "f_i", "f_j", "f_k", "f_l", "f_m", "f_n", "f_o", "f_p", "f_q", "f_r", "f_s",
                    "f_t", "f_u", "f_v", "f_w", "f_x", "f_y", "f_z", "f_0", "f_1", "f_2", "f_3", "f_4", "f_5", "f_6", "f_7",
                    "f_8", "f_9", "f_e_0", "f_e_1", "f_e_2", "f_e_3", "f_e_4", "f_e_5", "f_e_6", "f_e_7", "f_e_8", "f_e_9",
                    "f_e_10", "f_e_11", "richness"]] = df['content'].parallel_apply(lambda x: extract_style(x))
        df.to_csv(feat_path)
    elif source == "turing":
        df = pd.read_csv(os.path.join(dataset_path, "turing_AA_train_test.csv"))
        df.sort_values(by=['train', 'From'], inplace=True, ascending=[False, True])

    return df

In [4]:
def test_bert(nlp_train, nlp_test, tqdm_on, num_authors = 0, 
              return_features=True, model_name='microsoft/deberta-base', embed_len=768, ckpt_name='', table_name=''):

    print("#####")
    print("Training BERT")
    from models import LogisticRegression
    from dataset import BertDataset
    from models import BertClassifier

    tokenizer, extractor = None, None
    if 'bert-base' in model_name:
        from transformers import BertTokenizer, BertModel
        tokenizer = BertTokenizer.from_pretrained(model_name)
        extractor = BertModel.from_pretrained(model_name)
    elif 'deberta' in model_name:
        from transformers import DebertaTokenizer, DebertaModel
        tokenizer = DebertaTokenizer.from_pretrained(model_name)
        extractor = DebertaModel.from_pretrained(model_name)
    elif 'roberta' in model_name:  # roberta-base
        from transformers import RobertaTokenizer, RobertaModel
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        extractor = RobertaModel.from_pretrained(model_name)
    elif 'gpt2' in model_name:
        from transformers import GPT2Tokenizer, GPT2Model
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        extractor = GPT2Model.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token  # for gpt tokenizer only
    elif 'gpt' in model_name:  # 'openai-gpt'
        from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
        tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name)
        extractor = OpenAIGPTModel.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.unk_token  # for gpt tokenizer only
        print(f'pad token {tokenizer.unk_token}')
    elif 'xlnet' in model_name:
        from transformers import XLNetTokenizer, XLNetModel
        tokenizer = XLNetTokenizer.from_pretrained(model_name)
        extractor = XLNetModel.from_pretrained(model_name)
    else:
        raise NotImplementedError(f"model {model_name} not implemented")

    # freeze extractor
    for param in extractor.parameters():
        param.requires_grad = True

    # business logic
    #train_x, train_y = nlp_train['content'].tolist(), nlp_train['Target'].tolist()
    #val_x, val_y = nlp_val['content'].tolist(), nlp_val['Target'].tolist()
    test_x, test_y = nlp_test['content'].tolist(), nlp_test['Target'].tolist()

    # training setup
    num_epochs, base_lr, base_bs, ngpus, dropout = 1, 1e-5, 8, torch.cuda.device_count(), 0.35
    num_tokens, hidden_dim, out_dim = 256, 512, max(test_y) + 1
    model = BertClassifier(extractor, LogisticRegression(embed_len * num_tokens, hidden_dim, out_dim, dropout=dropout))
#     model.load_state_dict(torch.load("./exp_data/3b_val_bert-base-cased_coe1.0_temp0.1_unit2_epoch8/3b_val_val0.61900_finale7.pt"))
    model.load_state_dict(torch.load(f'./exp_data/{ckpt_name}'))
    model = nn.DataParallel(model).cuda()
    
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=base_lr * ngpus, weight_decay=3e-4)
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, verbose=True)

#     train_set = BertDataset(train_x, train_y, tokenizer, num_tokens)
    #val_set = BertDataset(val_x, val_y, tokenizer, num_tokens)
    test_set = BertDataset(test_x, test_y, tokenizer, num_tokens, return_idx=True)

    coefficient, temperature, sample_unit_size = 0.0, 0.1, 2
    print(f'coefficient, temperature, sample_unit_size = {coefficient, temperature, sample_unit_size}')

    # recorder
    exp_dir = os.path.join(ckpt_dir, f'{id}_{model_name.split("/")[-1]}_coe{coefficient}_temp{temperature}_unit{sample_unit_size}_epoch{num_epochs}')
    writer = SummaryWriter(os.path.join(exp_dir, 'board'))

#     train_sampler = TrainSamplerMultiClassUnit(train_set, sample_unit_size=sample_unit_size)
#     train_loader = DataLoader(train_set, batch_size=base_bs * ngpus, sampler=train_sampler, shuffle=False,
#                               num_workers=4 * ngpus, pin_memory=True, drop_last=True)
    #val_loader = DataLoader(val_set, batch_size=base_bs * ngpus, shuffle=False, num_workers=4 * ngpus,
    #                        pin_memory=True, drop_last=True)
    test_loader = DataLoader(test_set, batch_size=base_bs * ngpus, shuffle=False, num_workers=4 * ngpus,
                             pin_memory=True, drop_last=True)
    
    model.eval()
    pg = tqdm(test_loader, leave=False, total=len(test_loader), disable=not tqdm_on)
#     output_list = torch.randn((6,10)).cuda()
#     feature_list = torch.randn((6, 100)).cuda()
    truth_list = []
    pred_list = []
    content_list = []
#     m = torch.nn.AvgPool2d((1, 2000), stride=(1, 1950))
    totalx = 0
    totaly = 0
    with torch.no_grad():
        test_acc = AverageMeter()
        for i, (x1, x2, x3, y, idx, content) in enumerate(pg):
            x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
            pred = model(x)
            correct = (pred.argmax(1) == y).sum().item()
            totalx += correct
#             print(correct)
            totaly += len(y)
            test_acc.update((pred.argmax(1) == y).sum().item() / len(y))
            pg.set_postfix({
                    'test acc': '{:.6f}'.format(test_acc.avg),
                })
            pred_label = np.argmax(pred.cpu().numpy(), axis=1).astype(np.int32)
            idx = idx.cpu()
            truth_label = y.cpu()
            if i == 0:
                pred_list = pred_label
                truth_list = truth_label
                idx_list = idx
                content_list = content
            else:
                pred_list = np.append(pred_list, pred_label)
                truth_list = np.append(truth_list, truth_label)
                idx_list = np.append(idx_list, idx)
                content_list = np.append(content_list, content)
#                 print(pred_list)
#                 print(truth_list)
#             if i == 100:
#                 break

#     print(test_acc.avg)
#     print(totalx)
#     print(totaly)
    correct_preds = [[] for i in range(num_authors)] # np.array([np.array([]) for i in range(10)])
    wrong_preds = [[] for i in range(num_authors)] # np.array([np.array([]) for i in range(10)])

    for i in range(len(idx_list)):
        if truth_list[i] == pred_list[i]:
            correct_preds[truth_list[i]].append(idx_list[i])
        else:
            wrong_preds[truth_list[i]].append([idx_list[i], pred_list[i]])
        
    header = ['index', 'author', 'is_correct', 'actual_pred', 'content']
    table_name = f'./{table_name}'
    f = open(table_name + ".csv", 'w', encoding='UTF8')
    writer = csv.writer(f)
    writer.writerow(header)
    
    for author in range(num_authors):
        for correct_pred in correct_preds[author]:
            writer.writerow([correct_pred, author, 1, author, content_list[correct_pred]])
        for wrong_pred in wrong_preds[author]:
            writer.writerow([wrong_pred[0], author, 0, wrong_pred[1], content_list[wrong_pred[0]]])
    
    f.close()

In [5]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

source = "turing"
df = load_dataset_dataframe(source)
list_senders = [20]

if source == "imdb62":
        list_senders = [62]

# start testing
for limit in list_senders:
    print("Number of authors: ", limit)
    nlp_train, nlp_test = None, None
    if source == "turing":
        df = pd.read_csv('datasets/turing_AA_train_test.csv')
        #print(df)
        nlp_train = df[df['train'] == 1]
        nlp_test = df[df['train'] == 0]
        nlp_test['Target'] = nlp_test['From']
    else:
        nlp_train, _1, nlp_test, _2, _3 = build_train_test(df, source, limit, per_author=None, seed=1)
    test_bert(nlp_train,
              nlp_test,
              tqdm_on=True,
              return_features=True,
             model_name='bert-base-cased',
              num_authors=limit,
              ckpt_name='21b15_val0.80590_e10.pt',
              table_name='21b'
             )


Loading and processing dataframe
Number of authors:  20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nlp_test['Target'] = nlp_test['From']


#####
Training BERT


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Logistic Regression classifier of dim (196608 512 20)


FileNotFoundError: [Errno 2] No such file or directory: './exp_data/21b15_val0.80590_e10.pt'