In [1]:
import torch
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, GPT2Tokenizer, GPT2Model

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

from tqdm import tqdm

lemmatizer = WordNetLemmatizer()
newsgroups_train = fetch_20newsgroups(subset='all',
                                      categories=['alt.atheism',
                                                  # 'comp.graphics',
                                                  # 'comp.os.ms-windows.misc',
                                                  'comp.sys.ibm.pc.hardware',
                                                  # 'comp.sys.mac.hardware',
                                                  # 'comp.windows.x',
                                                  'misc.forsale',
                                                  'rec.autos',
                                                  # 'rec.motorcycles',
                                                  'rec.sport.baseball'],
                                      remove=("header",))

In [2]:
def split_data(x):
    return train_test_split(x, newsgroups_train.target, test_size=0.2, random_state=42)

In [3]:
def get_res(typename, dataset):
    x_train, x_test, y_train, y_test = split_data(dataset)
    print(typename)

    clf = RandomForestClassifier(random_state=42, n_estimators=20, max_depth=20)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"Random Forest:     {round(acc_test, 3)}")

    '''
    clf = GradientBoostingClassifier(random_state=42, n_estimators=5)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"Gradient Boosting: {round(acc_test, 3)}")
    '''

    clf = AdaBoostClassifier(algorithm='SAMME', random_state=42, n_estimators=20)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"Ada Boost:         {round(acc_test, 3)}")

In [4]:
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/", "^"
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#", "="]
stop_words = stopwords.words('english') + special_sym


def delete_stopword_and_lemmatize(listw):
    res = []
    for word in listw:
        word = lemmatizer.lemmatize(word.lower())
        if word not in stop_words and re.fullmatch(num_reg_exp, word) is None\
            and not any(sym in word for sym in special_sym):
            res += [word]
    return res

In [5]:
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertModel.from_pretrained("bert-base-uncased")

In [6]:
tokenizer_GPT2 = GPT2Tokenizer.from_pretrained("gpt2")
model_GPT2 = GPT2Model.from_pretrained("gpt2")

In [7]:
tokenizer_roberta = RobertaTokenizer.from_pretrained("roberta-base")
model_roberta = RobertaModel.from_pretrained("roberta-base")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
tokenize_data = [delete_stopword_and_lemmatize(nltk.word_tokenize(newsgroups_train.data[i]))
                 for i in range(len(newsgroups_train.data))]

In [9]:
preproc_data_bert = [tokenizer_bert(" ".join(tokenize_data[i]), padding=True, truncation=True, return_tensors="pt")
                     for i in range(len(tokenize_data))]

In [10]:
preproc_data_roberta = [tokenizer_roberta(" ".join(tokenize_data[i]), padding=True, truncation=True, return_tensors="pt")
                        for i in range(len(tokenize_data))]

In [11]:
preproc_data_GPT2 = [tokenizer_GPT2(" ".join(tokenize_data[i]), truncation=True, return_tensors="pt")
                     for i in range(len(tokenize_data))]

In [None]:
with torch.no_grad():
    vec_data_bert, vec_data_roberta, vec_data_GPT2 = [], [], []
    len_docs = len(tokenize_data)

    with tqdm(total=len_docs, position=0, leave=True) as pbar:
        for idxdoc in range(len_docs):

            pbar.set_description(f"Doc: {idxdoc+1}/{len_docs}")
            pbar.update()

            vec_data_bert += [model_bert(**preproc_data_bert[idxdoc]).last_hidden_state[:, 0, :].tolist()[0]]
            vec_data_roberta += [model_roberta(**preproc_data_roberta[idxdoc]).last_hidden_state[:, 0, :].tolist()[0]]
            vec_data_GPT2 += [model_GPT2(**preproc_data_GPT2[idxdoc]).last_hidden_state[:, 0, :].tolist()[0]]

Doc: 18/4740:   0%|▏                                                               | 18/4740 [00:29<3:05:44,  2.36s/it]

In [54]:
get_res("Bert", vec_data_bert)
print("--------------------------------")
get_res("Roberta:", vec_data_roberta)
print("--------------------------------")
get_res("GPT2:", vec_data_GPT2)

ZZZ
Random Forest:     0.818
Ada Boost:         0.66
