In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from transformers import BertTokenizer, BertModel
import torch
from gensim import corpora, models

  from .autonotebook import tqdm as notebook_tqdm


**Loading data**

In [2]:
# Read the three corpus files '20docs1Dif.txt' .. '20docs3Dif.txt'.
# Each file becomes one list in `corpuses`; each line of a file becomes one
# whitespace-stripped string in that list.
corpuses = []
for i in range(3):
    # `with` closes the handle even if reading raises part-way through.
    # NOTE(review): the platform default text encoding is used — confirm it
    # matches how the files were written.
    with open('20docs' + str(i+1) + 'Dif.txt', 'r') as file:
        corpuses.append([line.strip() for line in file])

In [3]:
# Fetch the four selected 20 Newsgroups categories (train + test combined,
# with headers, footers and quoted replies removed) and keep the integer
# class labels as the classification target.
categories = [
    'comp.graphics',
    'rec.autos',
    'sci.med',
    'talk.politics.mideast',
]
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
# NOTE(review): Y is later paired row-by-row with documents read from the
# 20docs*Dif.txt files; presumably those files keep the order of this fetch —
# confirm.
Y = newsgroups.target

**Vectorising**


In [22]:
# Select the third loaded corpus (index 2) as the working corpus.
corpus = corpuses[2]
# Three independent score buffers, one per classifier; each has one slot per
# feature representation and is filled in by the training cell.
score_svm, score_bayes, score_tree = (np.zeros(3) for _ in range(3))

In [23]:
# Fit TF-IDF on the working corpus, then materialise the sparse result as a
# dense array for the classifiers.
vectorizer = TfidfVectorizer()
tfidf_sparse = vectorizer.fit_transform(corpus)
X_tfidf = tfidf_sparse.toarray()

In [24]:
# Use the first CUDA GPU when PyTorch reports one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [25]:
# Encode every document with pre-trained BERT and keep the last-layer hidden
# state at sequence position 0 (the [CLS] token) as the document vector.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)
model.eval()  # inference mode: disables dropout

doc_embeddings = []
with torch.no_grad():  # no gradients are needed for feature extraction
    for doc in corpus:
        # Tokenize one document at a time so only a single input lives on the
        # device; truncation=True cuts inputs that exceed the model's limit.
        encoded = tokenizer(doc, return_tensors='pt', padding=True, truncation=True).to(device)
        outputs = model(**encoded)
        embedding = outputs.last_hidden_state[:, 0, :]
        doc_embeddings.append(embedding.cpu().numpy())

# Stack once instead of re-copying the whole array for every document, and
# derive the shape from the data rather than hard-coding the corpus size.
if doc_embeddings:
    X_embed = np.vstack(doc_embeddings).astype(np.float64)
else:
    X_embed = np.empty((0, model.config.hidden_size))
assert X_embed.shape[0] == len(corpus), "expected one embedding row per document"

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
# Represent every document by its LDA topic distribution.
# The tokenized documents get their own name so `corpus` keeps holding the raw
# strings; this cell (and the TF-IDF / BERT cells) can then be re-run safely.
num_topics = 20
tokenized_docs = [doc.split() for doc in corpus]
dictionary = corpora.Dictionary(tokenized_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
# Fixed random_state so the topic model, and the scores built on it, are
# reproducible across kernel restarts.
lda_model = models.LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, random_state=0)

# One row per document, one column per topic. Writing by topic id keeps the
# columns aligned even if a topic is missing from a document's returned list,
# and pre-allocating avoids re-copying the array for every document.
X_lda = np.zeros((len(bow_corpus), num_topics))
for row, doc_bow in enumerate(bow_corpus):
    document_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    for topic_id, topic_prob in document_topics:
        X_lda[row, topic_id] = topic_prob

**Training and testing**

In [27]:
# Train and score three classifiers (SVM, Naive Bayes, decision tree) on each
# feature representation. Slot i of every score array belongs to Xs[i].
Xs = [X_tfidf, X_embed, X_lda]
for i, X in enumerate(Xs):
    # NOTE(review): the seed differs per representation (0, 11, 22), so the
    # three representations are evaluated on different train/test splits.
    split_seed = i * 11
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

    svm = SVC()
    # Gaussian NB for the BERT features (i == 1), multinomial NB otherwise —
    # presumably because the embeddings can be negative, which MultinomialNB
    # does not accept.
    bayes = GaussianNB() if i == 1 else MultinomialNB()
    # Seeded so the tree, and therefore the exported table, is reproducible.
    tree = DecisionTreeClassifier(random_state=split_seed)

    # Same fit -> predict -> score procedure for every classifier.
    for scores, classifier in ((score_svm, svm), (score_bayes, bayes), (score_tree, tree)):
        y_pred = classifier.fit(X_train, y_train).predict(X_test)
        scores[i] = f1_score(y_test, y_pred, average='micro')

**Exporting**

In [28]:
# Assemble the scores into one table (rows: feature representations,
# columns: classifiers) and write it to an Excel file with two decimals.
score_columns = {
    'SVM': score_svm,
    'Naive Bayes': score_bayes,
    'Decision Tree': score_tree,
}
row_labels = ['TfIdf', 'Bert embedding', 'LDA']
table = pd.DataFrame(score_columns, index=row_labels)
table.to_excel('Class.xlsx', float_format="%.2f")