In [25]:
import re
from math import log
from random import Random, shuffle

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm


# Shared NLTK normalizers used by the lemmatization and stemming cells.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# Load the training subset of 20 Newsgroups restricted to five categories;
# the commented-out entries are categories left out of this run.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=['alt.atheism',
                                                  # 'comp.graphics',
                                                  # 'comp.os.ms-windows.misc',
                                                  'comp.sys.ibm.pc.hardware',
                                                  # 'comp.sys.mac.hardware',
                                                  # 'comp.windows.x',
                                                  'misc.forsale',
                                                  'rec.autos',
                                                  # 'rec.motorcycles',
                                                  'rec.sport.baseball'])

In [26]:
def split_data(x):
    """Split feature rows ``x`` and the newsgroup labels into train/test parts.

    Labels are read from the module-level ``newsgroups_train.target``. 20% of
    the samples are held out and the split is seeded with ``random_state=42``.

    Returns:
        The ``train_test_split`` result: ``x_train, x_test, y_train, y_test``.
    """
    labels = newsgroups_train.target
    return train_test_split(x, labels, test_size=0.2, random_state=42)

In [27]:
# Tokenize every raw post; the result holds one token list per post.
tokenize_data = [nltk.word_tokenize(text) for text in newsgroups_train.data]

In [28]:
# Numeric literal: optional sign, digits with an optional fraction (or a bare
# fraction), and an optional exponent part.
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'

# NLTK's English stop words followed by a fixed list of punctuation tokens.
stop_words = [
    *stopwords.words('english'),
    "(", ")", ":", "@", "?", ",", "|", ">", "<", "]",
    "[", ".", "``", "\'\'", "--", "!", "-", "*", "..",
]


def delete_stopword(listw):
    """Lower-case tokens and drop stop words and purely numeric tokens.

    Args:
        listw: Iterable of string tokens.

    Returns:
        A new list with the surviving tokens, lower-cased, in input order.
    """
    # `stop_words` is a list, so testing membership against it directly
    # rescans the whole list for every token; a frozenset gives hash lookup.
    blocked = frozenset(stop_words)
    # Compile once per call instead of resolving the pattern for every token.
    is_number = re.compile(num_reg_exp).fullmatch

    lowered = (word.lower() for word in listw)
    return [word for word in lowered
            if word not in blocked and is_number(word) is None]

In [29]:
# Clean every tokenized post: lower-case, then drop stop words and numbers.
tokenize_data = list(map(delete_stopword, tokenize_data))

In [30]:
# Shuffle documents and labels together so every token list stays paired with
# its class. A dedicated, seeded generator replaces the unseeded module-level
# `shuffle`, so a fresh kernel reproduces the same order on every run.
data = list(zip(tokenize_data, newsgroups_train.target))
Random(42).shuffle(data)

# Write the shuffled pairs back in place. Only `tokenize_data` and
# `newsgroups_train.target` are reordered; `newsgroups_train.data` is left
# untouched, so it no longer lines up with the labels after this cell.
for i, (tokens, label) in enumerate(data):
    tokenize_data[i] = tokens
    newsgroups_train.target[i] = label

In [31]:
# Lemmatized copy of the corpus: every token passed through the lemmatizer.
lemmatize_data = [[lemmatizer.lemmatize(token) for token in tokens]
                  for tokens in tokenize_data]

In [32]:
# Stemmed copy of the corpus: every token passed through the Porter stemmer.
stemize_data = [[stemmer.stem(token) for token in tokens]
                for tokens in tokenize_data]

In [33]:
def get_vocab(data):
    """Build the vocabulary of a tokenized corpus and a word -> index lookup.

    Args:
        data: Iterable of documents, each an iterable of string tokens.

    Returns:
        A ``(vocab, idx_word)`` tuple. ``vocab`` is the sorted list of unique
        tokens and ``idx_word`` maps each token to its position in ``vocab``.
    """
    # Collect straight into a set instead of first concatenating every token
    # of the corpus into one list.
    unique_words = set()
    for sent in data:
        unique_words.update(sent)

    # Sort so the column order is the same in every session; iteration order
    # of a set of strings is not stable between interpreter runs.
    vocab = sorted(unique_words)
    idx_word = {word: idx for idx, word in enumerate(vocab)}

    return vocab, idx_word

In [34]:
def vectorize01(data):
    """Encode each document as a binary bag-of-words vector.

    Args:
        data: List of documents, each a list of tokens.

    Returns:
        One list per document, as long as the vocabulary, holding 1 at the
        index of every word that occurs in the document and 0 elsewhere.
    """
    vocab, idx_word = get_vocab(data)
    n_features = len(vocab)
    n_docs = len(data)
    vectors = []

    with tqdm(total=n_docs, position=0, leave=True) as pbar:
        for doc_no, tokens in enumerate(data, start=1):
            pbar.set_description(f"Doc: {doc_no}/{n_docs}")
            pbar.update()

            row = [0] * n_features
            for word in tokens:
                row[idx_word[word]] = 1
            vectors.append(row)

    return vectors

In [35]:
def vectorize0n(data):
    """Encode each document as a vector of relative term frequencies.

    Args:
        data: List of documents, each a list of tokens.

    Returns:
        One list per document, as long as the vocabulary; each occurrence of
        a word adds ``1 / len(document)`` at that word's index.
    """
    vocab, idx_word = get_vocab(data)
    n_features = len(vocab)
    n_docs = len(data)
    vectors = []

    with tqdm(total=n_docs, position=0, leave=True) as pbar:
        for doc_no, tokens in enumerate(data, start=1):
            pbar.set_description(f"Doc: {doc_no}/{n_docs}")
            pbar.update()

            row = [0] * n_features
            doc_len = len(tokens)
            for word in tokens:
                row[idx_word[word]] += 1 / doc_len
            vectors.append(row)

    return vectors

In [36]:
def count_docs_with_word(word, data):
    """Return how many documents in ``data`` contain ``word``."""
    return sum(1 for doc in data if word in doc)


def tf_idf(data):
    """Encode each document as a TF-IDF vector.

    Term frequency is accumulated as ``1 / len(document)`` per occurrence;
    inverse document frequency is ``log(n_docs / docs_containing_word)``.

    Args:
        data: List of documents, each a list of tokens.

    Returns:
        One list per document, as long as the vocabulary, with the TF-IDF
        weight at the index of every word present in the document.
    """
    vocab, idx_word = get_vocab(data)
    n_features = len(vocab)
    n_docs = len(data)

    # Pass 1: in how many documents does each vocabulary word occur?
    doc_freq = [0] * n_features
    with tqdm(total=n_docs, position=0, leave=True) as pbar:
        for doc_no, tokens in enumerate(data, start=1):
            pbar.set_description(f"Cycle: 1/2, Doc: {doc_no}/{n_docs}")
            pbar.update()

            # set() so a word repeated inside one document counts once.
            for word in set(tokens):
                doc_freq[idx_word[word]] += 1

    idf = [log(n_docs / freq) for freq in doc_freq]

    # Pass 2: per-document term frequencies scaled by the IDF weights.
    vectors = []
    with tqdm(total=n_docs, position=0, leave=True) as pbar:
        for doc_no, tokens in enumerate(data, start=1):
            pbar.set_description(f"Cycle: 2/2, Doc: {doc_no}/{n_docs}")
            pbar.update()

            row = [0] * n_features
            doc_len = len(tokens)

            for word in tokens:
                row[idx_word[word]] += 1 / doc_len

            # Apply the IDF factor exactly once per distinct word.
            for word in set(tokens):
                col = idx_word[word]
                row[col] *= idf[col]

            vectors.append(row)

    return vectors

In [37]:
def print_res(type_doc, type_vec, data):
    """Train a decision tree on vectorized documents and print its test score.

    Args:
        type_doc: Label of the text preprocessing variant, used in the output.
        type_vec: Label of the vectorization scheme, used in the output.
        data: Vectorized documents, passed to ``split_data``.
    """
    x_train, x_test, y_train, y_test = split_data(data)

    tree = DecisionTreeClassifier(random_state=42, max_depth=20)
    tree.fit(x_train, y_train)
    predictions = tree.predict(x_test)

    # The reported number is the micro-averaged F1 on the held-out part.
    score = f1_score(y_true=y_test, y_pred=predictions, average="micro")
    print(f"{type_doc} текст, векторизация {type_vec}: {round(score, 3)}")

In [38]:
# Binary bag-of-words on tokens without stemming or lemmatization.
vectorize01_void_data = vectorize01(tokenize_data)
print_res(type_doc="Необработанный", type_vec="0-1", data=vectorize01_void_data)

Doc: 2846/2846: 100%|█████████████████████████████████████████████████████████████| 2846/2846 [00:06<00:00, 417.12it/s]


Необработанный текст, векторизация 0-1: 0.779


In [39]:
# Relative term frequencies on tokens without stemming or lemmatization.
vectorize0n_void_data = vectorize0n(tokenize_data)
print_res(type_doc="Необработанный", type_vec="0-n", data=vectorize0n_void_data)

Doc: 2846/2846: 100%|█████████████████████████████████████████████████████████████| 2846/2846 [00:08<00:00, 327.89it/s]


Необработанный текст, векторизация 0-n: 0.758


In [24]:
# TF-IDF on tokens without stemming or lemmatization.
vectorize_tfidf_void_data = tf_idf(tokenize_data)
print_res(type_doc="Необработанный", type_vec="tf-idf", data=vectorize_tfidf_void_data)

Cycle: 1/2, Doc: 2846/2846: 100%|█████████████████████████████████████████████████| 2846/2846 [00:05<00:00, 545.12it/s]
Cycle: 2/2, Doc: 2846/2846: 100%|█████████████████████████████████████████████████| 2846/2846 [00:18<00:00, 150.90it/s]


Необработанный текст, векторизация tf-idf: 0.747


In [40]:
# Binary bag-of-words on the stemmed corpus.
vectorize01_stemize_data = vectorize01(stemize_data)
print_res(type_doc="Стеммированный", type_vec="0-1", data=vectorize01_stemize_data)

Doc: 2846/2846: 100%|█████████████████████████████████████████████████████████████| 2846/2846 [00:08<00:00, 343.71it/s]


Стеммированный текст, векторизация 0-1: 0.765


In [41]:
# Relative term frequencies on the stemmed corpus.
vectorize0n_stemize_data = vectorize0n(stemize_data)
print_res(type_doc="Стеммированный", type_vec="0-n", data=vectorize0n_stemize_data)

Doc: 2846/2846: 100%|█████████████████████████████████████████████████████████████| 2846/2846 [00:09<00:00, 303.71it/s]


Стеммированный текст, векторизация 0-n: 0.774


In [42]:
# TF-IDF on the stemmed corpus.
vectorize_tfidf_stemize_data = tf_idf(stemize_data)
print_res(type_doc="Стеммированный", type_vec="tf-idf", data=vectorize_tfidf_stemize_data)

Cycle: 1/2, Doc: 2846/2846: 100%|█████████████████████████████████████████████████| 2846/2846 [00:05<00:00, 487.14it/s]
Cycle: 2/2, Doc: 2846/2846: 100%|█████████████████████████████████████████████████| 2846/2846 [00:17<00:00, 158.21it/s]


Стеммированный текст, векторизация tf-idf: 0.77


In [43]:
# Binary bag-of-words on the lemmatized corpus.
vectorize01_lemmatize_data = vectorize01(lemmatize_data)
print_res(type_doc="Лемматизированный", type_vec="0-1", data=vectorize01_lemmatize_data)

Doc: 2846/2846: 100%|█████████████████████████████████████████████████████████████| 2846/2846 [00:10<00:00, 278.48it/s]


Лемматизированный текст, векторизация 0-1: 0.761


In [44]:
# Relative term frequencies on the lemmatized corpus.
vectorize0n_lemmatize_data = vectorize0n(lemmatize_data)
print_res(type_doc="Лемматизированный", type_vec="0-n", data=vectorize0n_lemmatize_data)

Doc: 2846/2846: 100%|█████████████████████████████████████████████████████████████| 2846/2846 [00:07<00:00, 358.40it/s]


Лемматизированный текст, векторизация 0-n: 0.767


In [45]:
# TF-IDF on the lemmatized corpus.
vectorize_tfidf_lemmatize_data = tf_idf(lemmatize_data)
print_res(type_doc="Лемматизированный", type_vec="tf-idf", data=vectorize_tfidf_lemmatize_data)

Cycle: 1/2, Doc: 2846/2846: 100%|█████████████████████████████████████████████████| 2846/2846 [00:05<00:00, 540.67it/s]
Cycle: 2/2, Doc: 2846/2846: 100%|█████████████████████████████████████████████████| 2846/2846 [00:20<00:00, 137.18it/s]


Лемматизированный текст, векторизация tf-idf: 0.768


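## Результаты (для 3 классов)

Метрика — micro-F1 на тестовой выборке (20 % данных). Значения ниже относятся к запуску на 3 классах, тогда как вывод ячеек выше получен на 5 категориях.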

Необработанный текст, векторизация 0-1: 0.85 

Необработанный текст, векторизация 0-n: 0.847

Необработанный текст, векторизация tf-idf: 0.844

---

Стеммированный текст, векторизация 0-1: 0.871

Стеммированный текст, векторизация 0-n: 0.853

Стеммированный текст, векторизация tf-idf: 0.871

---

Лемматизированный текст, векторизация 0-1: 0.871

Лемматизированный текст, векторизация 0-n: 0.859

Лемматизированный текст, векторизация tf-idf: 0.871