In [1]:
import random

""" Imports """

import re

import nltk
import pandas as pd

# Загрузка необходимых ресурсов
nltk.download('vader_lexicon')
nltk.download('gutenberg')
nltk.download('book')
nltk.download('punkt') 
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('reuters')

from nltk import NaiveBayesClassifier, classify, DecisionTreeClassifier
from nltk.book import *
from nltk.text import Text as nltk_text_type

from nltk.tokenize import word_tokenize

from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.corpus import wordnet

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/kapuchinka/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/kapuchinka/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/kapuchinka/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /home/kapuchinka/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /home/kapuchinka/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/kapuchinka/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloadi

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


### Модуль Токенизация Слов

In [2]:
def tokenization_words(text: nltk_text_type, language: str = 'english'):
    word_list = text.tokens
    text_stroke = " ".join(word_list)
    words = word_tokenize(text_stroke, language)
    words_tokenize = [word for word in words if re.match(r'\w+', word)]   
    
    return words_tokenize

### Модуль удаления Стоп-Слов

In [3]:
def remove_stopwords(words_tokenize, language: str = 'english'):
    stop_words = set(stopwords.words(language))
    words_without_stopwords = [word for word in words_tokenize if word.lower() not in stop_words]     
    
    return words_without_stopwords

### Лемматизация

In [4]:
def get_wordnet_pos(treebank_tag):
    """
    Преобразует POS-тег NLTK в формат WordNet.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatizate_text(filtered_words):
    pos_tags = pos_tag(filtered_words)
    
    lemmatized_tokens = []
    for word, tag in pos_tags:
        wordnet_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word.lower(), pos=wordnet_pos)
        lemmatized_tokens.append(lemma)

    return lemmatized_tokens

In [5]:
def filtration_documents(corpus):
    local_documents = []
    for category in corpus.categories():
        for fileid in corpus.fileids(category):
            tokenize_words = corpus.words(fileid)
            words_without_stopwords = remove_stopwords(tokenize_words, language='english')
            lemmas_words = lemmatizate_text(words_without_stopwords)
            local_documents.append((lemmas_words, category))
           
    random.shuffle(local_documents) 
    return local_documents

### Часто встречающиеся слова

In [6]:
def most_common_words(corpus):
    tokenize_words = corpus.words()
    words_without_stopwords = remove_stopwords(tokenize_words, language='english')
    
    words_map = FreqDist(words_without_stopwords)
    filtered_words_map = [word for word in words_map.keys() if words_map[word] >= 10]
    
    local_word_features = filtered_words_map[:3000]
    
    return local_word_features

### Создание вектора признаков

In [7]:
def document_features(document, param_word_features):
    document_words = set(document)
    features = {}

    # Признаки наличия слов из word_features
    for word in param_word_features:
        features[f'contains({word})'] = (word in document_words)

    return features

### Получение обучающей и тестовых выборок

In [8]:
def get_sets(param_documents):
    word_features = most_common_words(reuters)
    featuresets = [(document_features(d, word_features), c) for (d, c) in param_documents]

    test_size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[test_size:], featuresets[:test_size]
    
    return {
        'test_size': test_size,
        'train_set': train_set,
        'test_set': test_set
    }

### Обучение и получение Наивного Байесовского Классификатора

In [9]:
def get_naive_bayes_classifier(param_train_set):
    local_nb_classifier = NaiveBayesClassifier.train(param_train_set)
    
    return local_nb_classifier

### Получение точности Наивного Байесовского Классификатора

In [10]:
def get_accurancy_naive_bayes_classifier(classifier, param_test_set):
    local_nb_accuracy = classify.accuracy(classifier, param_test_set)
    print(f'Naive Bayes Accuracy: {local_nb_accuracy:.2f}')

### Получение информативных признаков

In [11]:
def get_informative_features(classifier, n):
    classifier.show_most_informative_features(n)

### Обучение и получение Дерева Принятия Решений 

In [12]:
def get_decision_tree_classifier(param_train_set):
    local_dt_classifier = DecisionTreeClassifier.train(param_train_set)
    return local_dt_classifier

### Получение точности Дерева Принятия Решений

In [13]:
def get_accurancy_decision_tree_classifier(classifier, param_test_set):
    local_dt_accuracy_nltk = classify.accuracy(classifier, param_test_set)
    print(f'Decision Tree Accuracy: {local_dt_accuracy_nltk:.2f}')

### Получение оценки точности при использовании комбинированного классификатора

In [14]:
def ensemble_classify(features, param_nb_classifier, param_dt_classifier):
    nb_vote = param_nb_classifier.classify(features)
    dt_vote = param_dt_classifier.classify(features)
    return dt_vote if nb_vote == dt_vote else nb_vote

### Тестирование

In [15]:
def testing(n, param_test_set, param_nb_classifier, param_dt_classifier):
    results = []
    
    for _ in range(n):
        sample = random.choice(param_test_set)
        test_features, true_label = sample
        nb_result = param_nb_classifier.classify(test_features)
        dt_result = param_dt_classifier.classify(test_features)
        ensemble_result = ensemble_classify(test_features, param_nb_classifier, param_dt_classifier)
    
        # Добавляем результаты в список
        results.append([
            true_label,
            nb_result,
            dt_result,
            ensemble_result
        ])
        
    columns = ['Категория', 'Наивный Байесовский', 'Дерево Решений', 'Комбинированный']
    results_df = pd.DataFrame(results, columns=columns)
    
    return results_df

# Эксперимент

In [16]:
documents = filtration_documents(reuters)
sets = get_sets(documents)
nb_classifier = get_naive_bayes_classifier(sets['train_set'])
dt_classifier = get_decision_tree_classifier(sets['train_set'])

In [17]:
get_accurancy_naive_bayes_classifier(nb_classifier, sets['test_set'])

Naive Bayes Accuracy: 0.61


In [18]:
get_informative_features(nb_classifier, 10)

Most Informative Features
        contains(coffee) = True           coffee : earn   =   2360.4 : 1.0
          contains(palm) = True           palm-o : earn   =   2337.1 : 1.0
       contains(sorghum) = True           sorghu : earn   =   2334.1 : 1.0
        contains(nickel) = True           nickel : earn   =   2238.3 : 1.0
     contains(economist) = True             rand : earn   =   2073.8 : 1.0
          contains(crop) = True           copra- : earn   =   1481.2 : 1.0
         contains(juice) = True           orange : earn   =   1412.9 : 1.0
          contains(rice) = True             rice : acq    =   1389.1 : 1.0
         contains(cargo) = True           propan : earn   =   1382.5 : 1.0
       contains(naphtha) = True           naphth : acq    =   1324.1 : 1.0


In [19]:
get_accurancy_decision_tree_classifier(dt_classifier, sets['test_set'])

Decision Tree Accuracy: 0.58


In [20]:
testing(20, sets['test_set'], nb_classifier, dt_classifier)

Unnamed: 0,Категория,Наивный Байесовский,Дерево Решений,Комбинированный
0,trade,trade,trade,trade
1,acq,acq,acq,acq
2,interest,interest,dlr,interest
3,sorghum,barley,grain,barley
4,acq,acq,acq,acq
5,earn,earn,earn,earn
6,interest,interest,interest,interest
7,grain,wheat,grain,wheat
8,earn,earn,earn,earn
9,coffee,coffee,coffee,coffee
