# Лабораторная работа №6
## Васильев А.Р. ИУ5-24М

### Цель лабораторной работы: изучение методов классификации текстов.

### Требования к отчету:
#### Отчет по лабораторной работе должен содержать:

- титульный лист;
- описание задания;
- текст программы;
- экранные формы с примерами выполнения программы.

#### Задание - для произвольного набора данных, предназначенного для классификации текстов, решите задачу классификации текста двумя способами:

- Способ 1. На основе CountVectorizer или TfidfVectorizer.
- Способ 2. На основе моделей word2vec или Glove или fastText.







In [None]:
import nltk
import spacy
import numpy as np
from sklearn.datasets import fetch_20newsgroups
nltk.download('punkt')
from nltk import tokenize
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Будем использовать датасет 20 newsgroups

In [None]:
categories = ["rec.autos", "rec.sport.hockey", "sci.crypt", "sci.med", "talk.religion.misc"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

In [None]:
unique, frequency = np.unique(newsgroups_train.target, 
                              return_counts = True)

In [None]:
for l, f in zip(unique, frequency):
  print(f'value: {l}, count: {f}')

value: 0, count: 594
value: 1, count: 600
value: 2, count: 595
value: 3, count: 594
value: 4, count: 377


In [None]:
print('Tokenizers NLTK have')
for i in dir(tokenize)[:16]:
  print(i)

Tokenizers NLTK have
BlanklineTokenizer
LineTokenizer
MWETokenizer
PunktSentenceTokenizer
RegexpTokenizer
ReppTokenizer
SExprTokenizer
SpaceTokenizer
StanfordSegmenter
TabTokenizer
TextTilingTokenizer
ToktokTokenizer
TreebankWordTokenizer
TweetTokenizer
WhitespaceTokenizer
WordPunctTokenizer


## Подготовка текстов

In [None]:
from spacy.lang.en import English
import spacy
from nltk.corpus import stopwords

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
nltk.download('stopwords')
stopwords_eng = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def prepare(t):
  # t = ' '.join([i.strip().lower() for i in t.split(' ')])
  t = re.sub(r'[^a-zA-Z0-9 \n]', '', t)
  t = re.sub('\s+', ' ', t)
  t = ' '.join([token.lemma_.lower() for token in nlp(t) if token not in stopwords_eng])
  return t

texts = newsgroups_train.data

texts_array = []

for text in texts:
  prepared_text = prepare(text)
  texts_array.append(prepared_text)

KeyboardInterrupt: ignored

In [None]:
len(texts_array), texts_array[-1]

In [None]:
test_texts_arr = []

test_texts = newsgroups_test.data

for text in test_texts:
  prepared_text = prepare(text)
  test_texts_arr.append(prepared_text)

## Способ 1 На основе CountVectorizer и TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
tfidf_vectorizer = TfidfVectorizer()

train_feature_matrix_tfidf = tfidf_vectorizer.fit_transform(texts_array)
test_feature_matrix__tfidf = tfidf_vectorizer.transform(test_texts_arr)

NameError: ignored

In [None]:
count_vectorizer = CountVectorizer()

train_feature_matrix_count = count_vectorizer.fit_transform(texts_array)
test_feature_matrix_count = count_vectorizer.transform(test_texts_arr)

NameError: ignored

In [None]:
target_values_train = newsgroups_train.target
target_values_test = newsgroups_test.target

knn with count vectorizer

In [None]:
knn_count = KNeighborsClassifier()

knn_count.fit(train_feature_matrix_count, target_values_train)
pred_count = knn_count.predict(test_feature_matrix_count)

print(classification_report(target_values_test, pred_count))

              precision    recall  f1-score   support

           0       0.38      0.71      0.49       396
           1       0.64      0.55      0.59       399
           2       0.63      0.53      0.58       396
           3       0.53      0.36      0.43       396
           4       0.57      0.35      0.44       251

    accuracy                           0.51      1838
   macro avg       0.55      0.50      0.51      1838
weighted avg       0.55      0.51      0.51      1838



knn with tfidf vectorizer

In [None]:
knn_tfidf = KNeighborsClassifier()

knn_tfidf.fit(train_feature_matrix_tfidf, target_values_train)
pred_knn = knn_tfidf.predict(test_feature_matrix__tfidf)

print(classification_report(target_values_test, pred_knn))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       396
           1       0.98      0.90      0.94       399
           2       0.84      0.90      0.87       396
           3       0.94      0.65      0.77       396
           4       0.57      0.88      0.69       251

    accuracy                           0.85      1838
   macro avg       0.85      0.85      0.84      1838
weighted avg       0.88      0.85      0.85      1838



## Способ 2 На основе моделей word2vec или Glove или fastText.

In [None]:
import tqdm
from gensim.models import Word2Vec
import gensim.downloader
# gensim.downloader.info()
# glove_vectors = gensim.downloader.load('glove-twitter-25')
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')



In [None]:
class GloveTokenizer:
  def __init__(self, glove_tokenizer):
    self.glove = glove_tokenizer
    self.token_length = 800
    self.embedding_size = 50

  def __getitem__(self, word):
    try:
      vector = glove_vectors.get_vector(word).reshape(1, self.embedding_size)
    except KeyError as e:
      vector = np.zeros((1, self.embedding_size))
    return vector


  def __padd(self, sentence):
    padded_sentence = np.zeros((self.token_length, self.embedding_size))
    for i, token in enumerate(sentence):
        padded_sentence[i] = token
    return padded_sentence
  
  def tokenize(self, sentence):
    encoded_sentence = []
    sentence = sentence.strip(' ').split(' ')
    for i in sentence:
      token = self.__getitem__(i)
      encoded_sentence.append(token)    
    return np.array(self.__padd(encoded_sentence), dtype=np.float16)

tokenizer = GloveTokenizer(glove_vectors)    

In [None]:
def prepare(t):
  # t = ' '.join([i.strip().lower() for i in t.split(' ')])
  t = re.sub(r'[^a-zA-Z0-9 \n]', '', t)
  t = re.sub('\s+', ' ', t)
  lemmas = [token.lemma_.lower() for token in nlp(t) if token not in stopwords_eng]
  t = ' '.join(lemmas)
  vectors = tokenizer.tokenize(t)
  return vectors, len(lemmas)

vectors_array_train = []
labels_train = []

for enum, text, label in zip(range(len(newsgroups_train.data)), newsgroups_train.data, newsgroups_train.target):
  try:
    vector, length = prepare(text)
    # print(vector, vector.shape)
    vectors_array_train.append(vector)
    labels_train.append(label)
  except IndexError as e:
    print(enum, e)
    continue


vectors_array_train = np.array(vectors_array_train)
print(vectors_array_train.shape)
train_data = vectors_array_train.reshape((-1, vectors_array_train.shape[1]*vectors_array_train.shape[2]))
train_data.shape

56 index 800 is out of bounds for axis 0 with size 800
58 index 800 is out of bounds for axis 0 with size 800
93 index 800 is out of bounds for axis 0 with size 800
112 index 800 is out of bounds for axis 0 with size 800
145 index 800 is out of bounds for axis 0 with size 800
147 index 800 is out of bounds for axis 0 with size 800
159 index 800 is out of bounds for axis 0 with size 800
214 index 800 is out of bounds for axis 0 with size 800
215 index 800 is out of bounds for axis 0 with size 800
217 index 800 is out of bounds for axis 0 with size 800
222 index 800 is out of bounds for axis 0 with size 800
225 index 800 is out of bounds for axis 0 with size 800
248 index 800 is out of bounds for axis 0 with size 800
265 index 800 is out of bounds for axis 0 with size 800
267 index 800 is out of bounds for axis 0 with size 800
268 index 800 is out of bounds for axis 0 with size 800
281 index 800 is out of bounds for axis 0 with size 800
298 index 800 is out of bounds for axis 0 with size

(2610, 40000)

In [None]:
vectors_array_test = []
labels_test= []

for enum, text, label in zip(range(len(newsgroups_test.data)), newsgroups_test.data, newsgroups_test.target):
  try:
    vector, length = prepare(text)
    vectors_array_test.append(vector)
    labels_test.append(label)
  except IndexError as e:
    print(enum, e)
    continue

67 index 800 is out of bounds for axis 0 with size 800
76 index 800 is out of bounds for axis 0 with size 800
124 index 800 is out of bounds for axis 0 with size 800
137 index 800 is out of bounds for axis 0 with size 800
155 index 800 is out of bounds for axis 0 with size 800
187 index 800 is out of bounds for axis 0 with size 800
292 index 800 is out of bounds for axis 0 with size 800
298 index 800 is out of bounds for axis 0 with size 800
350 index 800 is out of bounds for axis 0 with size 800
432 index 800 is out of bounds for axis 0 with size 800
435 index 800 is out of bounds for axis 0 with size 800
458 index 800 is out of bounds for axis 0 with size 800
476 index 800 is out of bounds for axis 0 with size 800
484 index 800 is out of bounds for axis 0 with size 800
525 index 800 is out of bounds for axis 0 with size 800
556 index 800 is out of bounds for axis 0 with size 800
558 index 800 is out of bounds for axis 0 with size 800
618 index 800 is out of bounds for axis 0 with siz

In [None]:
vectors_array_test = np.array(vectors_array_test)
test_data = vectors_array_test.reshape((-1, vectors_array_test.shape[1]*vectors_array_test.shape[2]))
test_data.shape

(1770, 40000)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()

knn_clf.fit(train_data, labels_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
pred = knn_clf.predict(test_data[:800])

print(classification_report(labels_test[:800], pred))

              precision    recall  f1-score   support

           0       0.31      0.70      0.43       172
           1       0.59      0.25      0.36       173
           2       0.46      0.44      0.45       179
           3       0.38      0.17      0.24       162
           4       0.33      0.28      0.30       114

    accuracy                           0.38       800
   macro avg       0.42      0.37      0.36       800
weighted avg       0.42      0.38      0.36       800

