<h1>А-02-21 Енгоян Сергей ЛР 3</h1>

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
# The task is restricted to three newsgroups; `remove` lists the parts of every
# post (headers, footers, quoted replies) that the loader is asked to drop.
categories = ['comp.windows.x', 'rec.sport.baseball', 'rec.sport.hockey']
remove = ('headers', 'footers', 'quotes')

# Both splits are fetched with identical settings; only `subset` differs.
twenty_train, twenty_test = (
    fetch_20newsgroups(
        subset=split_name,
        shuffle=True,
        random_state=28,
        categories=categories,
        remove=remove,
    )
    for split_name in ('train', 'test')
)

In [5]:
# Seed for the random forest. Without it every run of the grid search grows
# different trees, so the reported scores and the selected parameters are not
# reproducible. 28 is the seed value this notebook already uses when shuffling
# the dataset.
RF_RANDOM_STATE = 28

# Both pipelines share the same text front end (token counts -> TF-IDF weights)
# and differ only in the final classifier.
text_clf_RF = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=RF_RANDOM_STATE)),
])

text_clf_MNB = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Grid over the shared vectorisation steps; searched for both models so that
# the two searches stay comparable.
_text_parameters = {
    'vect__max_features': (100, 1000, 5000, 10000),
    'vect__stop_words': ('english', None),
    'tfidf__use_idf': (True, False),
}

# Classifier-specific grids are layered on top of the shared one.
RF_parameters = {
    **_text_parameters,
    'clf__n_estimators': (5, 30, 100),
    'clf__criterion': ('gini', 'entropy'),
    'clf__max_depth': (5, 15, 50),
}

MNB_parameters = {
    **_text_parameters,
    'clf__alpha': (0.1, 1.2),
}

# 3-fold cross-validated search scored by weighted F1, using all CPU cores.
# `fit` returns the fitted search object; assigning it also keeps the cell from
# echoing the estimator repr as output.
gs_RF = GridSearchCV(
    text_clf_RF, RF_parameters, n_jobs=-1, cv=3, scoring='f1_weighted'
).fit(twenty_train.data, twenty_train.target)
gs_MNB = GridSearchCV(
    text_clf_MNB, MNB_parameters, n_jobs=-1, cv=3, scoring='f1_weighted'
).fit(twenty_train.data, twenty_train.target)

In [7]:
from sklearn.metrics import confusion_matrix, classification_report


def _evaluate_on_test(title, search):
    """Print the test-set report for one fitted grid search.

    Prints, in order: the title, the best parameter combination, the best
    cross-validation score, the confusion matrix and the classification report
    computed on ``twenty_test``.

    Returns the predictions made for ``twenty_test.data``.
    """
    print(title)
    predicted = search.predict(twenty_test.data)
    print(search.best_params_)
    print(search.best_score_)
    print(confusion_matrix(twenty_test.target, predicted))
    print(classification_report(twenty_test.target, predicted))
    return predicted


prediction_RF = _evaluate_on_test("Random Forest:", gs_RF)

prediction_MNB = _evaluate_on_test("Multinomial NB:", gs_MNB)

Random Forest:
{'clf__criterion': 'entropy', 'clf__max_depth': 15, 'clf__n_estimators': 100, 'tfidf__use_idf': False, 'vect__max_features': 10000, 'vect__stop_words': 'english'}
0.8756888665931926
[[346  48   1]
 [ 17 368  12]
 [  4  89 306]]
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       395
           1       0.73      0.93      0.82       397
           2       0.96      0.77      0.85       399

    accuracy                           0.86      1191
   macro avg       0.88      0.86      0.86      1191
weighted avg       0.88      0.86      0.86      1191

Multinomial NB:
{'clf__alpha': 0.1, 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': 'english'}
0.9310400841677736
[[375  10  10]
 [  9 351  37]
 [  4  14 381]]
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       395
           1       0.94      0.88      0.91       397
           2       0.89     

<h2>Векторизуем обучающую выборку</h2>

In [6]:
import gensim.downloader

# Load the pretrained embedding model named "glove-wiki-gigaword-100" through
# gensim's downloader. The cells below look words up in `glove_model` and add
# the returned vectors into accumulators of length 100.
# NOTE(review): presumably this downloads the model on first use — confirm
# against the gensim downloader documentation.
glove_model = gensim.downloader.load("glove-wiki-gigaword-100")

In [11]:
# Sanity check of the loaded vectors: list the nearest neighbours of "cat"
# (shown below as (word, similarity) pairs).
glove_model.most_similar("cat")

[('dog', 0.8798074722290039),
 ('rabbit', 0.7424427270889282),
 ('cats', 0.732300341129303),
 ('monkey', 0.7288709878921509),
 ('pet', 0.719014048576355),
 ('dogs', 0.7163872718811035),
 ('mouse', 0.6915250420570374),
 ('puppy', 0.6800068020820618),
 ('rat', 0.6641027331352234),
 ('spider', 0.6501135230064392)]

In [25]:
import pandas as pd

# Length of every document vector built from the word embeddings.
embedding_size = 100

# Bag-of-words view of the training texts with English stop words removed;
# its row count fixes the number of rows of the table below.
vectorizer = CountVectorizer(stop_words='english')
train_data = vectorizer.fit_transform(twenty_train.data)
words_vocab = vectorizer.get_feature_names_out()

# One all-zero row per training document, to be filled with averaged vectors.
glove_data = pd.DataFrame(
    data=np.zeros(shape=(train_data.shape[0], embedding_size))
)


In [44]:
# Replace every training text by the mean of the embedding vectors of its words.
for doc_idx, raw_text in enumerate(twenty_train.data):
    # Tokenise by lower-casing and splitting on whitespace, then keep only the
    # tokens that the embedding model knows.
    known_tokens = [
        token for token in raw_text.lower().split()
        if token in glove_model.key_to_index
    ]
    show_details = (doc_idx == 3)  # document 3 is printed as a worked example

    # Accumulate the vectors one by one into a zero-initialised sum.
    mean_vector = np.zeros(embedding_size)
    for token in known_tokens:
        mean_vector += glove_model[token]
        if show_details:
            print(token, ":", glove_model[token])

    # Turn the sum into an average; a text without known tokens stays all-zero.
    if known_tokens:
        mean_vector /= len(known_tokens)

    # Store the document vector in its row of the table.
    glove_data.loc[doc_idx] = mean_vector
    if show_details:
        print(twenty_train.data[doc_idx], ":", mean_vector)

print("Преобразование завершено. Размерность данных:", glove_data.shape)

i : [-0.046539   0.61966    0.56647   -0.46584   -1.189      0.44599
  0.066035   0.3191     0.14679   -0.22119    0.79239    0.29905
  0.16073    0.025324   0.18678   -0.31001   -0.28108    0.60515
 -1.0654     0.52476    0.064152   1.0358    -0.40779   -0.38011
  0.30801    0.59964   -0.26991   -0.76035    0.94222   -0.46919
 -0.18278    0.90652    0.79671    0.24825    0.25713    0.6232
 -0.44768    0.65357    0.76902   -0.51229   -0.44333   -0.21867
  0.3837    -1.1483    -0.94398   -0.15062    0.30012   -0.57806
  0.20175   -1.6591    -0.079195   0.026423   0.22051    0.99714
 -0.57539   -2.7266     0.31448    0.70522    1.4381     0.99126
  0.13976    1.3474    -1.1753     0.0039503  1.0298     0.064637
  0.90887    0.82872   -0.47003   -0.10575    0.5916    -0.4221
  0.57331   -0.54114    0.10768    0.39784   -0.048744   0.064596
 -0.61437   -0.286      0.5067    -0.49758   -0.8157     0.16408
 -1.963     -0.26693   -0.37593   -0.95847   -0.8584    -0.71577
 -0.32343   -0.43121 

In [None]:
def text2vec(text_data, model=None, embedding_size=None):
    """Represent each text as the average embedding of its known words.

    Every text is lower-cased and split on whitespace. The vectors of the
    tokens found in the embedding model's vocabulary are summed and divided by
    the number of such tokens. A text without any known token keeps an
    all-zero vector.

    Parameters
    ----------
    text_data : iterable of str
        The documents to convert (e.g. ``twenty_train.data``).
    model : optional
        Word-embedding model exposing ``key_to_index`` (vocabulary membership)
        and ``model[word]`` (vector lookup). Defaults to the notebook-level
        ``glove_model``.
    embedding_size : int, optional
        Length of the word vectors. Defaults to ``model.vector_size``, falling
        back to 100 when the model has no such attribute.

    Returns
    -------
    pandas.DataFrame
        One row per input text and ``embedding_size`` float64 columns.
    """
    if model is None:
        model = glove_model
    if embedding_size is None:
        embedding_size = getattr(model, "vector_size", 100)

    # Materialise once so that generators are supported and the row count is
    # known without fitting a throw-away CountVectorizer (which also failed on
    # inputs with an empty vocabulary).
    texts = list(text_data)
    vocabulary = model.key_to_index

    # Fill a plain array and wrap it in a DataFrame once at the end; writing
    # into a DataFrame row by row is much slower.
    doc_vectors = np.zeros((len(texts), embedding_size))

    for row, text in enumerate(texts):
        known_words = [word for word in text.lower().split() if word in vocabulary]
        if not known_words:
            continue  # nothing to average: the row stays all-zero
        for word in known_words:
            doc_vectors[row] += model[word]
        doc_vectors[row] /= len(known_words)

    glove_data = pd.DataFrame(doc_vectors)
    print("Преобразование завершено. Размерность данных:", glove_data.shape)
    return glove_data

    