<h1>А-02-21 Енгоян Сергей ЛР 3</h1>

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
categories = ['comp.windows.x', 'rec.sport.baseball', 'rec.sport.hockey']
remove = ('headers', 'footers', 'quotes')

twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=28, categories=categories, remove=remove)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=28, categories=categories, remove=remove)

In [5]:
text_clf_RF = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', RandomForestClassifier ()),]) 

text_clf_MNB = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB()),])

RF_parameters = {'vect__max_features': (100,1000,5000,10000),
              'vect__stop_words': ('english', None),
              'tfidf__use_idf': (True, False),              
              'clf__n_estimators': (5, 30, 100),
              'clf__criterion': ('gini', 'entropy'),
              'clf__max_depth': (5, 15, 50),
              }

MNB_parameters = {'vect__max_features': (100,1000,5000,10000),
              'vect__stop_words': ('english', None),
              'tfidf__use_idf': (True, False),
              'clf__alpha': (0.1, 1.2),
              }

gs_RF = GridSearchCV(text_clf_RF, RF_parameters, n_jobs=-1, cv=3, scoring = 'f1_weighted')
gs_MNB = GridSearchCV(text_clf_MNB, MNB_parameters, n_jobs=-1, cv=3, scoring = 'f1_weighted')

gs_RF = gs_RF.fit(twenty_train.data, twenty_train.target)
gs_MNB = gs_MNB.fit(twenty_train.data, twenty_train.target)

In [7]:
from sklearn.metrics import confusion_matrix, classification_report

print("Random Forest:")
prediction_RF = gs_RF.predict(twenty_test.data)
print(gs_RF.best_params_)
print(gs_RF.best_score_)
print (confusion_matrix(twenty_test.target, prediction_RF))
print(classification_report(twenty_test.target, prediction_RF))

print("Multinomial NB:")
prediction_MNB = gs_MNB.predict(twenty_test.data)
print(gs_MNB.best_params_)
print(gs_MNB.best_score_)
print (confusion_matrix(twenty_test.target, prediction_MNB))
print(classification_report(twenty_test.target, prediction_MNB))

Random Forest:
{'clf__criterion': 'entropy', 'clf__max_depth': 15, 'clf__n_estimators': 100, 'tfidf__use_idf': False, 'vect__max_features': 10000, 'vect__stop_words': 'english'}
0.8756888665931926
[[346  48   1]
 [ 17 368  12]
 [  4  89 306]]
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       395
           1       0.73      0.93      0.82       397
           2       0.96      0.77      0.85       399

    accuracy                           0.86      1191
   macro avg       0.88      0.86      0.86      1191
weighted avg       0.88      0.86      0.86      1191

Multinomial NB:
{'clf__alpha': 0.1, 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': 'english'}
0.9310400841677736
[[375  10  10]
 [  9 351  37]
 [  4  14 381]]
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       395
           1       0.94      0.88      0.91       397
           2       0.89     

<h>Векторизуем обучающую выборку</h>

In [6]:
import gensim.downloader

glove_model = gensim.downloader.load("glove-wiki-gigaword-100")

In [192]:
glove_model.most_similar("cat")

[('dog', 0.8798074722290039),
 ('rabbit', 0.7424427270889282),
 ('cats', 0.732300341129303),
 ('monkey', 0.7288709878921509),
 ('pet', 0.719014048576355),
 ('dogs', 0.7163872718811035),
 ('mouse', 0.6915250420570374),
 ('puppy', 0.6800068020820618),
 ('rat', 0.6641027331352234),
 ('spider', 0.6501135230064392)]

In [193]:
import pandas as pd

In [194]:
vectorizer = CountVectorizer(stop_words='english')
train_data = vectorizer.fit_transform(twenty_train['data'])
CV_data=pd.DataFrame(train_data.toarray(), columns=vectorizer.get_feature_names_out())
words_vocab=CV_data.columns

In [195]:
CV_data

Unnamed: 0,00,000,0000,00000000,00000074,00000093,000000e5,000005102000,00000510200001,00000ee5,...,zooms,zpixmap,ztivax,zubov,zuma,zupancic,zupcic,zurich,zzzzzz,zzzzzzt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1785,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1787,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [211]:
def text2vec(text_data):
    X = vectorizer.transform(text_data)
    CV_text_data=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    CV_text_data

    glove_data=pd.DataFrame()

    for i in range(CV_text_data.shape[0]):
        one_doc = np.zeros(100) 
        
        for word in words_vocab[CV_text_data.iloc[i,:] >= 1]:
            if word in glove_model.key_to_index.keys(): 
                one_doc += glove_model[word]

        glove_data=pd.concat([glove_data, pd.DataFrame([one_doc])]) 
    return glove_data

    

In [212]:
train_data_glove = text2vec(twenty_train['data'])
train_data_glove

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-1.802762,-1.916514,5.472597,-8.130228,-2.016533,0.261423,0.471270,3.956486,-4.629046,-3.603064,...,2.537454,-3.699753,-0.550084,-1.390797,-8.852982,-0.080768,-1.450536,-0.647836,5.364653,2.301254
0,2.953440,4.005725,25.814343,-30.140322,-18.340198,7.794250,4.487397,5.520217,-4.862631,-10.701652,...,-6.996410,-4.326796,-12.399193,8.224525,-28.515959,-4.598999,-10.521591,-2.853545,6.758008,11.597640
0,-0.954999,1.929175,10.614173,-28.361270,7.790991,7.808651,28.265219,5.091243,-37.097965,-6.522101,...,-15.379065,-4.843169,-3.871302,8.972648,-38.334385,2.432221,-13.361095,26.352092,18.890035,-17.538002
0,2.169567,-0.783640,2.556040,-5.433034,-3.848164,2.388939,0.875124,-1.419949,1.563489,-3.132857,...,-0.565526,-0.339456,-0.663260,1.597031,-1.655165,-0.288279,-1.986810,-0.976680,1.120928,0.486246
0,-0.937480,2.515804,4.916899,-2.535192,-1.531810,0.384548,-4.356812,0.196522,-1.500154,0.027226,...,-0.961028,-0.103143,-0.669155,-2.827078,-6.941850,-2.161503,-3.673758,-4.146504,3.258953,1.585434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,-7.833503,11.810429,8.280787,-7.248842,1.842863,3.824204,-3.443792,2.402598,2.907838,-3.039019,...,-1.878268,7.366170,0.487987,0.186610,2.343636,-2.450333,-5.485956,-15.403860,13.940103,9.492050
0,-0.281965,3.699641,8.483970,-1.721913,-4.532177,4.614038,1.098552,-2.069401,-4.286052,-4.018002,...,-5.626140,1.398306,-1.257002,-2.362422,-2.848462,-1.510771,-4.306250,-3.422756,2.613457,3.818592
0,-13.761515,2.432639,13.806734,-5.083889,-0.692241,1.184530,-3.129054,7.152224,1.569194,2.345650,...,-2.702854,7.996242,-5.975296,-0.541540,-9.969251,1.624959,-1.508043,-12.247111,16.716012,0.023420
0,-7.511736,18.076480,8.069505,-4.349498,12.932268,4.036430,-7.540680,2.843563,0.508248,-1.247285,...,-7.817685,5.326844,4.517724,-10.893979,-12.810062,-3.276873,-10.547428,-19.512687,32.992341,7.764178


In [213]:
test_data_glove = text2vec(twenty_test['data'])
test_data_glove

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-6.650751,3.714569,6.508652,-2.719378,4.427562,7.690161,-6.766014,3.306249,3.120109,-5.167049,...,-6.299061,9.371367,9.176817,-3.930847,-0.064854,-1.326194,-0.387772,-7.038005,15.113263,4.118097
0,3.075488,-0.646264,8.879705,-5.235366,-11.169950,2.159033,-2.438678,0.557687,-4.824220,-4.184909,...,-2.407467,-3.780872,-8.429704,0.685585,-1.061549,1.975551,-2.266881,1.863345,5.931201,3.676485
0,-5.848390,10.044284,9.277847,-6.543268,-9.733286,6.054838,-3.885795,3.321960,-16.731579,-12.587014,...,-1.300718,-6.195644,-3.408787,-11.330779,-11.902068,-4.266645,-11.153696,2.450430,23.044766,8.618176
0,9.689988,3.765853,11.390716,-16.832048,-2.222389,8.293272,3.303914,1.327348,-7.366230,-2.947535,...,-0.597256,-6.709495,-4.840572,-0.179670,-19.611634,1.277730,-5.699159,4.283576,8.734602,-1.219709
0,6.046416,9.212171,23.731612,-16.770103,-6.399876,7.651130,2.582575,7.604413,-12.812021,-12.856395,...,-5.478889,-7.515360,-0.556940,-1.163514,-27.470242,-3.299861,-12.208818,2.117422,20.617426,4.508136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,-0.196185,20.991767,23.401272,-24.073394,-19.736613,6.239452,9.349141,-7.951416,-36.304919,-13.557415,...,0.033848,-13.580938,-6.138717,-1.880064,-26.306484,3.669983,-11.270863,-1.724129,16.177676,-10.515575
0,-1.463923,1.643600,3.673456,-1.849041,-1.966477,-0.213470,-1.950146,0.156367,-4.127250,-2.986270,...,-0.896351,-1.584707,0.050629,-1.604158,-4.579860,-0.557735,-1.677558,-0.739790,1.736218,0.901630
0,-7.530271,21.225815,56.308102,-55.118356,1.707084,8.131573,16.509390,1.915660,-83.345346,-33.841147,...,-11.875677,-25.829769,-30.985395,2.929970,-76.970169,10.203374,-15.875692,9.545149,57.488785,-8.511469
0,-0.529738,2.037353,1.647381,-2.772957,1.940459,1.616843,-0.040073,0.942392,0.362945,-0.353878,...,0.189887,-1.439393,0.878836,-1.789950,0.246150,0.176463,-1.031042,-3.506571,5.128784,1.173734


In [215]:
RF_parameters_2 = {              
              'n_estimators': (5, 30, 100),
              'criterion': ('gini', 'entropy'),
              'max_depth': (5, 15, 50),
              }

MNB_parameters_2 = {
              'alpha': (0.1, 1.2),
              }

In [216]:
gs_RF_glove = GridSearchCV(RandomForestClassifier(), RF_parameters_2, n_jobs=-1, cv=3, scoring = 'f1_weighted')
gs_MNB_glove = GridSearchCV(MultinomialNB(), MNB_parameters_2, n_jobs=-1, cv=3, scoring = 'f1_weighted')

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_data_glove_scaled = scaler.fit_transform(train_data_glove)
test_data_glove_scaled = scaler.transform(test_data_glove)

gs_RF_glove = gs_RF_glove.fit(train_data_glove, twenty_train.target)
gs_MNB_glove = gs_MNB_glove.fit(train_data_glove_scaled, twenty_train.target)

In [219]:
predict_RF = gs_RF_glove.predict(test_data_glove)
predict_MNB = gs_MNB_glove.predict(test_data_glove_scaled)

In [220]:
print("Random Forest:")
print(gs_RF_glove.best_params_)
print(gs_RF_glove.best_score_)
print (confusion_matrix(twenty_test['target'], predict_RF))
print(classification_report(twenty_test['target'], predict_RF))

print("Multinomial NB:")
print(gs_MNB_glove.best_params_)
print(gs_MNB_glove.best_score_)
print (confusion_matrix(twenty_test['target'], predict_MNB))
print(classification_report(twenty_test['target'], predict_MNB))

Random Forest:
{'criterion': 'gini', 'max_depth': 50, 'n_estimators': 100}
0.8615516647340774
[[366  22   7]
 [ 17 345  35]
 [  7  70 322]]
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       395
           1       0.79      0.87      0.83       397
           2       0.88      0.81      0.84       399

    accuracy                           0.87      1191
   macro avg       0.87      0.87      0.87      1191
weighted avg       0.87      0.87      0.87      1191

Multinomial NB:
{'alpha': 0.1}
0.5799808813476864
[[359  33   3]
 [ 32 278  87]
 [ 16 231 152]]
              precision    recall  f1-score   support

           0       0.88      0.91      0.90       395
           1       0.51      0.70      0.59       397
           2       0.63      0.38      0.47       399

    accuracy                           0.66      1191
   macro avg       0.67      0.66      0.65      1191
weighted avg       0.67      0.66      0.65      1191



<table>
        <thead>
            <tr>
                <th>Метод</th>
                <th>Векторизация</th>
                <th>Параметры</th>
                <th>F1-score</th>
                <th>Accuracy</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Random Forest</td>
                <td>TF-IDF</td>
                <td><code>criterion='entropy', max_depth=15, n_estimators=100, use_idf=False, max_features=10000, stop_words='english'</code></td>
                <td>0.8757</td>
                <td>0.86</td>
            </tr>
            <tr>
                <td>MultinomialNB</td>
                <td>TF-IDF</td>
                <td><code>alpha=0.1, use_idf=True, max_features=10000, stop_words='english'</code></td>
                <td>0.9310</td>
                <td>0.93</td>
            </tr>
            <tr>
                <td>Random Forest</td>
                <td>GloVe</td>
                <td><code>criterion='gini', max_depth=50, n_estimators=100б stop_words='english'</code></td>
                <td>0.8616</td>
                <td>0.87</td>
            </tr>
            <tr>
                <td>MultinomialNB</td>
                <td>GloVe</td>
                <td><code>alpha=0.1б stop_words='english'</code></td>
                <td>0.5800</td>
                <td>0.66</td>
            </tr>
        </tbody>
    </table>