Предварительно про PyTorch:
* [Про тензоры в pytorch](https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/tensor_tutorial.ipynb)
* [Про автоматическое дифференцирование и что такое .backwards()](https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/autograd_tutorial.ipynb)
* [Очень простая нейронка на pytorch](https://colab.research.google.com/drive/1RsZvw4KBGn5U5Aj5Ak7OG2pHx6z1OSlF)

#Загрузка данных

In [None]:
!wget https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv

--2023-05-09 07:34:37--  https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1253562 (1.2M) [text/plain]
Saving to: ‘Constraint_Train.csv.1’


2023-05-09 07:34:37 (55.6 MB/s) - ‘Constraint_Train.csv.1’ saved [1253562/1253562]



In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('Constraint_Train.csv')

In [None]:
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


# Методы sklearn

## модели из лекции

In [None]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
sentences = [word_tokenize(text.lower()) for text in tqdm(df.tweet)] # токенизируем текст

100%|██████████| 6420/6420 [00:02<00:00, 2950.80it/s]


### __model__ (Word2vec + LogisticRegression)

In [None]:
from gensim.models.word2vec import Word2Vec # векторизуем тексты с помощью словарных эмбендингов
%time model_tweets = Word2Vec(sentences, workers=4, vector_size=300, min_count=3, window=5, epochs=15)

CPU times: user 8.48 s, sys: 45.1 ms, total: 8.52 s
Wall time: 4.64 s


In [None]:
model_tweets.wv.most_similar('vaccine') # проверка на адекватность - смотрим близкие слова к слову "вакцина"

[('cure', 0.7973955273628235),
 ('drug', 0.7808627486228943),
 ('scientists', 0.7612375617027283),
 ('developed', 0.7383346557617188),
 ('fight', 0.7292054295539856),
 ('pandemic', 0.7139097452163696),
 ('against', 0.7109015583992004),
 ('remedy', 0.7073735594749451),
 ('company', 0.7011401653289795),
 ('novel', 0.6907817125320435)]

In [None]:
model_tweets.wv.most_similar('quarantine') # проверка на адекватность - смотрим близкие слова к слову "карантин"

[('isolation', 0.945399820804596),
 ('managed', 0.9424483776092529),
 ('facility', 0.8861786127090454),
 ('auckland', 0.8855910301208496),
 ('facilities', 0.8798918128013611),
 ('cluster', 0.829450786113739),
 ('christchurch', 0.818962812423706),
 ('hospitals', 0.7934414744377136),
 ('border', 0.7895841598510742),
 ('none', 0.7835988998413086)]

In [None]:
model_tweets.init_sims() # нормируем вектора, чтобы они были в одинаковом пространстве

In [None]:
import numpy as np

In [None]:
def get_text_embedding(text):
    """создать эмбединг текста"""
    result = []
    for word in word_tokenize(text.lower()):
        if word in model_tweets.wv:
            result.append(model_tweets.wv[word])

    if len(result):
        result = np.sum(result, axis=0)
    else:
        result = np.zeros(300)
    return result

In [None]:
features = [get_text_embedding(text) for text in tqdm(df.tweet)] # вектора признаков

100%|██████████| 6420/6420 [00:02<00:00, 2287.67it/s]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, df.label, test_size=0.33, random_state=0)

In [None]:
model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

In [None]:
predicted = model.predict(X_test)

In [None]:
print(classification_report(y_test, predicted, digits = 3))

              precision    recall  f1-score   support

        fake      0.906     0.917     0.912      1018
        real      0.923     0.912     0.917      1101

    accuracy                          0.915      2119
   macro avg      0.914     0.915     0.914      2119
weighted avg      0.915     0.915     0.915      2119



### __model_1__ (CountVectorizer + LogisticRegression)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vec = CountVectorizer()

In [None]:
bow = vec.fit_transform(df.tweet)

In [None]:
X1_train, X1_test, y1_train, y1_test = train_test_split(bow, df.label, test_size=0.33, random_state=0)
model_1 = LogisticRegression(random_state=0)
model_1.fit(X1_train, y1_train)

In [None]:
predicted_1 = model_1.predict(X1_test)
print(classification_report(y1_test, predicted_1))

              precision    recall  f1-score   support

        fake       0.93      0.93      0.93      1018
        real       0.93      0.93      0.93      1101

    accuracy                           0.93      2119
   macro avg       0.93      0.93      0.93      2119
weighted avg       0.93      0.93      0.93      2119



## мои модели

### __model_1_clean__  без стоп-слов (CountVectorizer + LogisticRegression)

In [None]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
from string import punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
noise = stopwords.words('english') + list(punctuation)

In [None]:
vec_1 = CountVectorizer(stop_words=noise)
bow_1 = vec_1.fit_transform(df.tweet)
X2_train, X2_test, y2_train, y2_test = train_test_split(bow_1, df.label, test_size=0.33, random_state=0)
model_1_clean = LogisticRegression(random_state=0)
model_1_clean.fit(X2_train, y2_train)
predicted_2 = model_1_clean.predict(X2_test)
print(classification_report(y2_test, predicted_2))

              precision    recall  f1-score   support

        fake       0.93      0.94      0.93      1018
        real       0.94      0.93      0.94      1101

    accuracy                           0.94      2119
   macro avg       0.94      0.94      0.94      2119
weighted avg       0.94      0.94      0.94      2119



### __ppl_clf_1__  (CountVectorizer + TfidfTransformer + TruncatedSVD + LogisticRegression)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6420 entries, 0 to 6419
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      6420 non-null   int64 
 1   tweet   6420 non-null   object
 2   label   6420 non-null   object
dtypes: int64(1), object(2)
memory usage: 150.6+ KB


In [None]:
df.label.value_counts(dropna=False) # смотрим сбалансированность данных

real    3360
fake    3060
Name: label, dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
le.fit(df.label)

In [None]:
le.transform( ['real', 'fake']) # пример расшифровки

array([1, 0])

In [None]:
y = pd.Series (le.transform(df.label))
y.head(3)

0    1
1    1
2    0
dtype: int64

In [None]:
X=df.tweet

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [None]:
print(df.shape[0])
print(X_train.shape)
print(X_test.shape)

6420
(4301,)
(2119,)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix

In [None]:
ppl_clf_1 = Pipeline([
    ('vect', CountVectorizer(stop_words=noise)),
    ('tfidf', TfidfTransformer()),
    ('tm', TruncatedSVD(n_components=100)),
    ('clf', LogisticRegression(random_state=0))])

In [None]:
ppl_clf_1.fit(X_train, y_train)

In [None]:
pred_ppl_clf_1 = ppl_clf_1.predict(X_test)
print(classification_report(pred_ppl_clf_1, y_test, target_names=['fake', 'real']))

              precision    recall  f1-score   support

        fake       0.87      0.91      0.89       980
        real       0.92      0.89      0.90      1139

    accuracy                           0.90      2119
   macro avg       0.89      0.90      0.89      2119
weighted avg       0.90      0.90      0.90      2119



In [None]:
ppl_clf_1.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'tm', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'tm__algorithm', 'tm__n_components', 'tm__n_iter', 'tm__n_oversamples', 'tm__power_iteration_normalizer', 'tm__random_state', 'tm__tol', 'clf__C', 'clf__class_weight', 'clf__dual', 'clf__fit_intercept', 'clf__intercept_scaling', 'clf__l1_ratio', 'clf__max_iter', 'clf__multi_class', 'clf__n_jobs', 'clf__penalty', 'clf__random_state', 'clf__solver', 'clf__tol', 'clf__verbose', 'clf__warm_start'])

Далее подбираю оптимальные параметры, делаю это частями, иначе слишком долго отрабатывается код (более 2.5часов на GPU), колаб постоянно меня выкидывает

In [None]:
hyperparams_1 = {
    'vect__ngram_range': [(1, 1), (2, 2), (3, 3)],
    'vect__max_df': [0.85, 0.95],
    'vect__min_df': [1,2,3,4],
    'vect__stop_words': [None, noise],
    'vect__max_features': [None, 10, 100, 500, 1000, 5000, 10000],
    'vect__strip_accents': ['ascii', 'unicode', None],
}

grid1 = GridSearchCV(ppl_clf_1, hyperparams_1, cv=4, n_jobs=-1).fit(X_train, y_train)
print('Оптимальные параметры для grid1:')
print(grid1.best_score_, grid1.best_params_)

576 fits failed out of a total of 4032.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
576 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/usr/local/lib/python3.10/dist-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/usr/l

Оптимальные параметры для grid1:
0.9081604996974151 {'vect__max_df': 0.95, 'vect__max_features': None, 'vect__min_df': 3, 'vect__ngram_range': (1, 1), 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'ther

In [None]:
noise_cor = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

In [None]:
ppl_clf_1 = Pipeline([
    ('vect', CountVectorizer(max_df=0.95, max_features=None, min_df=3, ngram_range=(1, 1), stop_words=noise_cor, strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('tm', TruncatedSVD(n_components=100)),
    ('clf', LogisticRegression(random_state=0))])

In [None]:
ppl_clf_1.fit(X_train, y_train)

In [None]:
pred_ppl_clf_1 = ppl_clf_1.predict(X_test)
print(classification_report(pred_ppl_clf_1, y_test, target_names=['fake', 'real']))

              precision    recall  f1-score   support

        fake       0.89      0.90      0.89      1009
        real       0.90      0.90      0.90      1110

    accuracy                           0.90      2119
   macro avg       0.90      0.90      0.90      2119
weighted avg       0.90      0.90      0.90      2119



In [None]:
hyperparams_1 = {
    'tfidf__norm': ['l1', 'l2', None],
    'clf__class_weight':[None, 'balanced'],
    'clf__penalty':[None, 'l2', 'l1', 'elasticnet'],
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

grid1 = GridSearchCV(ppl_clf_1, hyperparams_1, cv=4, n_jobs=-1).fit(X_train, y_train)
print('Оптимальные параметры для grid1:')
print(grid1.best_score_, grid1.best_params_)

216 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1216, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 1223, in _fit_liblinear
   

Оптимальные параметры для grid1:
0.915599982709432 {'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'lbfgs', 'tfidf__norm': None}


In [None]:
ppl_clf_1 = Pipeline([
    ('vect', CountVectorizer(max_df=0.95, max_features=None, min_df=3, ngram_range=(1, 1), stop_words=noise_cor, strip_accents='unicode')),
    ('tfidf', TfidfTransformer(norm=None)),
    ('tm', TruncatedSVD(n_components=100)),
    ('clf', LogisticRegression(class_weight='balanced', penalty='l2', solver='lbfgs', random_state=0))])

In [None]:
ppl_clf_1.fit(X_train, y_train)

In [None]:
pred_ppl_clf_1 = ppl_clf_1.predict(X_test)
print(classification_report(pred_ppl_clf_1, y_test, target_names=['fake', 'real'], digits = 3))

              precision    recall  f1-score   support

        fake      0.924     0.896     0.910      1050
        real      0.901     0.928     0.914      1069

    accuracy                          0.912      2119
   macro avg      0.913     0.912     0.912      2119
weighted avg      0.913     0.912     0.912      2119



### RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

from tqdm import tqdm

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
models=[
      {'name':'Lr',"model": LogisticRegression(random_state=0),
       'params':
              {'C':np.linspace(0, 10, 5),
                'penalty':['l1', 'l2', 'elasticnet', 'none'],
                'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
              }
      },
      {'name':'SVC',"model": SVC(random_state=0),
       'params':
              {'C': np.linspace(0, 10, 5),
               'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
               'degree':[1,2,3,5]
               }
       },
      {'name':'RF',"model": RandomForestClassifier(random_state=0),
       'params':
              {'n_estimators':[10,25,50,100,150,200],
               'criterion':['gini', 'entropy'],
               'max_depth':[3,5,7,9,11],
               'min_samples_leaf':[1,2,3,5]
               }
       },
      {'name':'KNN',"model": KNeighborsClassifier(),
       'params':
              {'n_neighbors':list(range(1,30)),
               'weights': ['uniform', 'distance'],
               'p':[1,2,3],
               'metric':['euclidean', 'minkowski']
               }
       },
      {'name':'DT',"model": DecisionTreeClassifier(random_state=0),
       'params':
              {'criterion':['gini', 'entropy'],
               'max_depth':[3,5,7,9,11],
               'min_samples_split':[2,3,4,5,7,9],
               'min_samples_leaf':[1,2,3,5]
               }
       }
]

In [None]:
%%time


res = []
for v in tqdm(models):
    res.append((v['name'], RandomizedSearchCV(v['model'], v['params'],cv=4, random_state=0).fit(X2_train, y2_train)))

100%|██████████| 5/5 [03:07<00:00, 37.49s/it]

CPU times: user 3min, sys: 1min 11s, total: 4min 11s
Wall time: 3min 7s





In [None]:
for r in res:
    print(r[0], r[1].best_score_, r[1].best_params_)

Lr 0.9269942508861415 {'solver': 'lbfgs', 'penalty': 'l2', 'C': 2.5}
SVC 0.9246684533586929 {'kernel': 'rbf', 'degree': 5, 'C': 2.5}
RF 0.8890939742370537 {'n_estimators': 200, 'min_samples_leaf': 2, 'max_depth': 11, 'criterion': 'entropy'}
KNN 0.7551726895478517 {'weights': 'distance', 'p': 2, 'n_neighbors': 8, 'metric': 'euclidean'}
DT 0.8667733638800035 {'min_samples_split': 3, 'min_samples_leaf': 3, 'max_depth': 11, 'criterion': 'gini'}


#### model_1_clean_1 (CountVectorizer + SVC)

In [None]:
model_1_clean_1 = SVC(kernel='rbf', degree=5, C=2.5, random_state=0)
model_1_clean_1.fit(X2_train, y2_train)
predicted_3 = model_1_clean_1.predict(X2_test)
print(classification_report(y2_test, predicted_3))

              precision    recall  f1-score   support

        fake       0.94      0.92      0.93      1018
        real       0.93      0.95      0.94      1101

    accuracy                           0.94      2119
   macro avg       0.94      0.94      0.94      2119
weighted avg       0.94      0.94      0.94      2119



## сводная таблица по моделям

In [None]:
report = classification_report(y_test, predicted, output_dict=True)
df_r = pd.DataFrame(report)
test=pd.Series(round(df_r[2:3]['macro avg'],3), name='model (Word2vec + LogisticRegression)')
test=pd.DataFrame(test)

In [None]:
report_1 = classification_report(y1_test, predicted_1, output_dict=True)
df_r_1 = pd.DataFrame(report_1)
test_1=pd.Series(round(df_r_1[2:3]['macro avg'],3), name='model_1 (CountVectorizer + LogisticRegression)')
test_1=pd.DataFrame(test_1)

In [None]:
report_2 = classification_report(y2_test, predicted_2, output_dict=True)
df_r_2 = pd.DataFrame(report_2)
test_2=pd.Series(round(df_r_2[2:3]['macro avg'],3), name='model_1_clean без стоп-слов (CountVectorizer + LogisticRegression)')
test_2=pd.DataFrame(test_2)

In [None]:
report_3 = classification_report(y_test, pred_ppl_clf_1, output_dict=True)
df_r_3 = pd.DataFrame(report_3)
test_3=pd.Series(round(df_r_3[2:3]['macro avg'],3), name='ppl_clf_1 (CountVectorizer + TfidfTransformer + TruncatedSVD + LogisticRegression)')
test_3=pd.DataFrame(test_3)

In [None]:
report_4 = classification_report(y2_test, predicted_3, output_dict=True, digits = 3)
df_r_4 = pd.DataFrame(report_4)
test_4=pd.Series(round(df_r_4[2:3]['macro avg'],3), name='model_1_clean_1 (CountVectorizer + SVC)')
test_4=pd.DataFrame(test_4)

In [None]:
test_concat = pd.concat([test, test_1, test_2,test_3, test_4], axis=1)

In [None]:
test_concat.style.format('{:.3f}').highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,model (Word2vec + LogisticRegression),model_1 (CountVectorizer + LogisticRegression),model_1_clean без стоп-слов (CountVectorizer + LogisticRegression),ppl_clf_1 (CountVectorizer + TfidfTransformer + TruncatedSVD + LogisticRegression),model_1_clean_1 (CountVectorizer + SVC)
f1-score,0.914,0.93,0.936,0.912,0.936


# Методы на PyTorch

## модель из лекции (PyTorch + LSTM)

In [None]:
labels = (df.label == 'real').astype(int).to_list() # переводим метки в числа

In [None]:
labels[:5]

[1, 1, 0, 1, 1]

Нужно заранее задать размер для макксимальной длины предложений.

In [None]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [None]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import numpy as np

In [None]:
token_lists = [word_tokenize(text.lower()) for text in df.tweet] # токенизируем тексты
max_len = len(max(token_lists, key=len))

In [None]:
max_len

1592

Это слишком много. Но какая длина обычно?

In [None]:
from collections import Counter
fd = Counter([len(tokens) for tokens in token_lists]) # смотрим какие длины чаще встречаются

In [None]:
fd.most_common(10)

[(20, 178),
 (25, 174),
 (22, 170),
 (18, 170),
 (19, 168),
 (21, 168),
 (16, 163),
 (17, 162),
 (15, 160),
 (23, 156)]

Зададим максимум 200.

Возьмём те же w2v эмбеддинги.

In [None]:
sentences = [word_tokenize(text.lower()) for text in tqdm(df.tweet)]

100%|██████████| 6420/6420 [00:01<00:00, 3311.59it/s]


In [None]:
from gensim.models.word2vec import Word2Vec
%time model_tweets = Word2Vec(sentences, workers=4, vector_size=300, min_count=3, window=5, epochs=15)

CPU times: user 10.3 s, sys: 88.8 ms, total: 10.3 s
Wall time: 7.08 s


In [None]:
def get_word_embedding(tokens, max_len):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in model_tweets.wv:
                result.append(model_tweets.wv[word])
            else:
                result.append(np.zeros(300))
        else:
            result.append(np.zeros(300))
    return result

In [None]:
features = [get_word_embedding(text, 200) for text in tqdm(token_lists)] # список векторов каждого слова в тексте

100%|██████████| 6420/6420 [00:04<00:00, 1560.19it/s]


In [None]:
features[:2]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
len(features[0][0])

300

In [None]:
len(X_train)

4301

In [None]:
len(X_train[0])

200

In [None]:
len(X_train[0][0])

300

In [None]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

  in_data = torch.tensor(X_train).float()


In [None]:
in_data.shape

torch.Size([4301, 200, 300])

In [None]:
# Сети с долговременной кратковременной памятью (LSTM) представляют собой особый вид RNN, которые способны изучать долгосрочные зависимости.
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(300, 100) # на вход - вектор размерностью 300, скрытый слой задали 100
        self.out = nn.Linear(100, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1)) # longterm - линейный слой долгосрочной памяти
        prediction = torch.sigmoid(self.out(longterm))
        return prediction


net = Net()
print(net)

Net(
  (lstm): LSTM(300, 100)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [None]:
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCELoss()

In [None]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [None]:
train_one_epoch(in_data, targets)

100%|██████████| 269/269 [05:13<00:00,  1.16s/it]

tensor(0.6808, grad_fn=<BinaryCrossEntropyBackward0>)





Что получилось?

In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [None]:
result = (output > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.5002359603586597

Но такую модель надо учить дольше(

## оптимизация модели из лекции

### увеличиваем количество эпох до 10

In [None]:
for i in range(10):
  train_one_epoch(in_data, targets)

100%|██████████| 269/269 [05:09<00:00,  1.15s/it]


tensor(0.6796, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:20<00:00,  1.19s/it]


tensor(0.6793, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:35<00:00,  1.25s/it]


tensor(0.6793, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:07<00:00,  1.14s/it]


tensor(0.6793, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:20<00:00,  1.19s/it]


tensor(0.6793, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:15<00:00,  1.17s/it]


tensor(0.6793, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:02<00:00,  1.13s/it]


tensor(0.6793, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:04<00:00,  1.13s/it]


tensor(0.6793, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:05<00:00,  1.14s/it]


tensor(0.6793, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:01<00:00,  1.12s/it]

tensor(0.6793, grad_fn=<BinaryCrossEntropyBackward0>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [None]:
result = (output > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.5002359603586597

### заменой оптимизатора на Adam

Замена оптимизатора SGD на оптимизатор __Rectified Adam PyTorch__.

Исправленный оптимизатор Адама Pytorch — это альтернатива оптимизатору Адама, который пытается решить проблему плохой сходимости Адама.
Он также используется для исправления изменений скорости адаптивного обучения.

In [None]:
optimizer_RAdam = optim.RAdam(net.parameters(),lr=0.001,betas=(0.9,0.999),eps=1e-08,weight_decay=0)

In [None]:
criterion = nn.BCELoss()

In [None]:
def train_one_epoch_RAdam(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer_RAdam.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer_RAdam.step()
    print(loss)

In [None]:
for i in range(10):
  train_one_epoch_RAdam(in_data, targets)

100%|██████████| 269/269 [03:06<00:00,  1.45it/s]


tensor(0.6912, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [03:28<00:00,  1.29it/s]


tensor(0.6911, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [03:48<00:00,  1.18it/s]


tensor(0.6911, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [03:55<00:00,  1.14it/s]


tensor(0.6910, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [03:53<00:00,  1.15it/s]


tensor(0.6909, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [03:54<00:00,  1.15it/s]


tensor(0.6909, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [03:56<00:00,  1.14it/s]


tensor(0.6909, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [03:55<00:00,  1.14it/s]


tensor(0.6908, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [03:56<00:00,  1.14it/s]


tensor(0.6908, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [03:57<00:00,  1.13it/s]

tensor(0.6908, grad_fn=<BinaryCrossEntropyBackward0>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [None]:
result = (output > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.5252477583765928

### добавление скрытых слоёв

In [None]:
# Добавим еще два скрытых слоя
class Net_1(nn.Module):

    def __init__(self):
        super(Net_1, self).__init__()
        self.lstm = nn.LSTM(300, 100)
        self.Linear = nn.Linear(100, 100)
        self.Linear = nn.Linear(100, 100)
        self.out = nn.Linear(100, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction


net_1 = Net_1()
print(net_1)

Net_1(
  (lstm): LSTM(300, 100)
  (Linear): Linear(in_features=100, out_features=100, bias=True)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [None]:
optimizer_RAdam_1 = optim.RAdam(net_1.parameters(),lr=0.001,betas=(0.9,0.999),eps=1e-08,weight_decay=0)

In [None]:
criterion = nn.BCELoss()

In [None]:
def train_one_epoch_RAdam_1(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer_RAdam_1.zero_grad()
        output = net_1(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer_RAdam_1.step()
    print(loss)

In [None]:
#for i in range(10): # включился лимит на ускоритель, поэтому делаю 1 эпоху, десять всё равно мало, для видимого результата надо добавлять больше 10 эпох.
train_one_epoch_RAdam_1(in_data, targets)

100%|██████████| 269/269 [05:27<00:00,  1.22s/it]

tensor(0.7120, grad_fn=<BinaryCrossEntropyBackward0>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net_1(in_data_test).reshape(-1)

In [None]:
result = (output > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.5214723926380368

### добавление функции активации скрытого слоя

In [None]:
# Добавим функцию активации скрытого слоя ReLU
class Net_1(nn.Module):

    def __init__(self):
        super(Net_1, self).__init__()
        self.lstm = nn.LSTM(300, 100)
        self.Linear = nn.Linear(100, 100)
        self.ReLU=nn.ReLU()
        self.Linear = nn.Linear(100, 100)
        self.ReLU=nn.ReLU()
        self.out = nn.Linear(100, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction


net_1 = Net_1()
print(net_1)

Net_1(
  (lstm): LSTM(300, 100)
  (Linear): Linear(in_features=100, out_features=100, bias=True)
  (ReLU): ReLU()
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [None]:
optimizer_RAdam_1 = optim.RAdam(net_1.parameters(),lr=0.001,betas=(0.9,0.999),eps=1e-08,weight_decay=0)

In [None]:
criterion = nn.BCELoss()

In [None]:
def train_one_epoch_RAdam_1(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer_RAdam_1.zero_grad()
        output = net_1(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer_RAdam_1.step()
    print(loss)

In [None]:
for i in range(10):
  train_one_epoch_RAdam_1(in_data, targets)

100%|██████████| 269/269 [05:13<00:00,  1.17s/it]


tensor(0.7119, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:19<00:00,  1.19s/it]


tensor(0.7126, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:30<00:00,  1.23s/it]


tensor(0.7122, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:32<00:00,  1.24s/it]


tensor(0.7117, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:23<00:00,  1.20s/it]


tensor(0.7110, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:14<00:00,  1.17s/it]


tensor(0.7105, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:03<00:00,  1.13s/it]


tensor(0.7100, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:05<00:00,  1.14s/it]


tensor(0.7097, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:40<00:00,  1.26s/it]


tensor(0.7092, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [06:04<00:00,  1.36s/it]

tensor(0.7088, grad_fn=<BinaryCrossEntropyBackward0>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net_1(in_data_test).reshape(-1)

In [None]:
result = (output > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.5214723926380368

### + 10 эпох

In [None]:
for i in range(10):
  train_one_epoch_RAdam_1(in_data, targets)

100%|██████████| 269/269 [05:57<00:00,  1.33s/it]


tensor(0.7085, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:34<00:00,  1.24s/it]


tensor(0.7083, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [06:22<00:00,  1.42s/it]


tensor(0.7080, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [06:01<00:00,  1.34s/it]


tensor(0.7078, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:55<00:00,  1.32s/it]


tensor(0.7076, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:51<00:00,  1.31s/it]


tensor(0.7074, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [05:45<00:00,  1.28s/it]


tensor(0.7074, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [06:04<00:00,  1.35s/it]


tensor(0.7071, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [06:08<00:00,  1.37s/it]


tensor(0.7070, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [06:31<00:00,  1.46s/it]

tensor(0.7069, grad_fn=<BinaryCrossEntropyBackward0>)





In [None]:
with torch.no_grad():
    output = net_1(in_data_test).reshape(-1)

In [None]:
result = (output > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.5214723926380368