<a href="https://colab.research.google.com/github/AlexBorealis/netology_nlp_hw/blob/master/kozlov_nlp_hw_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Классификация текстов

## Fakenews

1. Мы будем работать с данными fakenews отсюда: https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv
2. Проведите препроцессинг текста. Разбейте данные на train и test для задачи классификации.
3. Векторизуйте.
4. Обучите на полученных векторах алгоритм классификации.

Мы уже видели, как эта задача решается с помощью Word2Vec. Давайте вспомним.

In [None]:
import pandas as pd

In [None]:
# Prefer the local copy; if it is missing (e.g. the notebook was opened in
# Colab), fall back to the public source named in the task description.
DATA_PATH = 'hw_1/data/Constraint_Train.csv'
DATA_URL = 'https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    df = pd.read_csv(DATA_URL)

In [None]:
df.head(10)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real
5,6,"Covid Act Now found ""on average each person in...",real
6,7,If you tested positive for #COVID19 and have n...,real
7,8,Obama Calls Trump’s Coronavirus Response A Cha...,fake
8,9,"???Clearly, the Obama administration did not l...",fake
9,10,Retraction—Hydroxychloroquine or chloroquine w...,fake


In [None]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [None]:
import nltk
# Fetch the 'punkt' and 'punkt_tab' NLTK resources before tokenization starts
# (presumably the data word_tokenize relies on — confirm against the NLTK version in use).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Lower-case each tweet and split it into tokens; tqdm reports progress.
sentences = list(map(lambda tweet: word_tokenize(tweet.lower()), tqdm(df.tweet)))

100%|██████████| 6420/6420 [00:01<00:00, 3468.79it/s]


In [None]:
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText

In [None]:
# Embedding dimensionality shared by both embedding models trained in this notebook.
vector_size = 250
# Train Word2Vec on the tokenized tweets and time the call.
%time model_tweets = Word2Vec(sentences, workers=8, vector_size=vector_size, min_count=5, window=10, sg=1, epochs=20)

CPU times: total: 1min 19s
Wall time: 16 s


In [None]:
# FastText on the same sentences with the same settings as the Word2Vec call, except epochs=10.
%time model_tweets_ft = FastText(sentences, workers=8, vector_size=vector_size, min_count=5, window=10, sg=1, epochs=10)

CPU times: total: 1min 25s
Wall time: 23.1 s


In [None]:
model_tweets.wv.most_similar('obama')

[('barack', 0.8560135960578918),
 ('harris', 0.6738513708114624),
 ('nancy', 0.637886643409729),
 ('speaker', 0.6353716850280762),
 ('funded', 0.622643768787384),
 ('rodrigo', 0.6172960996627808),
 ('pelosi', 0.6161788702011108),
 ('downplayed', 0.6131364107131958),
 ('melinda', 0.6025975942611694),
 ('knew', 0.5987945199012756)]

In [None]:
model_tweets_ft.wv.most_similar('obama')

[('barack', 0.9069386124610901),
 ('pelosi', 0.8519723415374756),
 ('downplayed', 0.8374764323234558),
 ('biological', 0.8362008333206177),
 ('2017', 0.8270395994186401),
 ('debt', 0.8146917223930359),
 ('xi', 0.8114641308784485),
 ('joe', 0.8105072379112244),
 ('nancy', 0.81026691198349),
 ('hoax', 0.8028159737586975)]

In [None]:
import numpy as np

In [None]:
def get_text_embedding(text, model=None):
    """Embed a text as the sum of the vectors of its in-vocabulary tokens.

    Args:
        text: Raw string; it is lower-cased and tokenized with ``word_tokenize``.
        model: Trained embedding model exposing ``model.wv`` (membership test,
            item lookup and a ``vector_size`` attribute).

    Returns:
        1-D NumPy array of length ``model.wv.vector_size``. When none of the
        tokens is known to the model, a zero vector of that length is returned.
    """
    keyed_vectors = model.wv
    found = [
        keyed_vectors[word]
        for word in word_tokenize(text.lower())
        if word in keyed_vectors
    ]

    if not found:
        # Size the fallback from the model itself rather than from the
        # notebook-level `vector_size`, so both branches always agree in shape.
        return np.zeros(keyed_vectors.vector_size)
    return np.sum(found, axis=0)

In [None]:
# One summed-embedding vector per tweet, computed once for each embedding model.
features_w2v = list(map(lambda tweet: get_text_embedding(tweet, model=model_tweets), tqdm(df.tweet)))
features_ft = list(map(lambda tweet: get_text_embedding(tweet, model=model_tweets_ft), tqdm(df.tweet)))

100%|██████████| 6420/6420 [00:02<00:00, 2167.34it/s]
100%|██████████| 6420/6420 [00:04<00:00, 1332.20it/s]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Hold out a quarter of the Word2Vec features for evaluation (seeded split),
# then fit logistic regression on the remaining part.
X_train, X_test, y_train, y_test = train_test_split(features_w2v, df.label, test_size=1/4, random_state=42)
model = LogisticRegression(max_iter=2000, tol=1e-6)
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print the classification report for it.
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.93      0.93      0.93       768
        real       0.94      0.93      0.93       837

    accuracy                           0.93      1605
   macro avg       0.93      0.93      0.93      1605
weighted avg       0.93      0.93      0.93      1605



In [None]:
# Same seeded split and classifier settings, now on the FastText features.
# Note: X_train/X_test/y_train/y_test and `model` are rebound here.
X_train, X_test, y_train, y_test = train_test_split(features_ft, df.label, test_size=1/4, random_state=42)
model = LogisticRegression(max_iter=2000, tol=1e-6)
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print the classification report for it.
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.92      0.93      0.93       768
        real       0.94      0.92      0.93       837

    accuracy                           0.93      1605
   macro avg       0.93      0.93      0.93      1605
weighted avg       0.93      0.93      0.93      1605



###  Что будет, если использовать самый наивный метод?

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# CountVectorizer constructed with its default settings.
vec = CountVectorizer()

In [None]:
# NOTE(review): the vectorizer is fitted on all tweets before the train/test
# split that follows, so the vocabulary also covers the test tweets —
# confirm this is acceptable for the comparison.
bow = vec.fit_transform(df.tweet)

In [None]:
# Same seeded split and classifier settings, now on the bag-of-words matrix.
X_train, X_test, y_train, y_test = train_test_split(bow, df.label, test_size=1/4, random_state=42)
model = LogisticRegression(max_iter=2000, tol=1e-6)
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print the classification report for it.
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.91      0.92      0.92       768
        real       0.93      0.92      0.92       837

    accuracy                           0.92      1605
   macro avg       0.92      0.92      0.92      1605
weighted avg       0.92      0.92      0.92      1605



Конечно, мы всегда можем поиграться с предобработкой.

### PyTorch + LSTM

In [None]:
# Binary targets for the network: 1 when the label is 'real', otherwise 0.
labels = [int(is_real) for is_real in (df.label == 'real')]

Нужно заранее задать максимальную длину предложений.

In [None]:
# Tokenize every lower-cased tweet, then take the length of the longest token list.
token_lists = list(map(lambda tweet: word_tokenize(tweet.lower()), df.tweet))
max_len = max(map(len, token_lists))

In [None]:
max_len

1592

Это слишком много. Но какая длина обычно?

In [None]:
from collections import Counter
# Frequency table: tweet length in tokens -> number of tweets with that length.
fd = Counter(map(len, token_lists))

In [None]:
fd.most_common(10)

[(20, 178),
 (25, 174),
 (22, 170),
 (18, 170),
 (19, 168),
 (21, 168),
 (16, 163),
 (17, 162),
 (15, 160),
 (23, 156)]

Зададим максимум 200.

Возьмём те же w2v эмбеддинги.

In [None]:
def get_word_embedding(tokens, model=None, max_len=None):
    """Turn a token list into a fixed-length sequence of word vectors.

    Args:
        tokens: List of already-tokenized words.
        model: Trained embedding model exposing ``model.wv`` (membership test,
            item lookup and a ``vector_size`` attribute).
        max_len: Number of vectors to return. Longer inputs are truncated and
            shorter ones are right-padded with zero vectors. ``None`` keeps the
            sequence at its own length (no truncation, no padding).

    Returns:
        List of ``max_len`` 1-D arrays of length ``model.wv.vector_size``.
        Out-of-vocabulary tokens and padding positions are zero vectors.
    """
    keyed_vectors = model.wv
    # Take the dimensionality from the model rather than from the
    # notebook-level `vector_size`, so any model yields consistent shapes.
    dim = keyed_vectors.vector_size

    if max_len is None:
        max_len = len(tokens)
    # A negative length yields an empty result, as range() would.
    max_len = max(max_len, 0)

    result = [
        keyed_vectors[word] if word in keyed_vectors else np.zeros(dim)
        for word in tokens[:max_len]
    ]
    # Right-pad with fresh zero vectors up to the requested length.
    result.extend(np.zeros(dim) for _ in range(max_len - len(result)))
    return result

In [None]:
# Per-token embedding sequences for every tweet, each with exactly 200 positions.
# Note: `features_w2v` / `features_ft` are rebound here; earlier in the notebook
# they held one summed vector per tweet.
# NOTE(review): `features_ft` is not referenced again in the visible part of
# the notebook — confirm it is needed, since it is large.
features_w2v = [get_word_embedding(text, model=model_tweets, max_len=200) for text in tqdm(token_lists)]
features_ft = [get_word_embedding(text, model=model_tweets_ft, max_len=200) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:03<00:00, 2044.01it/s]
100%|██████████| 6420/6420 [00:05<00:00, 1253.21it/s]


In [None]:
# Seed the split like the earlier splits in this notebook, so that the LSTM
# accuracy reported at the end can be reproduced on a re-run.
X_train, X_test, y_train, y_test = train_test_split(features_w2v, labels, test_size=1/4, random_state=42)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
len(features_w2v[0][0])

250

In [None]:
len(X_train)

4815

In [None]:
len(X_train[0])

200

In [None]:
len(X_train[0][0])

250

In [None]:
class Net(nn.Module):
    """Single-layer LSTM followed by a linear + sigmoid head for binary classification.

    Args:
        input_size: Dimensionality of each input vector. ``None`` (default)
            uses the notebook-level ``vector_size``.
        hidden_size: Number of LSTM hidden units (100 by default).
    """

    def __init__(self, input_size=None, hidden_size=100):
        super().__init__()
        if input_size is None:
            input_size = vector_size
        # batch_first=True lets forward() consume (batch, seq, features) directly.
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one probability per sequence.

        Args:
            x: Float tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (1, batch, 1) with values in [0, 1]; callers
            flatten it with ``reshape(-1)``.
        """
        # The LSTM returns (outputs, (h_n, c_n)); the head reads the final
        # cell state c_n.
        # NOTE(review): sequences in this notebook are right-padded with zero
        # vectors, so the final state is taken after the padding steps —
        # consider packing by true length.
        _, (_, cell_state) = self.lstm(x)
        return torch.sigmoid(self.out(cell_state))


# Seed PyTorch before building the model so the initial weights are the same on every run.
torch.manual_seed(42)
net = Net()
print(net)

Net(
  (lstm): LSTM(250, 100)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [None]:
# X_train is a list of lists of NumPy vectors. Stacking it into one float32
# array first avoids PyTorch's slow conversion of nested lists of ndarrays
# (and the warning that conversion emits); from_numpy then wraps the array
# without another copy.
in_data = torch.from_numpy(np.asarray(X_train, dtype=np.float32))
targets = torch.tensor(y_train).float()

  in_data = torch.tensor(X_train).float()


In [None]:
in_data.shape

torch.Size([4815, 200, 250])

In [None]:
# SGD with Nesterov momentum over all parameters of `net`.
optimizer = optim.SGD(net.parameters(), lr=.001, momentum=.9, nesterov=True)
# Binary cross-entropy on probabilities: Net.forward already applies the sigmoid.
criterion = nn.BCELoss()

In [None]:
def train_lstm(in_data, targets, epoch=10, batch_size=16):
    """Train the notebook-level ``net`` with mini-batch gradient steps.

    Uses the module-level ``net``, ``optimizer`` and ``criterion``. Batches are
    taken in the stored order; nothing is shuffled here.

    Args:
        in_data: Input tensor whose first dimension indexes the samples.
        targets: 1-D float tensor of targets aligned with ``in_data``.
        epoch: Number of passes over the data.
        batch_size: Number of samples per optimizer step.

    Prints the mean training loss of every second epoch (0, 2, 4, ...).
    """
    n_samples = in_data.shape[0]
    for e in range(epoch):
        loss_sum = 0.0
        for i in tqdm(range(0, n_samples, batch_size)):
            batch_x = in_data[i:i + batch_size]
            batch_y = targets[i:i + batch_size]
            optimizer.zero_grad()
            output = net(batch_x)
            loss = criterion(output.reshape(-1), batch_y)
            loss.backward()
            optimizer.step()
            # The criterion averages over the batch; weight by the batch size
            # because the last batch may be shorter.
            loss_sum += loss.item() * batch_x.shape[0]
        # Report the epoch average rather than the last batch's loss, which is
        # noisy; the n_samples guard also covers empty input, where no loss
        # was ever computed.
        if e % 2 == 0 and n_samples:
            print(f'epoch {e}: mean loss {loss_sum / n_samples:.4f}')

In [None]:
train_lstm(in_data, targets, batch_size=8)

100%|██████████| 602/602 [03:15<00:00,  3.08it/s]


tensor(0.6835, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 602/602 [02:56<00:00,  3.41it/s]
100%|██████████| 602/602 [03:01<00:00,  3.32it/s]


tensor(0.6827, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 602/602 [03:27<00:00,  2.90it/s]
100%|██████████| 602/602 [03:25<00:00,  2.94it/s]


tensor(0.6826, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 602/602 [03:24<00:00,  2.94it/s]
100%|██████████| 602/602 [03:24<00:00,  2.94it/s]


tensor(0.6826, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 602/602 [03:33<00:00,  2.82it/s]
100%|██████████| 602/602 [03:30<00:00,  2.87it/s]


tensor(0.6825, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 602/602 [03:29<00:00,  2.87it/s]


Что получилось?

In [None]:
# Stack the nested list of NumPy vectors into one float32 array before handing
# it to PyTorch; converting the nested list directly is very slow.
in_data_test = torch.from_numpy(np.asarray(X_test, dtype=np.float32))
targets_test = torch.tensor(y_test).float()

In [None]:
# Inference only: run the whole test set through the network without gradient
# tracking and flatten the result to one score per sample.
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [None]:
# Threshold the scores at 0.5 and mark which predictions match the targets.
predicted_real = output > 0.5
result = torch.eq(predicted_real, targets_test)

In [None]:
# Test accuracy: share of correct predictions, rounded to three decimals.
round(int(result.sum()) / result.shape[0], 3)

0.549