# Сделать классификацию данных fakenews
Используя ноутбук занятия (также размещен в папке Materials) и данные fakenews, 3 раза разными способами получить на задаче классификации значение f1 выше 0.91 для методов на sklearn и выше 0.52 для методов на pytorch.

In [1]:
!wget https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv

--2022-06-05 10:09:30--  https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1253562 (1.2M) [text/plain]
Saving to: ‘Constraint_Train.csv’


2022-06-05 10:09:30 (123 MB/s) - ‘Constraint_Train.csv’ saved [1253562/1253562]



In [2]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import nltk
nltk.download('punkt')
from gensim.models.word2vec import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
df = pd.read_csv('Constraint_Train.csv')
df.head(10)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real
5,6,"Covid Act Now found ""on average each person in...",real
6,7,If you tested positive for #COVID19 and have n...,real
7,8,Obama Calls Trump’s Coronavirus Response A Cha...,fake
8,9,"???Clearly, the Obama administration did not l...",fake
9,10,Retraction—Hydroxychloroquine or chloroquine w...,fake


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6420 entries, 0 to 6419
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      6420 non-null   int64 
 1   tweet   6420 non-null   object
 2   label   6420 non-null   object
dtypes: int64(1), object(2)
memory usage: 150.6+ KB


In [5]:
sentences = [word_tokenize(text.lower()) for text in tqdm(df['tweet'])]

100%|██████████| 6420/6420 [00:01<00:00, 3575.25it/s]


In [6]:
model_tweets = Word2Vec(sentences, workers=4, size=300, min_count=3, window=5, iter=15)

In [7]:
model_tweets.init_sims()

In [8]:
def get_text_embedding(text):
  result = []
  for word in word_tokenize(text.lower()):
    if word in model_tweets.wv:
      result.append(model_tweets.wv[word])
  if len(result):
    result = np.sum(result, axis = 0)
  else:
    result = np.zeros(300)
  return result

In [9]:
features = [get_text_embedding(text) for text in tqdm(df['tweet'])]

100%|██████████| 6420/6420 [00:05<00:00, 1176.97it/s]


### Support Vector Machines

In [10]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [11]:
X_train, X_test, y_train, y_test = train_test_split(features, df['label'], test_size = 0.3)

In [12]:
svm_model = SVC(kernel='linear').fit(X_train, y_train)
prediction_svm = svm_model.predict(X_test)
print(classification_report(y_test, prediction_svm))

              precision    recall  f1-score   support

        fake       0.93      0.92      0.92       940
        real       0.92      0.93      0.93       986

    accuracy                           0.92      1926
   macro avg       0.92      0.92      0.92      1926
weighted avg       0.92      0.92      0.92      1926



## Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=1000, bootstrap = True).fit(X_train, y_train)
prediction_classifier = classifier.predict(X_test)
print(classification_report(y_test, prediction_classifier))

              precision    recall  f1-score   support

        fake       0.95      0.91      0.93       940
        real       0.92      0.95      0.93       986

    accuracy                           0.93      1926
   macro avg       0.93      0.93      0.93      1926
weighted avg       0.93      0.93      0.93      1926



## KNN

In [14]:
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
prediction_kn = kn.predict(X_test)
print(classification_report(y_test, prediction_kn))

              precision    recall  f1-score   support

        fake       0.92      0.89      0.90       940
        real       0.90      0.92      0.91       986

    accuracy                           0.91      1926
   macro avg       0.91      0.91      0.91      1926
weighted avg       0.91      0.91      0.91      1926



## Logistic Regression + LDA

In [15]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression

lda = LDA(n_components=1)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.92      0.91      0.91       940
        real       0.91      0.92      0.92       986

    accuracy                           0.92      1926
   macro avg       0.92      0.92      0.92      1926
weighted avg       0.92      0.92      0.92      1926



## PyTorch + LSTM

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim

In [17]:
labels = (df['label'] == 'real').astype(int).to_list()

token_lists = [word_tokenize(text.lower()) for text in df.tweet]
max_len = len(max(token_lists, key=len))

In [18]:
def get_word_embedding(tokens, max_len):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in model_tweets.wv:
                result.append(model_tweets.wv[word])
            else:
                result.append(np.zeros(300))
        else:
            result.append(np.zeros(300))
    return result

In [19]:
features = [get_word_embedding(text, 200) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:04<00:00, 1300.60it/s]


In [20]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)

In [21]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(300, 100)
        self.out = nn.Linear(100, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction


net = Net()
print(net)

Net(
  (lstm): LSTM(300, 100)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [22]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

  """Entry point for launching an IPython kernel.


In [23]:
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCELoss()

In [28]:
def train_one_epoch(n_epochs, in_data, targets, batch_size=16):
  for epoch in range(0, n_epochs + 1):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(f'epoch - {epoch}, loss {loss}')

In [29]:
train_one_epoch(50, in_data, targets)

100%|██████████| 281/281 [02:44<00:00,  1.71it/s]


epoch - 0, loss 0.6797469854354858


100%|██████████| 281/281 [02:41<00:00,  1.74it/s]


epoch - 1, loss 0.6799895167350769


100%|██████████| 281/281 [02:40<00:00,  1.75it/s]


epoch - 2, loss 0.6800376176834106


100%|██████████| 281/281 [02:40<00:00,  1.75it/s]


epoch - 3, loss 0.6800460815429688


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 4, loss 0.6800464987754822


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 5, loss 0.6800452470779419


100%|██████████| 281/281 [02:41<00:00,  1.74it/s]


epoch - 6, loss 0.6800436973571777


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 7, loss 0.6800421476364136


100%|██████████| 281/281 [02:41<00:00,  1.74it/s]


epoch - 8, loss 0.6800406575202942


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 9, loss 0.6800389885902405


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 10, loss 0.6800374984741211


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 11, loss 0.6800359487533569


100%|██████████| 281/281 [02:44<00:00,  1.71it/s]


epoch - 12, loss 0.6800345182418823


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 13, loss 0.6800330281257629


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 14, loss 0.6800315976142883


100%|██████████| 281/281 [02:40<00:00,  1.75it/s]


epoch - 15, loss 0.6800300478935242


100%|██████████| 281/281 [02:43<00:00,  1.72it/s]


epoch - 16, loss 0.6800286173820496


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 17, loss 0.6800273656845093


100%|██████████| 281/281 [02:41<00:00,  1.74it/s]


epoch - 18, loss 0.6800256967544556


100%|██████████| 281/281 [02:43<00:00,  1.72it/s]


epoch - 19, loss 0.6800245046615601


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 20, loss 0.6800230145454407


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 21, loss 0.6800217032432556


100%|██████████| 281/281 [02:43<00:00,  1.72it/s]


epoch - 22, loss 0.6800203919410706


100%|██████████| 281/281 [02:43<00:00,  1.72it/s]


epoch - 23, loss 0.6800190806388855


100%|██████████| 281/281 [02:43<00:00,  1.71it/s]


epoch - 24, loss 0.6800177693367004


100%|██████████| 281/281 [02:43<00:00,  1.72it/s]


epoch - 25, loss 0.6800163984298706


100%|██████████| 281/281 [02:43<00:00,  1.72it/s]


epoch - 26, loss 0.6800152063369751


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 27, loss 0.6800138354301453


100%|██████████| 281/281 [02:43<00:00,  1.71it/s]


epoch - 28, loss 0.680012583732605


100%|██████████| 281/281 [02:42<00:00,  1.72it/s]


epoch - 29, loss 0.6800113916397095


100%|██████████| 281/281 [02:43<00:00,  1.72it/s]


epoch - 30, loss 0.6800101399421692


100%|██████████| 281/281 [02:43<00:00,  1.72it/s]


epoch - 31, loss 0.6800089478492737


100%|██████████| 281/281 [02:44<00:00,  1.71it/s]


epoch - 32, loss 0.680007815361023


100%|██████████| 281/281 [02:41<00:00,  1.74it/s]


epoch - 33, loss 0.6800065636634827


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 34, loss 0.6800054311752319


100%|██████████| 281/281 [02:44<00:00,  1.71it/s]


epoch - 35, loss 0.6800042390823364


100%|██████████| 281/281 [02:42<00:00,  1.72it/s]


epoch - 36, loss 0.6800032258033752


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 37, loss 0.6800020933151245


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 38, loss 0.6800009608268738


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 39, loss 0.6799999475479126


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 40, loss 0.6799988150596619


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 41, loss 0.6799976229667664


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 42, loss 0.67999666929245


100%|██████████| 281/281 [02:44<00:00,  1.71it/s]


epoch - 43, loss 0.6799956560134888


100%|██████████| 281/281 [02:43<00:00,  1.72it/s]


epoch - 44, loss 0.6799945831298828


100%|██████████| 281/281 [02:43<00:00,  1.72it/s]


epoch - 45, loss 0.6799935698509216


100%|██████████| 281/281 [02:44<00:00,  1.71it/s]


epoch - 46, loss 0.6799925565719604


100%|██████████| 281/281 [02:44<00:00,  1.71it/s]


epoch - 47, loss 0.679991602897644


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 48, loss 0.6799905896186829


100%|██████████| 281/281 [02:42<00:00,  1.73it/s]


epoch - 49, loss 0.6799896955490112


100%|██████████| 281/281 [02:41<00:00,  1.74it/s]

epoch - 50, loss 0.67998868227005





In [30]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [31]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

result = (output > 0.5) == targets_test
result.sum().item() / len(result)

0.5337487019730011