##Загружаем и обрабатываем данные


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Constraint_Train.csv')
df

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real
...,...,...,...
6415,6416,A tiger tested positive for COVID-19 please st...,fake
6416,6417,???Autopsies prove that COVID-19 is??� a blood...,fake
6417,6418,_A post claims a COVID-19 vaccine has already ...,fake
6418,6419,Aamir Khan Donate 250 Cr. In PM Relief Cares Fund,fake


In [3]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
sentences = [word_tokenize(text.lower()) for text in tqdm(df.tweet)]

100%|██████████| 6420/6420 [00:01<00:00, 4180.40it/s]


In [52]:
from gensim.models.word2vec import Word2Vec
model_tweets = Word2Vec(sentences, workers =4, vector_size=300, min_count=3, window=5, epochs =15)

In [53]:
model_tweets.wv.most_similar('woman')

[('man', 0.9323614835739136),
 ('20s', 0.9133509397506714),
 ('boy', 0.897718608379364),
 ('allegedly', 0.8954858779907227),
 ('doctor', 0.8941175937652588),
 ('30s', 0.8937039375305176),
 ('italian', 0.88979172706604),
 ('showing', 0.8879326581954956),
 ('muslim', 0.8864081501960754),
 ('photo', 0.8721577525138855)]

In [54]:
model_tweets.init_sims()

  model_tweets.init_sims()


In [55]:
import numpy as np



In [56]:
def get_text_embedding(text):
    result = []
    for word in word_tokenize(text.lower()):
        if word in model_tweets.wv:
            result.append(model_tweets.wv[word])

    if len(result):
        result = np.sum(result, axis=0)
    else:
        result = np.zeros(300)
    return result

In [57]:
features = [get_text_embedding(text) for text in tqdm(df.tweet)]


100%|██████████| 6420/6420 [00:01<00:00, 3345.21it/s]


1. Метод логичтической регрессии. с помощью этого метода мы получаем нужный результат f1

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split



In [60]:
X_train, X_test, y_train, y_test = train_test_split(features, df.label, test_size=0.33)

In [61]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
from sklearn.metrics import classification_report


In [64]:
predicted = model.predict(X_test)

In [65]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.90      0.90      0.90      1014
        real       0.91      0.91      0.91      1105

    accuracy                           0.91      2119
   macro avg       0.90      0.91      0.90      2119
weighted avg       0.91      0.91      0.91      2119



2. Дерево решений. Здесь не удалось достичь нужного результата, даже настраивая глубину. Предположу, что первичная обработка данных от шума могла бы улучшить результат, так как деревья решений чувствительны к шуму в данных

In [99]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 10, min_samples_leaf = 1, random_state=1 )

In [100]:
tree.fit(X_train, y_train)

In [101]:
pred_tree = tree.predict(X_test)

In [102]:
print(classification_report(y_test, pred_tree))

              precision    recall  f1-score   support

        fake       0.89      0.87      0.88      1014
        real       0.89      0.90      0.90      1105

    accuracy                           0.89      2119
   macro avg       0.89      0.89      0.89      2119
weighted avg       0.89      0.89      0.89      2119



3. Метод ближайших соседей - нужный результат f1-score получен

In [103]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier ()

In [104]:
knn.fit(X_train, y_train)

In [105]:
knn_pred = knn.predict(X_test)

In [106]:
print(classification_report(y_test, knn_pred))

              precision    recall  f1-score   support

        fake       0.92      0.93      0.92      1014
        real       0.93      0.93      0.93      1105

    accuracy                           0.93      2119
   macro avg       0.93      0.93      0.93      2119
weighted avg       0.93      0.93      0.93      2119



4. Метод опорных векторов. Нужное качество модели достигнуто

In [107]:
from sklearn.svm import SVC
svm = SVC(kernel = 'linear', random_state = 1)

In [108]:
svm.fit(X_train, y_train)

In [109]:
svm_pred = svm.predict(X_test)

In [110]:
print(classification_report(y_test, svm_pred ))

              precision    recall  f1-score   support

        fake       0.90      0.92      0.91      1014
        real       0.92      0.91      0.92      1105

    accuracy                           0.91      2119
   macro avg       0.91      0.91      0.91      2119
weighted avg       0.91      0.91      0.91      2119



##Методы  pytorch

In [130]:
labels = (df.label == 'real').astype(int).to_list()

In [131]:
token_lists = [word_tokenize(text.lower()) for text in df.tweet]
max_len = len(max(token_lists, key=len))

In [132]:
def get_word_embedding(tokens, max_len):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in model_tweets.wv:
                result.append(model_tweets.wv[word])
            else:
                result.append(np.zeros(300))
        else:
            result.append(np.zeros(300))
    return result

In [133]:
features = [get_word_embedding(text, 200) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:03<00:00, 1827.46it/s]


In [134]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33)

In [135]:
import torch
import torch.nn as nn
import torch.optim as optim

In [136]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(300, 100)
        self.out = nn.Linear(100, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction


net = Net()
print(net)

Net(
  (lstm): LSTM(300, 100)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [137]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

In [138]:
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCELoss()

In [139]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [140]:
train_one_epoch(in_data, targets)

100%|██████████| 269/269 [00:13<00:00, 20.28it/s]

tensor(0.7076, grad_fn=<BinaryCrossEntropyBackward0>)





In [141]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [142]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)
result = (output > 0.5) == targets_test
result.sum().item() / len(result)

0.5247758376592733

Получили нужный результат, тренируя сеть на одной эпохе. Теперь попробуем взять один embedding для всего текста,поменять оптимизатор и увеличить количество эпох до 8. Результат значительно лучше

In [149]:
features = [get_text_embedding(text) for text in tqdm(df.tweet)]

100%|██████████| 6420/6420 [00:01<00:00, 3364.37it/s]


In [151]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.out = nn.Linear(300, 1)

    def forward(self, x):
        return torch.sigmoid(self.out(x))


net = Net()
print(net)

Net(
  (out): Linear(in_features=300, out_features=1, bias=True)
)


In [153]:
optimizer = optim.Adam(net.parameters(), lr=0.01)
criterion = nn.BCELoss()


In [154]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25)

In [155]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

In [157]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.squeeze(), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [158]:
for i in range(8):
  train_one_epoch(in_data, targets)

100%|██████████| 301/301 [00:00<00:00, 1926.86it/s]


tensor(0.7690, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1985.64it/s]


tensor(0.9947, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1977.23it/s]


tensor(0.8413, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1961.27it/s]


tensor(1.2207, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1981.18it/s]


tensor(0.6119, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1957.89it/s]


tensor(1.7103, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 301/301 [00:00<00:00, 2036.09it/s]


tensor(1.9494, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 301/301 [00:00<00:00, 2052.57it/s]

tensor(0.8757, grad_fn=<BinaryCrossEntropyBackward0>)





In [160]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [161]:
with torch.no_grad():
    output = net(in_data_test).squeeze(1)
result = (output > 0.5) == targets_test
result.sum().item() / len(result)

0.8934579439252337