In [195]:
import pandas as pd
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import torch
from torch import nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import  classification_report
from tqdm import tqdm

In [142]:
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

In [143]:
train_df.fillna(' ', inplace=True)
test_df.fillna(' ', inplace=True)

In [144]:
train_df['text'] = train_df['author'] + ' ' + train_df['title'] + ' ' + train_df['text']
test_df['text'] = test_df['author'] + ' ' + test_df['title'] + ' ' + test_df['text']

In [145]:
import nltk
nltk.download('wordnet')
porter_stemmer = PorterStemmer()  

def preprocess_text(text):
    if pd.isna(text):
        return []
    text = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    text = [porter_stemmer.stem(word) for word in text if word not in stopwords.words('english')]
    return text

train_df["text"] = train_df["text"].apply(preprocess_text)
test_df["text"] = test_df["text"].apply(preprocess_text)

[nltk_data] Downloading package wordnet to /home/grigorii/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [189]:
y = train_df["label"].values
train_normalized = train_df["text"].values

In [192]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [191]:
word2vec_model = Word2Vec(sentences=train_normalized, vector_size=100, window=5, min_count=2, workers=4)

def text_to_vector(text, model, vector_size=100):
    vectors = [model.wv[word] for word in text if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

ready_X = np.array([text_to_vector(text, word2vec_model) for text in train_normalized])

In [152]:
ready_X

array([[ 0.17005411, -0.06975884, -0.97706306, ...,  0.41913012,
        -0.76474208,  0.29716259],
       [-0.01866901, -0.18739831, -0.96270233, ...,  0.21786672,
        -0.65085131,  0.73862445],
       [ 0.12808317, -0.48967269, -0.79332817, ..., -0.20019738,
        -0.3065162 ,  1.13307989],
       ...,
       [-0.13699907, -0.38939902, -0.62628883, ..., -0.05546546,
        -0.40605801,  0.70770413],
       [ 0.09616299, -0.74483943, -0.53934503, ...,  0.22275048,
         0.21181351,  0.72581404],
       [-0.0765762 , -0.17861092, -0.50527352, ..., -0.23413526,
        -0.33993483,  0.88086396]])

In [168]:
from sklearn.linear_model import LogisticRegression

x_train, x_test, y_train, y_test = train_test_split(ready_X, y, test_size=0.2)

model_1 = LogisticRegression()
model_1.fit(x_train, y_train)
pred_1 = model_1.predict(x_test)
print(classification_report(y_test, pred_1))

              precision    recall  f1-score   support

           0       0.91      0.92      0.92      2074
           1       0.92      0.91      0.92      2086

    accuracy                           0.92      4160
   macro avg       0.92      0.92      0.92      4160
weighted avg       0.92      0.92      0.92      4160



In [182]:
X_train, X_val, y_train, y_val = train_test_split(ready_X, y, test_size=0.2)
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)
y_pred = gb_clf.predict(X_val)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.88      0.90      0.89      2060
           1       0.90      0.87      0.89      2100

    accuracy                           0.89      4160
   macro avg       0.89      0.89      0.89      4160
weighted avg       0.89      0.89      0.89      4160



In [184]:
from sklearn.ensemble import RandomForestClassifier

model_3 = RandomForestClassifier()
model_3.fit(x_train,y_train)
pred_3 = model_3.predict(x_test)
cr3    = classification_report(y_test,pred_3)
print(cr3)

              precision    recall  f1-score   support

           0       0.53      0.54      0.53      2074
           1       0.53      0.51      0.52      2086

    accuracy                           0.53      4160
   macro avg       0.53      0.53      0.53      4160
weighted avg       0.53      0.53      0.53      4160



In [183]:
class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate=0):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x.unsqueeze(1))
        x = self.fc(lstm_out[:, -1, :])
        return x

In [179]:
train_dataset = NewsDataset(X_train, y_train)
val_dataset = NewsDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
model = LSTMClassifier(input_size=100, hidden_size=100, num_layers=2, output_size=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [181]:
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in tqdm(train_loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
    
    # Validation
    model.eval()
    y_preds, y_true = [], []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            y_preds.extend(predicted.cpu().numpy())
            y_true.extend(y_batch.numpy())
    
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(classification_report(y_true, y_preds))


100%|██████████| 1040/1040 [00:04<00:00, 238.82it/s]


Epoch 1/15
LSTM Neural Network Accuracy: 0.8903846153846153
              precision    recall  f1-score   support

           0       0.93      0.85      0.89      2079
           1       0.86      0.94      0.90      2081

    accuracy                           0.89      4160
   macro avg       0.89      0.89      0.89      4160
weighted avg       0.89      0.89      0.89      4160



100%|██████████| 1040/1040 [00:03<00:00, 302.69it/s]


Epoch 2/15
LSTM Neural Network Accuracy: 0.901201923076923
              precision    recall  f1-score   support

           0       0.93      0.87      0.90      2079
           1       0.88      0.93      0.90      2081

    accuracy                           0.90      4160
   macro avg       0.90      0.90      0.90      4160
weighted avg       0.90      0.90      0.90      4160



100%|██████████| 1040/1040 [00:03<00:00, 286.87it/s]


Epoch 3/15
LSTM Neural Network Accuracy: 0.9161057692307693
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      2079
           1       0.92      0.92      0.92      2081

    accuracy                           0.92      4160
   macro avg       0.92      0.92      0.92      4160
weighted avg       0.92      0.92      0.92      4160



100%|██████████| 1040/1040 [00:03<00:00, 335.33it/s]


Epoch 4/15
LSTM Neural Network Accuracy: 0.9170673076923077
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      2079
           1       0.95      0.89      0.91      2081

    accuracy                           0.92      4160
   macro avg       0.92      0.92      0.92      4160
weighted avg       0.92      0.92      0.92      4160



100%|██████████| 1040/1040 [00:03<00:00, 262.62it/s]


Epoch 5/15
LSTM Neural Network Accuracy: 0.9132211538461539
              precision    recall  f1-score   support

           0       0.95      0.88      0.91      2079
           1       0.88      0.95      0.92      2081

    accuracy                           0.91      4160
   macro avg       0.92      0.91      0.91      4160
weighted avg       0.92      0.91      0.91      4160



100%|██████████| 1040/1040 [00:04<00:00, 254.94it/s]


Epoch 6/15
LSTM Neural Network Accuracy: 0.9264423076923077
              precision    recall  f1-score   support

           0       0.92      0.93      0.93      2079
           1       0.93      0.92      0.93      2081

    accuracy                           0.93      4160
   macro avg       0.93      0.93      0.93      4160
weighted avg       0.93      0.93      0.93      4160



100%|██████████| 1040/1040 [00:04<00:00, 240.47it/s]


Epoch 7/15
LSTM Neural Network Accuracy: 0.9235576923076924
              precision    recall  f1-score   support

           0       0.92      0.93      0.92      2079
           1       0.93      0.92      0.92      2081

    accuracy                           0.92      4160
   macro avg       0.92      0.92      0.92      4160
weighted avg       0.92      0.92      0.92      4160



100%|██████████| 1040/1040 [00:04<00:00, 236.02it/s]


Epoch 8/15
LSTM Neural Network Accuracy: 0.9225961538461539
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      2079
           1       0.96      0.88      0.92      2081

    accuracy                           0.92      4160
   macro avg       0.93      0.92      0.92      4160
weighted avg       0.93      0.92      0.92      4160



100%|██████████| 1040/1040 [00:04<00:00, 239.78it/s]


Epoch 9/15
LSTM Neural Network Accuracy: 0.9283653846153846
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      2079
           1       0.94      0.92      0.93      2081

    accuracy                           0.93      4160
   macro avg       0.93      0.93      0.93      4160
weighted avg       0.93      0.93      0.93      4160



100%|██████████| 1040/1040 [00:04<00:00, 230.31it/s]


Epoch 10/15
LSTM Neural Network Accuracy: 0.9319711538461538
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      2079
           1       0.93      0.94      0.93      2081

    accuracy                           0.93      4160
   macro avg       0.93      0.93      0.93      4160
weighted avg       0.93      0.93      0.93      4160



100%|██████████| 1040/1040 [00:04<00:00, 229.35it/s]


Epoch 11/15
LSTM Neural Network Accuracy: 0.9199519230769231
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      2079
           1       0.95      0.88      0.92      2081

    accuracy                           0.92      4160
   macro avg       0.92      0.92      0.92      4160
weighted avg       0.92      0.92      0.92      4160



100%|██████████| 1040/1040 [00:04<00:00, 236.43it/s]


Epoch 12/15
LSTM Neural Network Accuracy: 0.9293269230769231
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      2079
           1       0.94      0.92      0.93      2081

    accuracy                           0.93      4160
   macro avg       0.93      0.93      0.93      4160
weighted avg       0.93      0.93      0.93      4160



100%|██████████| 1040/1040 [00:04<00:00, 231.45it/s]


Epoch 13/15
LSTM Neural Network Accuracy: 0.9317307692307693
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      2079
           1       0.95      0.91      0.93      2081

    accuracy                           0.93      4160
   macro avg       0.93      0.93      0.93      4160
weighted avg       0.93      0.93      0.93      4160



100%|██████████| 1040/1040 [00:03<00:00, 335.53it/s]


Epoch 14/15
LSTM Neural Network Accuracy: 0.9317307692307693
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      2079
           1       0.95      0.92      0.93      2081

    accuracy                           0.93      4160
   macro avg       0.93      0.93      0.93      4160
weighted avg       0.93      0.93      0.93      4160



100%|██████████| 1040/1040 [00:04<00:00, 233.04it/s]


Epoch 15/15
LSTM Neural Network Accuracy: 0.9310096153846154
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      2079
           1       0.92      0.95      0.93      2081

    accuracy                           0.93      4160
   macro avg       0.93      0.93      0.93      4160
weighted avg       0.93      0.93      0.93      4160

