In [None]:
!pip install opendatasets
!pip install pandas
!pip install kaggle

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
import opendatasets as od
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
# Fetch the Kaggle "fake-or-real-news" dataset; per the cell output it is saved under
# ./fake-or-real-news relative to the current working directory.
od.download("https://www.kaggle.com/datasets/jillanisofttech/fake-or-real-news")
# Fetch NLTK's stop-word corpus, required by stopwords.words('english') during text cleaning.
nltk.download('stopwords')

Downloading fake-or-real-news.zip to ./fake-or-real-news


100%|██████████| 11.5M/11.5M [00:01<00:00, 6.32MB/s]





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import numpy as np
import pandas as pd
import re
import string
from collections import Counter
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Data cleaning function
# Both patterns are compiled once instead of on every call.
# Raw strings are used because '\[' is an invalid escape sequence in a normal string literal.
_BRACKETED_RE = re.compile(r'\[[^]]*\]')
_URL_RE = re.compile(r'http\S+')
_DEFAULT_STOP_WORDS = None  # filled lazily by _default_stop_words()


def _default_stop_words():
    """Return NLTK's English stop words plus every single punctuation character (built once)."""
    global _DEFAULT_STOP_WORDS
    if _DEFAULT_STOP_WORDS is None:
        _DEFAULT_STOP_WORDS = (
            frozenset(stopwords.words('english')) | frozenset(string.punctuation)
        )
    return _DEFAULT_STOP_WORDS


def clean_text(text, stop_words=None):
    """Strip bracketed spans, URLs and stop words from a piece of text.

    Processing steps, in order:
      1. Remove every ``[...]`` span (brackets included).
      2. Remove every run of non-whitespace characters starting at ``http``.
      3. Split on whitespace and drop each token whose lowercase form is in
         ``stop_words``; the surviving tokens are joined with single spaces.

    Only tokens that *are* a punctuation character are dropped; punctuation
    attached to a word (e.g. ``"word,"``) is left in place.

    Args:
        text: The string to clean. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as empty text.
        stop_words: Optional container of lowercase tokens to drop. Defaults to
            NLTK's English stop words plus ``string.punctuation``; the default
            set is built on first use and reused by later calls.

    Returns:
        The cleaned text as a single space-separated string ("" if nothing is left).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stop_words is None:
        stop_words = _default_stop_words()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

# Load and clean data
# od.download() saves the dataset to ./fake-or-real-news relative to the working
# directory, so the CSV is read from that same relative location rather than from a
# hard-coded absolute path that is only valid when the working directory is /content.
data_path = 'fake-or-real-news/fake_or_real_news.csv'
data = pd.read_csv(data_path)
# Replace the raw article text with its cleaned form (see clean_text).
data['text'] = data['text'].apply(clean_text)

# Build vocabulary
def build_vocab(texts, num_words=5000):
    """Map the most frequent whitespace-separated words to integer indices.

    Args:
        texts: Iterable of strings; each is split on whitespace to obtain words.
        num_words: Total size of the returned mapping including the ``<UNK>``
            entry, so at most ``num_words - 1`` real words are kept.

    Returns:
        dict mapping each kept word to its 1-based frequency rank (1 = most
        frequent), plus ``'<UNK>'`` mapped to 0.
    """
    frequencies = Counter()
    for document in texts:
        frequencies.update(document.split())

    word_index = {}
    ranked = frequencies.most_common(num_words - 1)
    for rank, (token, _count) in enumerate(ranked, start=1):
        word_index[token] = rank
    # Index 0 is reserved for words that are not in the vocabulary.
    word_index['<UNK>'] = 0
    return word_index

# Word -> index mapping built from the cleaned article text, using build_vocab's
# default size (num_words=5000, which includes the <UNK> entry at index 0).
vocab = build_vocab(data['text'])

# Tokenization and padding
def text_to_sequences(texts, vocab, maxlen):
    """Encode texts as a 2-D integer array of vocabulary indices, one row per text.

    Each text is split on whitespace and every word is replaced by its index in
    ``vocab``; words missing from ``vocab`` get the ``'<UNK>'`` index. Rows are
    then forced to exactly ``maxlen`` entries:

      * shorter rows are left-padded with 0, so the real tokens sit at the end;
      * longer rows keep only their last ``maxlen`` tokens.

    Args:
        texts: Iterable of strings.
        vocab: Mapping from word to integer index; must contain ``'<UNK>'``.
        maxlen: Row length of the result; must be >= 0.

    Returns:
        ``numpy.ndarray`` of integers with shape ``(number of texts, maxlen)``.
        The shape holds for empty input and for ``maxlen == 0`` as well.

    Raises:
        ValueError: If ``maxlen`` is negative.
        KeyError: If ``vocab`` has no ``'<UNK>'`` entry.
    """
    if maxlen < 0:
        raise ValueError(f"maxlen must be non-negative, got {maxlen}")

    unk_index = vocab['<UNK>']  # looked up once instead of once per word
    sequences = []
    for text in texts:
        # Convert words to indices, using <UNK> for words not in vocab
        seq = [vocab.get(word, unk_index) for word in text.split()]
        if len(seq) < maxlen:
            seq = [0] * (maxlen - len(seq)) + seq  # Prepend padding
        else:
            # Keep the last `maxlen` tokens. An explicit start index is used because
            # seq[-maxlen:] would return the *whole* list when maxlen == 0.
            seq = seq[len(seq) - maxlen:]
        sequences.append(seq)
    # The explicit dtype and reshape keep the result 2-D and integer-typed even when
    # `texts` is empty (np.array([]) alone would be a 1-D float array).
    return np.array(sequences, dtype=int).reshape(len(sequences), maxlen)

# Encode every cleaned article as a row of 500 vocabulary indices
X = text_to_sequences(data['text'], vocab, maxlen=500)

# Label encoding: map the label strings to integers, shaped as a single column
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label']).reshape(-1, 1)

# Train-test split: hold out 20% of the rows for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Dataset and DataLoader
class TextDataset(Dataset):
    """Dataset pairing token-index sequences with their labels.

    Both inputs are converted to tensors once, at construction time: ``X`` to
    ``torch.long`` and ``y`` to ``torch.float32``.
    """

    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.long)
        labels = torch.tensor(y, dtype=torch.float32)
        self.X, self.y = features, labels

    def __len__(self):
        """Number of samples (size of the first dimension of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` pair stored at ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Wrap the train / validation arrays as tensor datasets
train_data = TextDataset(X_train, y_train)
val_data = TextDataset(X_val, y_val)

# Batches of 32; only the training data is reshuffled each epoch
train_loader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=32, shuffle=False)

# Model definition
class LSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> dropout -> linear -> sigmoid classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output units of the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM's own `dropout` argument only acts between stacked layers, so with a
        # single layer dropout is applied by a separate module after the LSTM.
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return sigmoid outputs in [0, 1] for a batch of index sequences.

        ``x`` is indexed as (batch, time) because the LSTM is ``batch_first``.
        Only the LSTM output at the last time step feeds the classifier head.
        """
        embedded = self.embedding(x)
        lstm_out, _state = self.lstm1(embedded)
        final_step = lstm_out[:, -1]
        scores = self.fc(self.dropout(final_step))
        return torch.sigmoid(scores)

# Model parameters
vocab_size = 5000    # number of rows in the embedding table
embedding_dim = 400  # size of each word vector
hidden_dim = 256     # size of the LSTM hidden state
output_dim = 1       # single sigmoid output for the binary label

# Create the model and move it to the selected device
model = LSTMClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
).to(device)

# Loss and optimizer: binary cross-entropy on the sigmoid outputs, optimized with Adam
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)




Using device: cuda


In [None]:
def _run_epoch(model, loader, criterion, run_device, optimizer=None):
    """Make one pass over ``loader`` and return the mean of the per-batch losses.

    When ``optimizer`` is given, the model parameters are updated after every batch;
    otherwise the pass only evaluates. Returns NaN when ``loader`` yields no batches.
    """
    total_loss = 0.0
    n_batches = 0
    for inputs, targets in loader:
        # Move data to the device the model lives on
        inputs = inputs.to(run_device)
        targets = targets.to(run_device)

        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(inputs).squeeze()
        loss = criterion(outputs, targets.squeeze())
        if optimizer is not None:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()
        n_batches += 1
    return total_loss / n_batches if n_batches else float('nan')


def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=10):
    """Train ``model`` for ``epochs`` passes and print the losses after each epoch.

    Differences from a bare training loop that matter to callers:
      * The reported training loss is the mean over all batches of the epoch,
        not just the loss of the final batch.
      * When ``val_loader`` is not None, the model is evaluated on it after every
        epoch (eval mode, gradients disabled) and the validation loss is printed too.
      * Batches are moved to the device of the model's parameters, so the function
        does not depend on a global ``device`` variable.
      * An empty loader no longer raises; its loss is reported as NaN.

    Args:
        model: Module to train; left in training mode on return.
        train_loader: Iterable of ``(inputs, targets)`` batches used for updates.
        val_loader: Iterable of ``(inputs, targets)`` batches used for evaluation,
            or None to skip validation.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, targets)`` after both have
            been squeezed.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is reported through ``print``.
    """
    # Follow the model's own device; a parameter-less model falls back to the CPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(epochs):
        train_loss = _run_epoch(model, train_loader, criterion, run_device, optimizer)
        message = f'Epoch {epoch + 1}, Loss: {train_loss}'

        if val_loader is not None:
            model.eval()  # disables dropout while measuring validation loss
            with torch.no_grad():
                val_loss = _run_epoch(model, val_loader, criterion, run_device)
            model.train()  # restore training mode for the next epoch / the caller
            message += f', Val Loss: {val_loss}'

        print(message)


In [None]:

# Binary cross-entropy loss; train_model squeezes outputs and targets itself,
# so the targets need no reshaping here
criterion = nn.BCELoss()

# Train the model
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
)

Epoch 1, Loss: 0.13634933531284332
Epoch 2, Loss: 0.0088317496702075
Epoch 3, Loss: 0.0031148872803896666
Epoch 4, Loss: 0.005209406837821007
Epoch 5, Loss: 0.05426201969385147
Epoch 6, Loss: 0.015737349167466164
Epoch 7, Loss: 0.0006198110058903694
Epoch 8, Loss: 0.000591946009080857
Epoch 9, Loss: 0.00031814322574064136
Epoch 10, Loss: 0.0007960774819366634


In [None]:
# Save the entire model
# NOTE(review): per PyTorch's saving/loading guide this pickles the whole module object,
# so loading it presumably needs the LSTMClassifier class to be importable and should
# only be done with trusted files — confirm before relying on this artifact.
torch.save(model, 'complete_model.pth')

# Also save only the parameters (state_dict); this file is loaded by building an
# LSTMClassifier with the same constructor arguments and calling load_state_dict on it.
torch.save(model.state_dict(), 'model_state_dict.pth')
