In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset,DataLoader
from collections import Counter
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
data_fake=pd.read_csv("C:\Projects\Main\Fake_News\data\Fake.csv")
data_true=pd.read_csv("C:\Projects\Main\Fake_News\data\True.csv")

In [3]:
data_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
data_true.tail()

Unnamed: 0,title,text,subject,date
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017"


In [4]:
data_fake["class"]=0
data_true["class"]=1

In [5]:
data=pd.concat([data_fake,data_true],axis=0)
data.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [6]:
data=data.drop(["date","title","subject"],axis=1)
data.head()

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [7]:
data.isnull().sum().sum()

0

In [8]:
data=data.sample(frac=1)
data.columns

Index(['text', 'class'], dtype='object')

In [9]:
data.reset_index(inplace=True)
data.drop(['index'],axis=1,inplace=True)
data.head()

Unnamed: 0,text,class
0,Line of the night from Cohen: These accusation...,0
1,Danish police are looking for a gang who beat ...,0
2,BRUSSELS (Reuters) - The president of the Euro...,1
3,Hey Barack Hussein Obama America is still wait...,0
4,(This version of September 20 s story correct...,1


In [10]:
def preprocess(text):
    text=text.lower()
    text = re.sub('\[.*?\]','',text)
    text = re.sub("\\W"," ",text)
    text = re.sub('https?://\S+|www\.\S+','',text)
    text = re.sub('<.*?>+',b'',text)
    text = re.sub('[%s]' % re.escape(string.punctuation),'',text)
    text = re.sub('\w*\d\w*','',text)
    return text

In [11]:
data['text'] = data['text'].apply(preprocess)

In [12]:
X=data["text"]
y=data["class"]

In [13]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [14]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

Tokenize->Vocab->Dataset->Model->Train->Inference

In [15]:
def tokenize_text(text):
    text=re.sub(r"[^a-zA-Z\s]", "", text.lower())
    return text.split()

In [16]:
tokenized_X_train = X_train.apply(tokenize_text)
tokenized_X_test = X_test.apply(tokenize_text)

In [17]:
def build_vocab(tokenize_text,max_vocab):
    word_counts=Counter(word for sentence in tokenize_text for word in sentence)
    most_common_words=word_counts.most_common(max_vocab-2)
    vocab={word:i+2 for i,(word,_) in enumerate(most_common_words)}
    vocab["<PAD>"]=0
    vocab["<UNK>"]=1

    return vocab


In [18]:
max_vocab=10000
vocab = build_vocab(tokenized_X_train, max_vocab)

In [20]:
import json
with open("vocab.json", "w") as f:
    json.dump(vocab, f)

In [20]:
class FakeNewsDataset(Dataset):
    def __init__(self,tokenized_texts,labels,vocab,max_len):
        self.texts = tokenized_texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)

    def __getitem__(self,idx):    
        text = self.encode_text(self.texts[idx])
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return text, label

    def encode_text(self,text):
        encoded = [self.vocab.get(word, self.vocab["<UNK>"]) for word in text]
        padded = encoded[:self.max_len] + [self.vocab["<PAD>"]] * max(0, self.max_len - len(encoded))
        return torch.tensor(padded, dtype=torch.long)

In [21]:
max_sequence_length = 200
train_dataset = FakeNewsDataset(tokenized_X_train, y_train.tolist(), vocab, max_sequence_length)
test_dataset = FakeNewsDataset(tokenized_X_test, y_test.tolist(), vocab, max_sequence_length)

In [22]:
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [23]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out[:, -1, :])  # Use the last hidden state
        output = self.fc(lstm_out)
        return output

In [24]:
embedding_dim = 100
hidden_dim = 128
output_dim = 2  # Binary classification

In [25]:
model = LSTMClassifier(len(vocab), embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [26]:
num_epochs = 5
def train_model():
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0

        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')

train_model()

Epoch 1/5, Loss: 343.5405
Epoch 2/5, Loss: 85.2981
Epoch 3/5, Loss: 11.0873
Epoch 4/5, Loss: 5.8042
Epoch 5/5, Loss: 5.4306


In [29]:
def evaluate_model():
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Accuracy: {accuracy:.4f}')

evaluate_model()

Accuracy: 0.9984


In [30]:
torch.save(model.state_dict(), "model.pth")