In [1336]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [1337]:
X_train = pd.read_csv(r'C:\Users\abiab\Downloads\train.csv.zip')
X_test = pd.read_csv(r'C:\Users\abiab\Downloads\test.csv.zip')

In [1340]:
df = X_train.drop(['id', 'author', 'text'], axis = 1)

In [1341]:
dataframe = pd.DataFrame(df)

In [1342]:
dataframe = dataframe.dropna()

In [1343]:
dataframe.reset_index(drop = True, inplace = True)

In [1346]:
dataframe['length'] = dataframe['title'].apply(len)

In [1347]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re
from tqdm import tqdm

tqdm.pandas()
Lemma = WordNetLemmatizer()
Messages = dataframe['title']
stopwords = stopwords.words('english')
def text_preprocess(message):
    no_punc = [char for char in message if char not in string.punctuation]
    no_punc = ''.join(no_punc)
    words = re.sub('[^a-zA-Z]', ' ', no_punc)
    return [Lemma.lemmatize(word.lower()) for word in words.split() if word not in stopwords]

In [1282]:
def text_to_indices(vocab, text):
    text = text.split()
    numerical_sentence = []
    for token in text:
        if token in vocab:
            numerical_sentence.append(vocab[token])
        else:
            numerical_sentence.append(vocab['UNK'])
    return numerical_sentence

In [1349]:
training_seq = []
for seq in Messages:
    training_seq.append(text_to_indices(vocab, seq.lower()))
    
max_len=max(len(x) for x in training_seq)

In [1352]:
import torch
from torch.utils.data import Dataset, DataLoader

class dataset(Dataset):
    def __init__(self, df, vocab, label):
        self.df = df
        self.vocab = vocab
        self.label = label
        
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self,index):
        numerical_question = text_to_indices(self.vocab, self.df.iloc[index].lower())
        return torch.tensor(numerical_question), torch.tensor(self.label[index])

In [1353]:
data = dataset(Messages, vocab, df['label'])

In [1354]:
from torch.nn.utils.rnn import pad_sequence

def collate_fun(batch):
    titles, labels = zip(*batch)
    titles_padded = pad_sequence(titles, batch_first=True)
    labels = torch.stack(labels)
    return titles_padded, labels

In [1355]:
dataloader = DataLoader(data, shuffle = True, batch_size = 32, collate_fn = collate_fun)

In [1356]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, embed_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first = True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, title):
        embed = self.embedding(title)     # [batch, seq_len, embed_dim] 
        hidden, output = self.rnn(embed)    # output : [1, batch, hidden_dim]
        final_output = output.squeeze(0)    # [batch, hidden_dim]
        output = self.fc(final_output)       # [batch, 1]
        return output.squeeze(1)  # [batch]


model = RNN(len(vocab), 64, 50)

In [1357]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [1358]:
if torch.cuda.is_available():
    device = torch.device(device='cuda')
else:
    device = torch.device(device='cpu')

In [None]:
plt.plot(train_losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [1298]:
def predict(model, title, threshold=0.5):
    num_question = text_to_indices(vocab, title)
    print("Token indices:", num_question)

    if all(tok == vocab['UNK'] for tok in num_question):
        print("All tokens unknown.")

    question_tensor = torch.tensor(num_question).unsqueeze(0)

    with torch.no_grad():
        output = model(question_tensor)
        print(output[:10])
        prob = torch.sigmoid(output).item()
        print(f"Predicted probability of Real: {prob:.4f}")

    if prob < threshold:
        print("Prediction: Fake")
    else:
        print("Prediction: Real")

In [None]:
str = "Prize win script".lower()
predict(model, str)

In [None]:
n_correct = 0
n_samples = 0
with torch.no_grad():
    for title, labels in dataloader:
        title = title.to(device)
        labels = labels.to(device)

        output = model(title)  
        prob = torch.sigmoid(output)
        preds = (probs >= 0.5)
        
        n_samples += labels.shape[0]
        n_correct += (preds == labels).sum().item()
    accuracy = 100 * n_correct / n_samples
    print("accuracy : ", accuracy)
print(n_correct)
print(n_samples)

In [None]:
predict(model, "You won a prize lottery")

In [1322]:
from sklearn.metrics import classification_report, accuracy_score

all_preds = []
all_labels = []

with torch.no_grad():
    for title, labels in dataloader:
        title = title.to(device)
        labels = labels.to(device)

        outputs = model(title)

        probs = torch.sigmoid(outputs)
        predicted = (probs > 0.5).float()
        predicted = predicted.view_as(labels)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Now use sklearn metrics
accuracy = accuracy_score(all_labels, all_preds)
print(f"✅ Accuracy: {accuracy * 100:.2f}%")

report = classification_report(all_labels, all_preds, target_names=["Fake", "Real"])
print(report)

✅ Accuracy: 74.59%
              precision    recall  f1-score   support

        Fake       1.00      0.51      0.67     10387
        Real       0.66      1.00      0.79      9855

    accuracy                           0.75     20242
   macro avg       0.83      0.75      0.73     20242
weighted avg       0.83      0.75      0.73     20242

