In [None]:
!pip install kaggle
from google.colab import files
files.upload()  

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c fake-news
!unzip fake-news.zip




Saving kaggle.json to kaggle.json
Downloading fake-news.zip to /content
 54% 25.0M/46.5M [00:00<00:00, 71.9MB/s]
100% 46.5M/46.5M [00:00<00:00, 100MB/s] 
Archive:  fake-news.zip
  inflating: submit.csv              
  inflating: test.csv                
  inflating: train.csv               


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
import pandas as pd
import torch.nn.functional as F
import re
import nltk
from nltk.corpus import stopwords
from torch.utils.data import random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, f1_score

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Load data
train_data = pd.read_csv('train.csv')
# Remove rows with missing values




train_data.head()


cuda


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
train_data = train_data[['id','text','label']]
train_data = train_data.dropna(subset=['text', 'label'])
train_data = train_data.set_index('id')

train_data.head()


Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [None]:

nltk.download('stopwords')
def clean_text(text):
  text = text.lower()
  text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
  text = re.sub(r'\s+', ' ', text)
  return text
def remove_stopwards(text):
  stopword = set(stopwords.words('english'))
  words = text.split()
  filtered_words = [word for word in words if word.lower() not in stopword]
  return ' '.join(filtered_words)
train_data['text'] = train_data['text'].apply(clean_text)
train_data['text'] = train_data['text'].apply(remove_stopwards)
train_data.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,house dem aide didnt even see comeys letter ja...,1
1,ever get feeling life circles roundabout rathe...,0
2,truth might get fired october 29 2016 tension ...,1
3,videos 15 civilians killed single us airstrike...,1
4,print iranian woman sentenced six years prison...,1


In [5]:
class FakeNewsDataset(Dataset):
    def __init__(self, data_list, vocab):
        self.data_list = data_list
        self.vocab = vocab

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        label, text = self.data_list[idx]
        text_indices = [self.vocab.get(word, self.vocab['<unk>']) for word in text.split()]
        return torch.tensor(label, dtype=torch.float32), text_indices


train_data_list = [(row['label'], row['text']) for _, row in train_data.iterrows()]


In [6]:
test_data = pd.read_csv('test.csv')
test_data = test_data[['id','text']]
test_data = test_data.fillna('')
test_data['text'] = test_data['text'].apply(clean_text)
test_data['text'] = test_data['text'].apply(remove_stopwards)
test_data['label'] = 0

In [7]:
all_text = " ".join(train_data['text']) + " " + " ".join(test_data['text'])
vocab = {word: idx for idx, word in enumerate(set(all_text.split()))}
vocab['<unk>'] = len(vocab)
vocab['<pad>'] = len(vocab)

print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 211067


In [None]:


train_dataset = FakeNewsDataset(train_data_list, vocab)

def collate_batch(batch,vocab):
    labels, texts = zip(*batch)
    texts = [torch.tensor(text, dtype=torch.long) for text in texts]
    texts = torch.nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=vocab['<pad>'])
    labels = torch.tensor(labels, dtype=torch.float32)
    return texts, labels



train_size = int(0.7 * len(train_dataset))
val_size = len(train_dataset) - train_size

train_subset, val_subset = random_split(train_dataset, [train_size, val_size])

train_dataloader = DataLoader(train_subset, batch_size=12, shuffle=True, collate_fn=lambda x: collate_batch(x, vocab), drop_last=True)
val_dataloader = DataLoader(val_subset, batch_size=12, shuffle=False, collate_fn=lambda x: collate_batch(x, vocab), drop_last=True)



In [None]:

class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout_prob=0.3):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=dropout_prob)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)  # Multiply by 2 due to bidirectional
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, (hidden, _) = self.lstm(embedded)
        hidden_states = torch.cat((hidden[-2], hidden[-1]), dim=1)
        normed_output = self.layer_norm(hidden_states)
        dropped_output = self.dropout(normed_output)
        output = self.fc(dropped_output)
        return output


vocab_size = len(vocab)  
embed_dim = 256
hidden_dim = 128
output_dim = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SentimentLSTM(vocab_size, embed_dim, hidden_dim, output_dim, dropout_prob=0.3).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-4)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)


# Training function
def train_model(model, dataloader, criterion, optimizer, device, max_grad_norm=1.0):
    model.train()
    total_loss = 0
    for texts, labels in dataloader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)


# Evaluation function
def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    preds, labels_list = [], []

    with torch.no_grad():
        for texts, labels in dataloader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts).squeeze(1)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds += torch.round(torch.sigmoid(outputs)).cpu().tolist()
            labels_list += labels.cpu().tolist()

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(labels_list, preds)
    f1 = f1_score(labels_list, preds)
    return avg_loss, accuracy, f1








In [10]:
epochs = 8
best_val_loss = float('inf')
for epoch in range(1, epochs + 1):
    # Train and evaluate
    train_loss = train_model(model, train_dataloader, criterion, optimizer, device)
    val_loss, val_accuracy, val_f1  = evaluate_model(model, val_dataloader, criterion, device)
    scheduler.step(val_loss)
    print(f"Epoch {epoch}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pt")
        print("Best model saved!")
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Current Learning Rate: {current_lr:.6f}")



Epoch 1/8, Train Loss: 0.3609, Val Loss: 0.3381, Val Accuracy: 0.8311, Val F1: 0.8505
Best model saved!
Current Learning Rate: 0.005000
Epoch 2/8, Train Loss: 0.1664, Val Loss: 0.0994, Val Accuracy: 0.9671, Val F1: 0.9669
Best model saved!
Current Learning Rate: 0.005000
Epoch 3/8, Train Loss: 0.1077, Val Loss: 0.0977, Val Accuracy: 0.9663, Val F1: 0.9666
Best model saved!
Current Learning Rate: 0.005000
Epoch 4/8, Train Loss: 0.0828, Val Loss: 0.1438, Val Accuracy: 0.9637, Val F1: 0.9626
Current Learning Rate: 0.005000
Epoch 5/8, Train Loss: 0.0758, Val Loss: 0.0687, Val Accuracy: 0.9759, Val F1: 0.9759
Best model saved!
Current Learning Rate: 0.005000
Epoch 6/8, Train Loss: 0.0611, Val Loss: 0.0937, Val Accuracy: 0.9705, Val F1: 0.9699
Current Learning Rate: 0.005000
Epoch 7/8, Train Loss: 0.0581, Val Loss: 0.0619, Val Accuracy: 0.9785, Val F1: 0.9783
Best model saved!
Current Learning Rate: 0.005000
Epoch 8/8, Train Loss: 0.0511, Val Loss: 0.1217, Val Accuracy: 0.9653, Val F1: 0.964

In [11]:
# Load the best model's state dict
state_dict = torch.load("best_model.pt", weights_only=True)  # Explicitly set weights_only=True
model.load_state_dict(state_dict)
print("Best model loaded.")


Best model loaded.


In [12]:

test_data_list = [(row['label'],row['text']) for _, row in test_data.iterrows()]
test_dataset = FakeNewsDataset(test_data_list, vocab)
test_dataloader = DataLoader(test_dataset, batch_size=12, shuffle=False, collate_fn=lambda x: collate_batch(x, vocab), drop_last=False)

In [13]:
model.eval()
preds = []
with torch.no_grad():
    for texts, _ in test_dataloader:
        texts = texts.to(device)
        outputs = model(texts).squeeze(1)
        predicted_labels = torch.round(torch.sigmoid(outputs)).cpu().tolist()
        preds += predicted_labels

test_data['predictions'] = preds

test_data.head()

Unnamed: 0,id,text,label,predictions
0,20800,palo alto calif years scorning political proce...,0,0.0
1,20801,russian warships ready strike terrorists near ...,0,1.0
2,20802,videos nodapl native american leaders vow stay...,0,1.0
3,20803,first dont succeed try different sport tim teb...,0,0.0
4,20804,42 mins ago 1 views 0 comments 0 likes first t...,0,1.0


In [14]:
len(test_data)

5200

In [None]:
test_data.tail()

Unnamed: 0,id,text,label,predictions
5195,25995,dysfunctions plague worlds megacities none may...,0,0.0
5196,25996,washington gov john kasich ohio tuesday signed...,0,0.0
5197,25997,good morning want get california today email h...,0,1.0
5198,25998,previous next 300 us marines deployed russian ...,0,1.0
5199,25999,perhaps youve seen new tv series whose pilot e...,0,1.0


In [15]:
submit_df = test_data[['id','predictions']]

submit_df.columns = ['id','label']
submit_df['label'] = submit_df['label'].astype(int)

submit_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit_df['label'] = submit_df['label'].astype(int)


Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1


In [16]:
submit = pd.read_csv('submit.csv')
submit.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1


In [17]:
submit_df.to_csv('sbmission8.csv',index=False)
