<a href="https://colab.research.google.com/github/Amirgh8080/Anlyzer/blob/main/Final_SLFEND.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the uploaded Kaggle API token as ~/.kaggle/kaggle.json.
# `mkdir -p` and the existence test keep this cell safe to re-run: the original
# `mkdir` + `mv /.kaggle /root/` sequence nested the folder on a second run.
!mkdir -p ~/.kaggle
!test -f kaggle.json && mv kaggle.json ~/.kaggle/kaggle.json
# Restrict the token to owner read/write.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the fake-news dataset archive with the Kaggle CLI
# (presumably relies on the API token installed at ~/.kaggle/kaggle.json).
!kaggle datasets download -d emineyetm/fake-news-detection-datasets

Dataset URL: https://www.kaggle.com/datasets/emineyetm/fake-news-detection-datasets
License(s): unknown
Downloading fake-news-detection-datasets.zip to /content
 81% 33.0M/41.0M [00:00<00:00, 63.4MB/s]
100% 41.0M/41.0M [00:00<00:00, 68.5MB/s]


In [None]:
# Extract the archive. `-o` overwrites existing files without prompting, so
# re-running the cell does not block on unzip's interactive "replace?" question.
!unzip -o fake-news-detection-datasets.zip

Archive:  fake-news-detection-datasets.zip
  inflating: News _dataset/Fake.csv  
  inflating: News _dataset/True.csv  


In [None]:
import pandas as pd
# Locations of the two extracted CSV files (fake and true news).
fake, true = (
    '/content/News _dataset/Fake.csv',
    '/content/News _dataset/True.csv',
)
# Read both files, keeping the fake/true order of the paths above.
fake_df, true_df = (pd.read_csv(csv_path) for csv_path in (fake, true))

In [None]:
import os

# Tag each source frame with its class before merging: 1 for True.csv, 0 for Fake.csv.
true_df['label'] = 1
fake_df['label'] = 0
main_df = pd.concat([true_df, fake_df])

# The training cell reads '/content/main_df.csv', but no cell in this notebook
# creates it. Persist the merged frame here so Restart & Run All works; an
# already existing file is left untouched.
main_df_path = '/content/main_df.csv'
if not os.path.exists(main_df_path):
    main_df.to_csv(main_df_path, index=False)

main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

# Module-level compute device: CUDA when torch reports it as available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# --- Data Loading and Preprocessing ---
def load_csv(file_path):
    """Read a CSV into a DataFrame, exposing a 'text' column under the name 'Text'.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame; a column named 'text' is renamed to 'Text',
        all other columns are left as they are.
    """
    column_aliases = {'text': 'Text'}
    return pd.read_csv(file_path).rename(columns=column_aliases)

def preprocess_data(df, text_column, label_column):
    """Turn a DataFrame into parallel lists of texts and binary labels.

    Rows with a missing text or label are dropped. A label is encoded as 1 when
    it is the string 'Real' or the numeric value 1, and as 0 otherwise. The
    original code only recognised 'Real', so a frame whose labels were already
    0/1 integers ended up with every label set to 0.

    The input frame is not modified.

    Args:
        df: Source DataFrame.
        text_column: Name of the column holding the text.
        label_column: Name of the column holding the label.

    Returns:
        Tuple ``(texts, labels)``: ``texts`` is a list of the text values and
        ``labels`` a list of ints in {0, 1}, both in row order.
    """
    # Work on the dropna result without assigning into it (avoids
    # SettingWithCopyWarning and leaves the caller's frame untouched).
    cleaned = df.dropna(subset=[text_column, label_column])
    texts = cleaned[text_column].tolist()
    labels = [
        1 if (value == 'Real' or value == 1) else 0
        for value in cleaned[label_column].tolist()
    ]
    return texts, labels

class NewsDataset(Dataset):
    """Dataset that tokenizes one news text per item and pairs it with its label.

    Args:
        texts: Sequence of input texts.
        labels: Sequence of numeric labels, aligned with ``texts``.
        tokenizer: Callable tokenizer accepting the Hugging Face style keyword
            arguments used in ``__getitem__`` and returning a mapping with
            'input_ids' and 'attention_mask' tensors that carry a leading
            batch dimension of size 1.
        max_len: Length every item is padded / truncated to.
    """
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of items, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized item at ``idx``.

        Returns:
            Dict with 'input_ids' and 'attention_mask' (the tokenizer output
            with its leading batch dimension removed) and 'label' as a float
            scalar tensor.
        """
        # Call the tokenizer directly; `encode_plus` is the legacy entry point.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # squeeze(0) removes only the batch dimension added by return_tensors='pt'.
        # A bare squeeze() would also drop the sequence dimension when max_len == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# --- Model Architecture ---
class BertEmbedding(nn.Module):
    """Wraps a pretrained BERT model and exposes its per-token hidden states."""
    def __init__(self, bert_model_name='bert-base-uncased'):
        super().__init__()
        # Pretrained weights are loaded by name via BertModel.from_pretrained.
        self.bert = BertModel.from_pretrained(bert_model_name)

    def forward(self, input_ids, attention_mask):
        """Run BERT and return ``last_hidden_state`` ([batch_size, seq_len, hidden_size])."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return encoded.last_hidden_state

class LeapGRU(nn.Module):
    """GRU cell with a learned per-step gate that decides whether to accept the update.

    At every time step a candidate state is computed with a ``GRUCell``. A small
    MLP looks at (previous state, candidate state, current input) and emits a
    2-way softmax. When the second probability is at least 0.5 the candidate
    replaces the hidden state; otherwise the previous state is carried over.

    The decision is made independently for every sample. The original code
    thresholded the batch mean, which made a sample's result depend on the
    other samples that happened to share its batch.

    Args:
        input_size: Feature size of each time step.
        hidden_size: Size of the GRU hidden state.
    """
    def __init__(self, input_size, hidden_size):
        super(LeapGRU, self).__init__()
        self.gru = nn.GRUCell(input_size, hidden_size)
        self.hidden_size = hidden_size
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size * 2 + input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 2),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Run the gated recurrence over ``x``.

        Args:
            x: Tensor of shape [batch_size, seq_len, input_size].

        Returns:
            Tensor of shape [batch_size, seq_len, hidden_size] holding the
            hidden state after every time step.
        """
        batch_size, seq_len, _ = x.size()
        # new_zeros inherits both device and dtype from the input.
        h = x.new_zeros(batch_size, self.hidden_size)

        if seq_len == 0:
            # torch.stack cannot handle an empty list; return an empty sequence instead.
            return x.new_zeros(batch_size, 0, self.hidden_size)

        outputs = []
        for t in range(seq_len):
            x_t = x[:, t, :]
            ht = self.gru(x_t, h)
            context = torch.cat([h, ht, x_t], dim=-1)
            # Second softmax output is used as the probability of accepting the update.
            update_prob = self.mlp(context)[:, 1]

            # Per-sample hard decision. The threshold itself is not differentiable,
            # so gradients reach the GRU through the accepted states only.
            update_mask = (update_prob >= 0.5).unsqueeze(-1)  # [batch_size, 1]
            h = torch.where(update_mask, ht, h)

            outputs.append(h)

        return torch.stack(outputs, dim=1)  # [batch_size, seq_len, hidden_size]

class MembershipFunction(nn.Module):
    """Maps a sequence of token embeddings to a 9-way softmax of soft domain labels."""
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.leap_gru = LeapGRU(input_size, hidden_size)
        # Projection head: hidden state -> 128 -> 9 outputs, normalised with softmax.
        head_layers = [
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, 9),
            nn.Softmax(dim=-1),
        ]
        self.mlp = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return soft labels of shape [batch_size, 9] for input ``x``."""
        sequence_states = self.leap_gru(x)
        # Only the state after the last time step feeds the projection head.
        last_state = sequence_states[:, -1, :]
        return self.mlp(last_state)

class TextCNN(nn.Module):
    """Multi-width convolutional feature extractor over a sequence of embeddings.

    One ``Conv2d`` per kernel size slides over the time axis and spans the full
    embedding width. Each feature map is max-pooled over time, the pooled
    vectors are concatenated and projected to ``num_classes`` features.

    Args:
        input_size: Embedding size of each time step.
        num_classes: Size of the output feature vector.
        kernel_sizes: Window heights (in time steps) of the convolutions.
        num_filters: Output channels of every convolution.
        dropout: Dropout probability applied to the projected output.

    The defaults reproduce the previously hard-coded configuration
    (kernels 3/4/5, 100 filters each, 300 pooled features, dropout 0.2).
    """
    def __init__(self, input_size, num_classes, kernel_sizes=(3, 4, 5),
                 num_filters=100, dropout=0.2):
        super(TextCNN, self).__init__()
        self.kernel_sizes = tuple(kernel_sizes)
        if not self.kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel size")
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, input_size)) for k in self.kernel_sizes
        ])
        # The pooled feature size follows from the configuration instead of a literal 300.
        self.fc = nn.Linear(num_filters * len(self.kernel_sizes), num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Extract features from ``x``.

        Args:
            x: Tensor of shape [batch_size, seq_len, input_size].

        Returns:
            Tensor of shape [batch_size, num_classes].
        """
        # A sequence shorter than the widest kernel would make Conv2d fail;
        # zero-pad the time axis at the end up to that width.
        shortfall = max(self.kernel_sizes) - x.size(1)
        if shortfall > 0:
            x = F.pad(x, (0, 0, 0, shortfall))

        x = x.unsqueeze(1)  # [batch_size, 1, seq_len, input_size]
        conv_results = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(result, result.size(2)).squeeze(2) for result in conv_results]
        cat = torch.cat(pooled, dim=1)
        out = self.dropout(self.fc(cat))
        return out  # [batch_size, num_classes]

class DomainGate(nn.Module):
    """Small MLP that turns a gate input into 9 softmax-normalised weights."""
    def __init__(self, input_size):
        super().__init__()
        gate_layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 9),   # one weight per domain
            nn.Softmax(dim=-1),
        ]
        self.mlp = nn.Sequential(*gate_layers)

    def forward(self, g):
        """Return the gate weights for ``g`` with shape [batch_size, 9]."""
        return self.mlp(g)

class Classifier(nn.Module):
    """Two-layer MLP head that emits one sigmoid probability per sample."""
    def __init__(self, input_size):
        super().__init__()
        head_layers = [
            nn.Linear(input_size, 384),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(384, 1),
            nn.Sigmoid(),
        ]
        self.mlp = nn.Sequential(*head_layers)

    def forward(self, v):
        """Score ``v`` and drop the trailing size-1 axis, giving shape [batch_size]."""
        probabilities = self.mlp(v)
        return probabilities.squeeze(-1)

class SLFENDModel(nn.Module):
    """Soft-label multi-domain fake news detection (SLFEND) model.

    Pipeline: BERT token embeddings -> soft domain labels -> 9 TextCNN experts
    whose outputs are mixed with domain-gate weights -> binary classifier.
    """
    def __init__(self):
        super().__init__()
        self.bert = BertEmbedding()
        self.membership_function = MembershipFunction(input_size=768, hidden_size=256)
        # Nine expert networks, each mapping the 768-d embeddings to 128 features.
        self.experts = nn.ModuleList([TextCNN(768, 128) for _ in range(9)])
        self.domain_gate = DomainGate(9)
        self.classifier = Classifier(128)

    def forward(self, input_ids, attention_mask):
        """Return one probability per sample, shape [batch_size]."""
        token_states = self.bert(input_ids, attention_mask)          # [batch_size, seq_len, 768]
        domain_membership = self.membership_function(token_states)   # [batch_size, 9]

        # Every expert sees the same embeddings; results are stacked on a new expert axis.
        per_expert = torch.stack(
            [expert(token_states) for expert in self.experts],
            dim=1,
        )                                                             # [batch_size, 9, 128]

        # Gate weights broadcast across the feature axis, then the expert axis is summed out.
        gate_weights = self.domain_gate(domain_membership).unsqueeze(-1)  # [batch_size, 9, 1]
        fused = torch.sum(per_expert * gate_weights, dim=1)

        return self.classifier(fused)

# --- Training and Evaluation ---
def train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=2e-5):
    """Train ``model`` with Adam and binary cross-entropy, printing metrics per epoch.

    The model is moved to CUDA when available, otherwise it stays on the CPU.
    After every epoch the model is evaluated on ``val_loader`` through
    ``evaluate_model`` and one summary line is printed.

    Reported training loss and accuracy are averaged over samples. The original
    code averaged per-batch means, which over-weights a short final batch and
    divides by zero when the loader is empty. With an empty training loader the
    training metrics are reported as NaN.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning probabilities of shape [batch_size].
        train_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'label' tensors.
        val_loader: Iterable of validation batches in the same format.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Adam learning rate.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(num_epochs):
        # evaluate_model switches to eval mode, so training mode is restored every epoch.
        model.train()
        loss_sum, correct, seen = 0.0, 0, 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so the
            # epoch figure is a true per-sample average.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            correct += (outputs.detach().round() == labels).sum().item()
            seen += batch_size

        train_loss = loss_sum / seen if seen else float('nan')
        train_acc = correct / seen if seen else float('nan')

        val_loss, val_acc = evaluate_model(model, val_loader, criterion)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

def evaluate_model(model, val_loader, criterion):
    """Compute the average loss and accuracy of ``model`` over ``val_loader``.

    Batches are moved to the device the model's parameters live on (CPU for a
    parameter-free model), so the function no longer depends on a module-level
    ``device`` variable that might disagree with the model's placement.

    Loss and accuracy are averaged over samples. Predictions are the model
    outputs rounded to 0/1.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning probabilities of shape [batch_size].
        val_loader: Iterable of batches with 'input_ids', 'attention_mask' and
            'label' tensors.
        criterion: Loss callable returning the mean loss of a batch.

    Returns:
        Tuple ``(mean_loss, accuracy)`` as floats; ``(nan, nan)`` when the
        loader yields no samples.
    """
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    loss_sum, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            labels = batch['label'].to(model_device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Weight the batch-mean loss by the batch size to get a per-sample average.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            correct += (outputs.round() == labels).sum().item()
            seen += batch_size

    if seen == 0:
        return float('nan'), float('nan')
    return loss_sum / seen, correct / seen

# --- Example Usage ---
# Fixed seed so the shuffle, weight initialisation and batch order are repeatable.
SEED = 42
torch.manual_seed(SEED)

file_path = '/content/main_df.csv'
df = load_csv(file_path)

# Shuffle rows before the positional split below. If the CSV is ordered by class
# (e.g. one source file appended after the other), an unshuffled 80/10/10 split
# leaves the validation and test sets with a single class.
shuffled_df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

texts, labels = preprocess_data(shuffled_df, 'Text', 'label')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 170

# Splitting the data into training (80%), validation (10%), and test (10%) sets
n_samples = len(texts)
train_end = int(0.8 * n_samples)
val_end = int(0.9 * n_samples)

train_texts, train_labels = texts[:train_end], labels[:train_end]
val_texts, val_labels = texts[train_end:val_end], labels[train_end:val_end]
test_texts, test_labels = texts[val_end:], labels[val_end:]
assert len(train_texts) + len(val_texts) + len(test_texts) == n_samples

train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = NewsDataset(val_texts, val_labels, tokenizer, max_len)
test_dataset = NewsDataset(test_texts, test_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Initialize and train the model
model = SLFENDModel()
train_model(model, train_loader, val_loader)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/10, Loss: 0.0095, Acc: 0.9996, Val Loss: 0.0000, Val Acc: 1.0000
Epoch 2/10, Loss: 0.0000, Acc: 1.0000, Val Loss: 0.0000, Val Acc: 1.0000
Epoch 3/10, Loss: 0.0000, Acc: 1.0000, Val Loss: 0.0000, Val Acc: 1.0000


KeyboardInterrupt: 