In [1]:
import torch
from torch_geometric.data import HeteroData
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.nn import GCNConv, SAGEConv
import torch.nn.functional as F
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('../../data/BuzzFeedUser.txt', 'r') as f:
    user_list = [line.strip() for line in f.readlines()]
user_id_map = {i + 1: uid for i, uid in enumerate(user_list)}

In [3]:
with open('../../data/BuzzFeedNews.txt', 'r') as f:
    news_list = [line.strip() for line in f.readlines()]
news_id_map = {i + 1: nid for i, nid in enumerate(news_list)}

In [4]:
user_news_edges = []
counts = []

with open('../../data/BuzzFeedNewsUser.txt', 'r') as f:
    for line in f:
        news_id, user_id, count = map(int, line.strip().split())
        user_news_edges.append((user_id - 1, news_id - 1))
        counts.append(count)

In [5]:
max_count = max(counts)
normalized_counts = [c / max_count for c in counts]

In [6]:
user_user_edges = []

with open('../../data/BuzzFeedUserUser.txt', 'r') as f:
    for line in f:
        follower_id, followee_id = map(int, line.strip().split())
        user_user_edges.append((follower_id - 1, followee_id - 1))

In [7]:
fake_df = pd.read_csv("../../data/BuzzFeed_fake_news_content.csv")
real_df = pd.read_csv("../../data/BuzzFeed_real_news_content.csv")

In [8]:
fake_df['label'] = 1
real_df['label'] = 0
df = pd.concat([fake_df, real_df], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

news_content = (df['title'] + ' ' + df['text']).tolist()
news_labels = df['label'].tolist()

news_ids = df['id'].tolist()

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

vectorizer = TfidfVectorizer(max_features=300)
news_features = vectorizer.fit_transform(news_content).toarray()
news_features = torch.tensor(news_features, dtype=torch.float)

In [12]:
data = HeteroData()

num_users = len(user_id_map)
data['user'].x = torch.eye(num_users)

data['news'].x = news_features

data['news'].y = torch.tensor(news_labels, dtype=torch.long)

In [13]:
user_news_edge_index = torch.tensor(user_news_edges, dtype=torch.long).t().contiguous()
user_news_edge_attr = torch.tensor(normalized_counts, dtype=torch.float).unsqueeze(1)

data['user', 'spreads', 'news'].edge_index = user_news_edge_index
data['user', 'spreads', 'news'].edge_attr = user_news_edge_attr

user_user_edge_index = torch.tensor(user_user_edges, dtype=torch.long).t().contiguous()
data['user', 'follows', 'user'].edge_index = user_user_edge_index


In [14]:
class HeteroGCN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv = torch.nn.ModuleDict({
            'follows': GCNConv(-1, hidden_channels),
            'spreads': SAGEConv((-1, -1), hidden_channels),
        })
        
        self.lin = torch.nn.ModuleDict({
            'user': torch.nn.Linear(hidden_channels, hidden_channels),
            'news': torch.nn.Linear(hidden_channels, out_channels)
        })

    def forward(self, x_dict, edge_index_dict, edge_weight_dict=None):
        x_user = self.conv['follows'](
            x_dict['user'],
            edge_index_dict[('user', 'follows', 'user')],
            edge_weight=edge_weight_dict.get(('user', 'follows', 'user'))
        )
        
        x_news = self.conv['spreads'](
            (x_dict['user'], x_dict['news']),
            edge_index_dict[('user', 'spreads', 'news')]
        )
        
        out_dict = {
            'user': F.relu(self.lin['user'](x_user)),
            'news': F.relu(self.lin['news'](x_news)),
        }
        
        return out_dict

In [15]:
import random

num_news = data['news'].num_nodes
indices = list(range(num_news))
random.shuffle(indices)

train_size = int(0.7 * num_news)
val_size = int(0.15 * num_news)
test_size = num_news - train_size - val_size

train_id = indices[:train_size]
val_id = indices[train_size:train_size + val_size]
test_id = indices[train_size + val_size:]

train_mask = torch.zeros(num_news, dtype=torch.bool)
val_mask = torch.zeros(num_news, dtype=torch.bool)
test_mask = torch.zeros(num_news, dtype=torch.bool)

train_mask[train_id] = True
val_mask[val_id] = True
test_mask[test_id] = True

data['news'].train_mask = train_mask
data['news'].val_mask = val_mask
data['news'].test_mask = test_mask

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HeteroGCN(hidden_channels=64, out_channels=2).to(device)
data = data.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

In [17]:

for epoch in range(1, 101): 
    model.train()
    optimizer.zero_grad()
    
    out_dict = model(
        data.x_dict,
        data.edge_index_dict,
        edge_weight_dict={
            ('user', 'spreads', 'news'): data['user', 'spreads', 'news'].edge_attr.squeeze()
        }
    )

    out = out_dict['news']
    loss = criterion(out[data['news'].train_mask], data['news'].y[data['news'].train_mask])
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d}, Loss: {loss.item():.4f}")

Epoch 001, Loss: 0.6964
Epoch 010, Loss: 0.3086
Epoch 020, Loss: 0.0249
Epoch 030, Loss: 0.0022
Epoch 040, Loss: 0.0005
Epoch 050, Loss: 0.0003
Epoch 060, Loss: 0.0002
Epoch 070, Loss: 0.0002
Epoch 080, Loss: 0.0002
Epoch 090, Loss: 0.0002
Epoch 100, Loss: 0.0001


In [18]:
model.eval()
out_dict = model(data.x_dict, data.edge_index_dict, {
    ('user', 'spreads', 'news'): data['user', 'spreads', 'news'].edge_attr.squeeze(),
    ('user', 'follows', 'user'): None
})

pred = out_dict['news'].argmax(dim=1)

y_pred = pred[data['news'].test_mask].cpu().numpy()
y_true = data['news'].y[data['news'].test_mask].cpu().numpy()

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=["Real", "Fake"]))


Classification Report:
              precision    recall  f1-score   support

        Real       0.83      0.67      0.74        15
        Fake       0.69      0.85      0.76        13

    accuracy                           0.75        28
   macro avg       0.76      0.76      0.75        28
weighted avg       0.77      0.75      0.75        28



In [20]:
from sklearn.ensemble import RandomForestClassifier

model.eval()
with torch.no_grad():
    out_dict = model(data.x_dict, data.edge_index_dict, {
    ('user', 'spreads', 'news'): data['user', 'spreads', 'news'].edge_attr.squeeze(),
    ('user', 'follows', 'user'): None
})
    news_embeddings = out_dict['news'].cpu().numpy()

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(news_embeddings[train_id], data['news'].y[train_id])

y_pred = rf.predict(news_embeddings[test_id])
y_true = data['news'].y[test_id].cpu().numpy()

print("\nClassification Report:\n", classification_report(y_true, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.60      0.69        15
           1       0.65      0.85      0.73        13

    accuracy                           0.71        28
   macro avg       0.73      0.72      0.71        28
weighted avg       0.74      0.71      0.71        28

