In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.1 MB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m0.7/1.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m11.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


In [3]:
import json
import os
import zipfile
import pandas as pd
import numpy as np
import networkx as nx
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Step 1: Load and extract the zip archive
zip_file_path = '/content/drive/MyDrive/social_network/TwiBot-20/Twibot-20.zip'
extract_dir = '/content/drive/MyDrive/social_network/MyTwiBot'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [4]:
# Step 2: Load the JSON files
extract_dir = '/content/drive/MyDrive/social_network/MyTwiBot/Twibot-20'
def load_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

train_data = load_json(os.path.join(extract_dir, 'train.json'))
dev_data = load_json(os.path.join(extract_dir, 'dev.json'))
test_data = load_json(os.path.join(extract_dir, 'test.json'))
all_data = train_data + dev_data + test_data


In [8]:
# Step 3: Feature Extraction
def extract_features(data):
    features = []
    for user in data:
        profile = user.get('profile', {})
        created_at = pd.to_datetime(profile.get('created_at', '1970-01-01'), errors='coerce')
        if created_at.tzinfo is None:
            created_at = created_at.tz_localize('UTC')
        profile_features = {
            'followers_count': int(profile.get('followers_count', 0)),
            'friends_count': int(profile.get('friends_count', 0)),
            'statuses_count': int(profile.get('statuses_count', 0)),
            'verified': int(profile.get('verified', 'False').strip() == 'True'),
            'account_age_days': (pd.Timestamp.now(tz='UTC') - created_at).days
        }
        features.append(profile_features)
    return pd.DataFrame(features)

node_features = extract_features(all_data)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 11826 and the array at index 1 has size 1999788

In [9]:
# Extract TF-IDF features from tweets
def extract_tweet_features(data):
    tweets = []
    for user in data:
        user_tweets = user.get('tweet', [])
        if user_tweets is None:
            user_tweets = [""]  # Add a blank string if there are no tweets
        tweets.append(" ".join(user_tweets))  # Combine all tweets of a user into a single string
    vectorizer = TfidfVectorizer(max_features=100)
    tfidf_matrix = vectorizer.fit_transform(tweets)
    return tfidf_matrix

tfidf_matrix = extract_tweet_features(all_data)

# Ensure the number of samples matches
assert node_features.shape[0] == tfidf_matrix.shape[0], "Mismatch in the number of samples between node features and TF-IDF features"

# Combine all features
node_features = np.hstack([node_features, tfidf_matrix.toarray()])

# Encode labels
labels = [user['label'] for user in all_data]
le = LabelEncoder()
labels = le.fit_transform(labels)

In [10]:
# Step 4: Create Graph
G = nx.Graph()
for i, user in enumerate(all_data):
    G.add_node(i)
    neighbors = user.get('neighbor', {})
    if neighbors:
        followers = neighbors.get('follower', [])
        if followers:
            for follower in followers:
                follower_index = next((index for index, usr in enumerate(all_data) if usr['ID'] == follower), None)
                if follower_index is not None:
                    G.add_edge(i, follower_index)

# Convert to PyTorch Geometric data
edge_index = torch.tensor(list(G.edges)).t().contiguous()
x = torch.tensor(node_features, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)


In [12]:
# Create masks for train, validation, and test sets
num_nodes = x.size(0)
train_size = len(train_data)
val_size = len(dev_data)
test_size = len(test_data)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[:train_size] = True
val_mask[train_size:train_size + val_size] = True
test_mask[train_size + val_size:] = True

data = Data(x=x, edge_index=edge_index, y=y)
data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

In [23]:
from sklearn.metrics import classification_report, accuracy_score

# Step 5: GNN Model
class GNN(torch.nn.Module):
    def __init__(self):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(data.num_features, 64)
        self.conv2 = GCNConv(64, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

model = GNN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate(mask):
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out[mask].max(dim=1)[1]
        y_true = data.y[mask]
        y_pred = pred
        acc = accuracy_score(y_true.cpu(), y_pred.cpu())
        report = classification_report(y_true.cpu(), y_pred.cpu(), output_dict=True)
    return acc, report

for epoch in range(1000):
    loss = train()
    train_acc, train_report = evaluate(data.train_mask)
    test_acc, test_report = evaluate(data.test_mask)
    print(f'Epoch: {epoch}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

# Final evaluation
train_acc, train_report = evaluate(data.train_mask)
test_acc, test_report = evaluate(data.test_mask)

print("Training Set Evaluation:")
print(pd.DataFrame(train_report).transpose())

print("Test Set Evaluation:")
print(pd.DataFrame(test_report).transpose())

Epoch: 0, Loss: 80020.7500, Train Acc: 0.4435, Test Acc: 0.4210
Epoch: 1, Loss: 79280.8125, Train Acc: 0.4424, Test Acc: 0.4227
Epoch: 2, Loss: 78541.1875, Train Acc: 0.4410, Test Acc: 0.4235
Epoch: 3, Loss: 77801.9219, Train Acc: 0.4404, Test Acc: 0.4235
Epoch: 4, Loss: 77062.9375, Train Acc: 0.4394, Test Acc: 0.4218
Epoch: 5, Loss: 76324.2344, Train Acc: 0.4385, Test Acc: 0.4235
Epoch: 6, Loss: 75585.8047, Train Acc: 0.4373, Test Acc: 0.4218
Epoch: 7, Loss: 74847.6328, Train Acc: 0.4367, Test Acc: 0.4193
Epoch: 8, Loss: 74109.7188, Train Acc: 0.4366, Test Acc: 0.4176
Epoch: 9, Loss: 73372.1250, Train Acc: 0.4340, Test Acc: 0.4150
Epoch: 10, Loss: 72634.8203, Train Acc: 0.4339, Test Acc: 0.4142
Epoch: 11, Loss: 71897.8125, Train Acc: 0.4334, Test Acc: 0.4091
Epoch: 12, Loss: 71161.1094, Train Acc: 0.4325, Test Acc: 0.4074
Epoch: 13, Loss: 70424.6797, Train Acc: 0.4309, Test Acc: 0.4057
Epoch: 14, Loss: 69688.5156, Train Acc: 0.4303, Test Acc: 0.4024
Epoch: 15, Loss: 68952.7031, Train 