In [None]:
pip install torch-geometric




In [None]:
import json
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob
import networkx as nx
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from sklearn.metrics import classification_report, accuracy_score

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load and extract the zip archive
zip_file_path = '/content/drive/MyDrive/Twibot-20.zip'
extract_dir = '/content/drive/MyDrive/social_network/MyTwiBot/Twibot-20'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [None]:
extract_dir = '/content/drive/MyDrive/social_network/MyTwiBot/Twibot-20'
def load_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

train_data = load_json(os.path.join(extract_dir, 'train.json'))
dev_data = load_json(os.path.join(extract_dir, 'dev.json'))
test_data = load_json(os.path.join(extract_dir, 'test.json'))
all_data = train_data + dev_data + test_data


In [None]:
pip install prettytable




In [None]:

# Feature Extraction
def extract_features(data):
    features = []
    for user in data:
        profile = user.get('profile', {})
        created_at = pd.to_datetime(profile.get('created_at', '1970-01-01'), errors='coerce')
        if created_at.tzinfo is None:
            created_at = created_at.tz_localize('UTC')
        tweets = user.get('tweet', [])
        if tweets is None:
            tweets = []
        tweet_text = " ".join(tweets) if tweets else ""
        avg_sentiment = TextBlob(tweet_text).sentiment.polarity if tweets else 0
        profile_features = {
            'followers_count': int(profile.get('followers_count', 0)),
            'friends_count': int(profile.get('friends_count', 0)),
            'statuses_count': int(profile.get('statuses_count', 0)),
            'verified': int(profile.get('verified', 'False').strip() == 'True'),
            'account_age_days': (pd.Timestamp.now(tz='UTC') - created_at).days,
            'description_length': len(profile.get('description', "")),
            'has_url': int(profile.get('url', 'None').strip() != 'None'),
            'has_profile_image': int(profile.get('profile_image_url', 'None').strip() != 'None'),
            'has_background_image': int(profile.get('profile_background_image_url', 'None').strip() != 'None'),
            'tweet_count': len(tweets),
            'avg_tweet_length': np.mean([len(twt) for twt in tweets]) if tweets else 0,
            'avg_sentiment': avg_sentiment
        }
        features.append(profile_features)
    return pd.DataFrame(features)

node_features = extract_features(all_data)

In [None]:
# Extract TF-IDF features from tweets
def extract_tweet_features(data):
    tweets = []
    for user in data:
        user_tweets = user.get('tweet', [])
        if user_tweets is None:
            user_tweets = [""]  # Add a blank string if there are no tweets
        tweets.append(" ".join(user_tweets))  # Combine all tweets of a user into a single string
    vectorizer = TfidfVectorizer(max_features=100)
    tfidf_matrix = vectorizer.fit_transform(tweets)
    return tfidf_matrix

tfidf_matrix = extract_tweet_features(all_data)

# Ensure the number of samples matches
assert node_features.shape[0] == tfidf_matrix.shape[0], "Mismatch in the number of samples between node features and TF-IDF features"

# Combine all features
node_features = np.hstack([node_features, tfidf_matrix.toarray()])

# Encode labels
labels = [user['label'] for user in all_data]
le = LabelEncoder()
labels = le.fit_transform(labels)


In [None]:

# Create a graph using NetworkX
G = nx.Graph()
for i, user in enumerate(all_data):
    G.add_node(i)
    neighbors = user.get('neighbor')
    if neighbors:
        for follower in neighbors.get('follower', []):
            follower_index = next((index for index, usr in enumerate(all_data) if usr['ID'] == follower), None)
            if follower_index is not None:
                G.add_edge(i, follower_index)

# Convert NetworkX graph to PyTorch Geometric data
edge_index = torch.tensor(list(G.edges)).t().contiguous()
x = torch.tensor(node_features, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)

data = Data(x=x, edge_index=edge_index, y=y)

# Split data into train, val, and test sets
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:len(train_data)] = True
val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
val_mask[len(train_data):len(train_data) + len(dev_data)] = True
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask[len(train_data) + len(dev_data):] = True

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask


In [None]:
# GNN Model Definition
class GCN(torch.nn.Module):
    def __init__(self):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(data.num_features, 64)
        self.conv2 = GCNConv(64, 32)
        self.conv3 = GCNConv(32, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        return F.log_softmax(x, dim=1)

# Train and Evaluate GNN Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test():
    model.eval()
    out = model(data)
    pred = out.argmax(dim=1)
    train_correct = pred[data.train_mask] == data.y[data.train_mask]
    val_correct = pred[data.val_mask] == data.y[data.val_mask]
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    train_acc = int(train_correct.sum()) / int(data.train_mask.sum())
    val_acc = int(val_correct.sum()) / int(data.val_mask.sum())
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
    return train_acc, val_acc, test_acc

for epoch in range(1000):
    loss = train()
    train_acc, val_acc, test_acc = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}')



Epoch: 000, Loss: 35193.7344, Train Acc: 0.4859, Val Acc: 0.5087, Test Acc: 0.4945
Epoch: 001, Loss: 33602.6094, Train Acc: 0.4847, Val Acc: 0.5061, Test Acc: 0.4954
Epoch: 002, Loss: 32011.4160, Train Acc: 0.4803, Val Acc: 0.5036, Test Acc: 0.4970
Epoch: 003, Loss: 30420.1562, Train Acc: 0.4790, Val Acc: 0.5011, Test Acc: 0.4996
Epoch: 004, Loss: 28829.1758, Train Acc: 0.4772, Val Acc: 0.4989, Test Acc: 0.4970
Epoch: 005, Loss: 27238.3242, Train Acc: 0.4755, Val Acc: 0.4981, Test Acc: 0.4937
Epoch: 006, Loss: 25647.5098, Train Acc: 0.4750, Val Acc: 0.4934, Test Acc: 0.4911
Epoch: 007, Loss: 24057.1758, Train Acc: 0.4719, Val Acc: 0.4884, Test Acc: 0.4869
Epoch: 008, Loss: 22467.4727, Train Acc: 0.4686, Val Acc: 0.4871, Test Acc: 0.4844
Epoch: 009, Loss: 20878.1699, Train Acc: 0.4661, Val Acc: 0.4850, Test Acc: 0.4835
Epoch: 010, Loss: 19289.0352, Train Acc: 0.4615, Val Acc: 0.4841, Test Acc: 0.4801
Epoch: 011, Loss: 17700.5938, Train Acc: 0.4586, Val Acc: 0.4816, Test Acc: 0.4801
Epoc

In [None]:
# Final evaluation
def final_evaluation():
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out.argmax(dim=1)
        train_correct = pred[data.train_mask] == data.y[data.train_mask]
        test_correct = pred[data.test_mask] == data.y[data.test_mask]
        train_acc = int(train_correct.sum()) / int(data.train_mask.sum())
        test_acc = int(test_correct.sum()) / int(data.test_mask.sum())

        y_train_true = data.y[data.train_mask].cpu()
        y_train_pred = pred[data.train_mask].cpu()
        y_test_true = data.y[data.test_mask].cpu()
        y_test_pred = pred[data.test_mask].cpu()

        train_report = classification_report(y_train_true, y_train_pred, output_dict=True)
        test_report = classification_report(y_test_true, y_test_pred, output_dict=True)

        return train_acc, test_acc, train_report, test_report

train_acc, test_acc, train_report, test_report = final_evaluation()

# Print evaluation metrics
print("Training Set Evaluation:")
print(pd.DataFrame(train_report).transpose())

print(" ")

print("Test Set Evaluation:")
print(pd.DataFrame(test_report).transpose())

Training Set Evaluation:
              precision    recall  f1-score      support
0              0.568166  0.671256  0.615423  3632.000000
1              0.700527  0.601162  0.647052  4646.000000
accuracy       0.631916  0.631916  0.631916     0.631916
macro avg      0.634346  0.636209  0.631238  8278.000000
weighted avg   0.642453  0.631916  0.633175  8278.000000
Test Set Evaluation:
              precision    recall  f1-score      support
0              0.593701  0.694291  0.640068   543.000000
1              0.697080  0.596875  0.643098   640.000000
accuracy       0.641589  0.641589  0.641589     0.641589
macro avg      0.645391  0.645583  0.641583  1183.000000
weighted avg   0.649629  0.641589  0.641707  1183.000000
