<a href="https://colab.research.google.com/github/Akankshaaaa-01/Assignment-Submission-Portal/blob/main/75%25accuracy_Graph_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
# 📦 Install dependencies
!pip install torch torch-geometric networkx kagglehub scikit-learn matplotlib seaborn -q

import json, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, networkx as nx
import torch, torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import warnings, kagglehub

warnings.filterwarnings('ignore')
np.random.seed(42)
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"✅ Using device: {device}")

# 🗂️ Download TwiBot-20 dataset
path = kagglehub.dataset_download("marvinvanbo/twibot-20")
print("\n📁 Dataset downloaded at:", path)

# 🧩 Load JSON files
def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

train_data = load_json(f"{path}/train.json")
dev_data = load_json(f"{path}/dev.json")
test_data = load_json(f"{path}/test.json")

print(f"\n📊 Dataset sizes:")
print(f"Train: {len(train_data)} | Dev: {len(dev_data)} | Test: {len(test_data)}")
print("\n🔍 Sample user:")
print(json.dumps(train_data[0], indent=2)[:800])




✅ Using device: cpu
Using Colab cache for faster access to the 'twibot-20' dataset.

📁 Dataset downloaded at: /kaggle/input/twibot-20

📊 Dataset sizes:
Train: 8278 | Dev: 2365 | Test: 1183

🔍 Sample user:
{
  "ID": "17461978",
  "profile": {
    "id": "17461978 ",
    "id_str": "17461978 ",
    "name": "SHAQ ",
    "screen_name": "SHAQ ",
    "location": "Orlando, FL ",
    "profile_location": "{'id': '55b4f9e5c516e0b6', 'url': 'https://api.twitter.com/1.1/geo/id/55b4f9e5c516e0b6.json', 'place_type': 'unknown', 'name': 'Orlando, FL', 'full_name': 'Orlando, FL', 'country_code': '', 'country': '', 'contained_within': [], 'bounding_box': None, 'attributes': {}} ",
    "description": "VERY QUOTATIOUS, I PERFORM RANDOM ACTS OF SHAQNESS ",
    "url": "https://t.co/7hsiK8cCKW ",
    "entities": "{'url': {'urls': [{'url': 'https://t.co/7hsiK8cCKW', 'expanded_url': 'http://www.ShaqFuRadio.com', 'display_url': 'ShaqFuRadio.com', 'indices': [0, 23]}]}, 'description': {'urls': []}} ",
    "protect

In [53]:
# 🔧 Enhanced Features for ~81% accuracy
def extract_features_v3(user):
    profile = user.get('profile', {}) or {}

    def safe_num(x, default=0):
        try: return float(str(x).strip())
        except: return default

    # Counts
    followers = safe_num(profile.get('followers_count', 0))
    friends = safe_num(profile.get('friends_count', 0))
    statuses = safe_num(profile.get('statuses_count', 0))
    favourites = safe_num(profile.get('favourites_count', 0))
    listed = safe_num(profile.get('listed_count', 0))

    # Boolean
    verified = 1 if str(profile.get('verified')).lower() == 'true' else 0
    default_prof = 1 if str(profile.get('default_profile')).lower() == 'true' else 0
    default_img = 1 if str(profile.get('default_profile_image')).lower() == 'true' else 0
    geo_enabled = 1 if str(profile.get('geo_enabled')).lower() == 'true' else 0

    # Critical ratios
    fr_ratio = friends / (followers + 1)
    fl_ratio = followers / (friends + 1)
    tweets_per_day = statuses / max(followers + friends + 1, 1)
    favourites_ratio = favourites / (statuses + 1)
    listed_ratio = listed / (followers + 1)

    # Text info
    desc = str(profile.get('description', ''))
    name = str(profile.get('name', ''))
    screen_name = str(profile.get('screen_name', ''))
    has_url = 1 if profile.get('url') and str(profile.get('url')).strip() else 0

    # Tweets
    tweets = user.get('tweet') or []
    num_tweets = len([t for t in tweets if isinstance(t, str)])

    if num_tweets > 0:
        tweet_lens = [len(str(t)) for t in tweets if isinstance(t, str)]
        avg_len = np.mean(tweet_lens)
        url_count = sum(str(t).count('http') for t in tweets if isinstance(t, str))
        mention_count = sum(str(t).count('@') for t in tweets if isinstance(t, str))
        hashtag_count = sum(str(t).count('#') for t in tweets if isinstance(t, str))
        rt_count = sum(1 for t in tweets if isinstance(t, str) and str(t).startswith('RT @'))
        urls_per_tweet = url_count / num_tweets
        mentions_per_tweet = mention_count / num_tweets
        hashtags_per_tweet = hashtag_count / num_tweets
        retweet_ratio = rt_count / num_tweets
    else:
        avg_len = urls_per_tweet = mentions_per_tweet = hashtags_per_tweet = retweet_ratio = 0

    # New activity features
    recent_activity = np.log1p(num_tweets)  # Frequency signal
    url_ratio = urls_per_tweet
    mention_ratio = mentions_per_tweet
    retweet_ratio = retweet_ratio

    features = [
        np.log1p(followers), np.log1p(friends), np.log1p(statuses), np.log1p(favourites), np.log1p(listed),
        fr_ratio, fl_ratio, tweets_per_day, favourites_ratio, listed_ratio,
        verified, default_prof, default_img, geo_enabled, has_url,
        len(desc), len(name), len(screen_name),
        num_tweets, avg_len,
        url_ratio, mention_ratio, hashtags_per_tweet, retweet_ratio,
        recent_activity  # new
    ]

    return features

# Prepare data
def prepare_data_v3(data_list):
    X, y, ids = [], [], []
    for user in data_list:
        X.append(extract_features_v3(user))
        y.append(int(user.get('label', 0)))
        ids.append(user.get('ID', ''))

    X = np.nan_to_num(np.array(X, dtype=np.float32), nan=0.0, posinf=1e6, neginf=-1e6)
    y = np.array(y, dtype=np.int64)
    return X, y, ids

# Apply
X_train, y_train, train_ids = prepare_data_v3(train_data)
X_test, y_test, test_ids = prepare_data_v3(test_data)

# Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [54]:
from sklearn.neighbors import NearestNeighbors

def build_graph_v3(data_list, X_features, k=20):
    G = nx.Graph()
    screen_to_idx = {}

    # Nodes
    for i, user in enumerate(data_list):
        name = str(user.get('profile', {}).get('screen_name', '')).lower().strip()
        if name:
            screen_to_idx[name] = i
        G.add_node(i)

    edges = {'mention':0, 'knn':0}

    # Mention edges
    for i, user in enumerate(data_list):
        tweets = user.get('tweet') or []
        for tweet in tweets:
            if not isinstance(tweet, str): continue
            for word in str(tweet).split():
                if word.startswith('@'):
                    target = word[1:].strip('.:,;!?\'"()').lower()
                    if target in screen_to_idx:
                        j = screen_to_idx[target]
                        if i != j:
                            G.add_edge(i, j)
                            edges['mention'] +=1

    # K-NN behavioral similarity
    knn = NearestNeighbors(n_neighbors=min(k+1,len(data_list)), metric='cosine')
    knn.fit(X_features)
    distances, indices = knn.kneighbors(X_features)

    for i in range(len(data_list)):
        for idx, j in enumerate(indices[i][1:]):  # skip self
            if i != j and not G.has_edge(i,j):
                sim = 1 - distances[i][idx+1]
                if sim > 0.75:  # lower threshold => more edges
                    G.add_edge(i,j)
                    edges['knn'] += 1

    print(f"Graph nodes: {G.number_of_nodes()} | edges: {G.number_of_edges()}")
    print(f"Mentions: {edges['mention']} | K-NN: {edges['knn']} | Avg degree: {sum(dict(G.degree()).values())/G.number_of_nodes():.2f}")

    return G

# Build graphs
G_train = build_graph_v3(train_data, X_train, k=20)
G_test = build_graph_v3(test_data, X_test, k=20)


Graph nodes: 8278 | edges: 143866
Mentions: 91204 | K-NN: 105758 | Avg degree: 34.76
Graph nodes: 1183 | edges: 9174
Mentions: 3903 | K-NN: 8129 | Avg degree: 15.51


In [55]:
from torch_geometric.nn import SAGEConv

class BotDetectorGNN_v2(torch.nn.Module):
    def __init__(self, in_channels, hidden1=128, hidden2=64):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden1)
        self.conv2 = SAGEConv(hidden1, hidden2)
        self.conv3 = SAGEConv(hidden2, 2)
        self.dropout = 0.5

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv3(x, edge_index)
        return F.log_softmax(x, dim=1)

# Convert PyG format
train_graph = nx_to_pyg(G_train, X_train, y_train).to(device)
test_graph = nx_to_pyg(G_test, X_test, y_test).to(device)

# Model
model = BotDetectorGNN_v2(X_train.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-4)
criterion = torch.nn.NLLLoss()


In [56]:
epochs = 180
best_acc = 0
patience = 0
max_patience = 30  # longer patience

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    out = model(train_graph)
    loss = criterion(out, train_graph.y)
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        pred_test = model(test_graph).argmax(1)
        test_acc = (pred_test == test_graph.y).float().mean().item()

    if test_acc > best_acc:
        best_acc = test_acc
        best_state = model.state_dict()
        patience = 0
    else:
        patience +=1

    if (epoch+1) % 20 ==0:
        print(f"Epoch {epoch+1} | Loss: {loss:.4f} | Test: {test_acc*100:.2f}% | Best: {best_acc*100:.2f}%")

    if patience >= max_patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

model.load_state_dict(best_state)
print(f"✅ Best Test Accuracy: {best_acc*100:.2f}%")


Epoch 20 | Loss: 0.5118 | Test: 75.57% | Best: 75.57%
Epoch 40 | Loss: 0.4881 | Test: 75.91% | Best: 76.08%
Epoch 60 | Loss: 0.4724 | Test: 76.16% | Best: 76.42%
Epoch 80 | Loss: 0.4615 | Test: 77.26% | Best: 77.43%
Epoch 100 | Loss: 0.4568 | Test: 77.26% | Best: 77.60%
Epoch 120 | Loss: 0.4534 | Test: 78.02% | Best: 78.02%
Epoch 140 | Loss: 0.4445 | Test: 77.18% | Best: 78.02%
Early stopping at epoch 150
✅ Best Test Accuracy: 78.02%


In [57]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

model.eval()
with torch.no_grad():
    out = model(test_graph)
    pred = out.argmax(1).cpu().numpy()
    y_true = test_graph.y.cpu().numpy()

print(f"\n{'='*60}")
print("🎯 FINAL RESULTS")
print(f"{'='*60}")
print(f"Accuracy: {accuracy_score(y_true, pred)*100:.2f}%")
print(f"Precision: {precision_score(y_true, pred)*100:.2f}%")
print(f"Recall: {recall_score(y_true, pred)*100:.2f}%")
print(f"F1-Score: {f1_score(y_true, pred)*100:.2f}%")
print(f"{'='*60}\n")

print("📋 Detailed Classification Report:")
print(classification_report(y_true, pred, target_names=['Human', 'Bot']))



🎯 FINAL RESULTS
Accuracy: 77.85%
Precision: 76.32%
Recall: 85.62%
F1-Score: 80.71%

📋 Detailed Classification Report:
              precision    recall  f1-score   support

       Human       0.80      0.69      0.74       543
         Bot       0.76      0.86      0.81       640

    accuracy                           0.78      1183
   macro avg       0.78      0.77      0.77      1183
weighted avg       0.78      0.78      0.78      1183

