In [32]:
# Install extra packages into the Colab runtime before anything else runs.
# NOTE(review): gdown is not referenced in the visible cells, and the later
# `import zipfile` is the standard-library module rather than zipfile36 —
# confirm both installs are still needed.
!pip install gdown
!pip install zipfile36



In [33]:
# Mount Google Drive at /content/drive; the dataset paths used by the later
# cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
import json
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [34]:
# Load and extract the zip archive
zip_file_path = '/content/drive/MyDrive/social_network/TwiBot-20/Twibot-20.zip'
extract_dir = '/content/drive/MyDrive/social_network/MyTwiBot'

# Re-extracting the whole archive into Drive on every run is slow and only
# rewrites the same files, so extract only when the split files that the
# notebook reads next are not all present yet.
_expected_split_files = [
    os.path.join(extract_dir, 'Twibot-20', split_file)
    for split_file in ('train.json', 'dev.json', 'test.json')
]
if not all(os.path.isfile(path) for path in _expected_split_files):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

In [38]:
# Load the JSON files
# extract_dir is re-pointed from the extraction target to the 'Twibot-20'
# folder beneath it, which is where the split files are read from below.
extract_dir = '/content/drive/MyDrive/social_network/MyTwiBot/Twibot-20'
def load_json(file_path):
    """Read a JSON file and return the parsed Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it open() falls back to the locale's
    # preferred encoding, which can mis-decode or fail on non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Read the three split files from extract_dir, in train / dev / test order.
train_data, dev_data, test_data = [
    load_json(os.path.join(extract_dir, split_file))
    for split_file in ('train.json', 'dev.json', 'test.json')
]

# One combined collection holding every record, train first and test last.
all_data = train_data + dev_data + test_data



In [40]:

# Feature Extraction
_PROFILE_FEATURE_COLUMNS = [
    'followers_count',
    'friends_count',
    'statuses_count',
    'verified',
    'account_age_days',
]


def _to_int(value):
    """Convert a raw count (int or numeric string, possibly space-padded) to int; None/blank become 0."""
    if value is None:
        return 0
    if isinstance(value, str):
        value = value.strip()
        if not value:
            return 0
    return int(value)


def _is_verified(value):
    """Return 1 for a true boolean or the text 'True' (surrounding spaces ignored), otherwise 0."""
    return int(str(value).strip() == 'True')


def extract_features(data, reference_time=None):
    """Build a table of numeric profile features, one row per user.

    Args:
        data: Iterable of user dicts. Each may hold a ``'profile'`` dict with
            ``followers_count``, ``friends_count``, ``statuses_count``,
            ``verified`` and ``created_at`` entries. A missing or null profile
            is treated as an empty one.
        reference_time: Moment against which account age is measured. Defaults
            to the current UTC time, as before; pass a fixed timestamp to make
            ``account_age_days`` reproducible. A naive value is taken as UTC.

    Returns:
        pandas.DataFrame with the columns ``followers_count``,
        ``friends_count``, ``statuses_count``, ``verified`` (0/1) and
        ``account_age_days``. The columns are present even for empty input.
        ``account_age_days`` is NaN when ``created_at`` cannot be parsed.

    Raises:
        ValueError: If a count is a non-blank value that ``int`` cannot parse.
    """
    # Resolve "now" once so every row is measured against the same instant.
    if reference_time is None:
        reference_time = pd.Timestamp.now(tz='UTC')
    else:
        reference_time = pd.Timestamp(reference_time)
        if reference_time.tzinfo is None:
            reference_time = reference_time.tz_localize('UTC')

    features = []
    for user in data:
        # `or {}` also covers a profile key that is present but null.
        profile = user.get('profile') or {}

        # A missing/null/empty creation date falls back to the epoch default;
        # pd.to_datetime(None) would return None and break the tz handling.
        created_at = pd.to_datetime(profile.get('created_at') or '1970-01-01', errors='coerce')
        if pd.isna(created_at):
            # Unparseable date: keep the value missing instead of inventing an age.
            account_age_days = float('nan')
        else:
            if created_at.tzinfo is None:
                created_at = created_at.tz_localize('UTC')
            account_age_days = (reference_time - created_at).days

        features.append({
            'followers_count': _to_int(profile.get('followers_count', 0)),
            'friends_count': _to_int(profile.get('friends_count', 0)),
            'statuses_count': _to_int(profile.get('statuses_count', 0)),
            # Accepts both the text form ('True ') and a real boolean.
            'verified': _is_verified(profile.get('verified', 'False')),
            'account_age_days': account_age_days,
        })
    return pd.DataFrame(features, columns=_PROFILE_FEATURE_COLUMNS)

# Profile-feature table for all_data: extract_features appends one record per
# user it iterates over, so rows follow the order of all_data.
node_features = extract_features(all_data)

# Extract TF-IDF features from tweets
def _join_user_tweets(user):
    """Collapse one user's tweets into a single document string ('' when there are none)."""
    user_tweets = user.get('tweet') or []
    if isinstance(user_tweets, str):
        # A bare string would otherwise be joined character by character.
        user_tweets = [user_tweets]
    return " ".join(str(tweet) for tweet in user_tweets if tweet is not None)


def extract_tweet_features(data, vectorizer=None, fit=True, max_features=100):
    """Turn each user's tweets into one row of a TF-IDF matrix.

    Args:
        data: Iterable of user dicts; the ``'tweet'`` entry holds that user's
            tweets. A missing, null or empty entry yields an empty document.
        vectorizer: Optional object exposing ``fit_transform`` / ``transform``.
            Supplying one lets the caller fit on training users only and then
            reuse the same fitted vectorizer for held-out users. When omitted,
            a new ``TfidfVectorizer(max_features=max_features)`` is created.
        fit: When True (default) call ``fit_transform``; when False call
            ``transform`` on the supplied, already fitted vectorizer.
        max_features: Vocabulary cap used only when a vectorizer is created
            here. The default of 100 matches the previous hard-coded value.

    Returns:
        The matrix returned by the vectorizer, one row per user in ``data``.

    Raises:
        ValueError: If ``fit`` is False and no vectorizer was supplied.
    """
    if vectorizer is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted vectorizer")
        vectorizer = TfidfVectorizer(max_features=max_features)

    # Combine all tweets of a user into a single string
    documents = [_join_user_tweets(user) for user in data]

    if fit:
        return vectorizer.fit_transform(documents)
    return vectorizer.transform(documents)

# TF-IDF matrix built from every user's joined tweets in all_data.
# NOTE(review): the vectorizer is therefore fit on rows that later land in the
# test split produced by train_test_split — confirm this is acceptable.
tfidf_matrix = extract_tweet_features(all_data)

# Both feature sources must have one row per user before they are joined
n_profile_rows = node_features.shape[0]
n_tfidf_rows = tfidf_matrix.shape[0]
assert n_profile_rows == n_tfidf_rows, "Mismatch in the number of samples between node features and TF-IDF features"

# Densify the TF-IDF matrix and place its columns to the right of the profile columns
dense_tfidf = tfidf_matrix.toarray()
node_features = np.hstack((node_features, dense_tfidf))

# Map each user's raw label to an integer class id
le = LabelEncoder()
labels = le.fit_transform([user['label'] for user in all_data])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): all_data merges the train/dev/test files and is re-split at
# random here without stratification — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    node_features,
    labels,
    test_size=0.3,
    random_state=42,
)

# Depth- and split-limited random forest; fit() returns the estimator itself,
# so rf_model is the trained classifier.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    random_state=42,
).fit(X_train, y_train)

# Score the fitted model on the rows it was trained on and on the held-out rows
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Per-class metrics as dicts so they can be rendered as tables
train_report = classification_report(y_train, y_train_pred, output_dict=True)
test_report = classification_report(y_test, y_test_pred, output_dict=True)

# Print each report under its heading, training set first
for heading, report in (
    ("Random Forest Training Set Evaluation:", train_report),
    ("Random Forest Test Set Evaluation:", test_report),
):
    print(heading)
    print(pd.DataFrame(report).T)



Random Forest Training Set Evaluation:
              precision    recall  f1-score      support
0              0.960445  0.806036  0.876492  3645.000000
1              0.864533  0.973883  0.915956  4633.000000
accuracy       0.899976  0.899976  0.899976     0.899976
macro avg      0.912489  0.889959  0.896224  8278.000000
weighted avg   0.906765  0.899976  0.898579  8278.000000
Random Forest Test Set Evaluation:
              precision    recall  f1-score      support
0              0.911719  0.733040  0.812674  1592.000000
1              0.812610  0.942229  0.872633  1956.000000
accuracy       0.848365  0.848365  0.848365     0.848365
macro avg      0.862164  0.837635  0.842653  3548.000000
weighted avg   0.857081  0.848365  0.845729  3548.000000
