<a href="https://colab.research.google.com/github/Akankshaaaa-01/Assignment-Submission-Portal/blob/main/Graph_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install basic libraries
!pip install pandas scikit-learn matplotlib seaborn

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




In [4]:
# Load JSON files
with open('train.json', 'r') as f:
    train_data = json.load(f)

with open('test.json', 'r') as f:
    test_data = json.load(f)

with open('dev.json', 'r') as f:
    dev_data = json.load(f)

print(f"Train data: {len(train_data)} users")
print(f"Test data: {len(test_data)} users")
print(f"Dev data: {len(dev_data)} users")

# Pehle dekho data list hai ya dict
print(f"\nData type: {type(train_data)}")

# Ek sample dekho kaise dikhta hai
sample_user = train_data[0]  # Pehla user
print(f"\nSample user data:")
print(json.dumps(sample_user, indent=2)[:800])  # First 800 chars

Train data: 8278 users
Test data: 1183 users
Dev data: 2365 users

Data type: <class 'list'>

Sample user data:
{
  "ID": "17461978",
  "profile": {
    "id": "17461978 ",
    "id_str": "17461978 ",
    "name": "SHAQ ",
    "screen_name": "SHAQ ",
    "location": "Orlando, FL ",
    "profile_location": "{'id': '55b4f9e5c516e0b6', 'url': 'https://api.twitter.com/1.1/geo/id/55b4f9e5c516e0b6.json', 'place_type': 'unknown', 'name': 'Orlando, FL', 'full_name': 'Orlando, FL', 'country_code': '', 'country': '', 'contained_within': [], 'bounding_box': None, 'attributes': {}} ",
    "description": "VERY QUOTATIOUS, I PERFORM RANDOM ACTS OF SHAQNESS ",
    "url": "https://t.co/7hsiK8cCKW ",
    "entities": "{'url': {'urls': [{'url': 'https://t.co/7hsiK8cCKW', 'expanded_url': 'http://www.ShaqFuRadio.com', 'display_url': 'ShaqFuRadio.com', 'indices': [0, 23]}]}, 'description': {'urls': []}} ",
    "protected": 


In [6]:
def extract_simple_features(user_data):
    """
    TwitBot-20 format ke liye features (with type conversion)
    """
    features = []

    # Profile data nikalo
    profile = user_data.get('profile', {})

    # Helper function to safely convert to int
    def safe_int(value, default=0):
        try:
            if isinstance(value, str):
                value = value.strip()
            return int(value)
        except:
            return default

    # Basic profile features (convert to int)
    followers_count = safe_int(profile.get('followers_count', 0))
    friends_count = safe_int(profile.get('friends_count', 0))
    statuses_count = safe_int(profile.get('statuses_count', 0))
    favourites_count = safe_int(profile.get('favourites_count', 0))
    listed_count = safe_int(profile.get('listed_count', 0))

    features.append(followers_count)
    features.append(friends_count)
    features.append(statuses_count)
    features.append(favourites_count)
    features.append(listed_count)

    # Account features (boolean to int)
    verified = profile.get('verified', False)
    features.append(1 if (verified == True or verified == 'True' or verified == 'true') else 0)

    default_profile = profile.get('default_profile', False)
    features.append(1 if (default_profile == True or default_profile == 'True' or default_profile == 'true') else 0)

    default_profile_image = profile.get('default_profile_image', False)
    features.append(1 if (default_profile_image == True or default_profile_image == 'True' or default_profile_image == 'true') else 0)

    protected = profile.get('protected', False)
    features.append(1 if (protected == True or protected == 'True' or protected == 'true') else 0)

    # Text features
    description = str(profile.get('description', ''))
    name = str(profile.get('name', ''))
    screen_name = str(profile.get('screen_name', ''))

    features.append(len(description))  # Description length
    features.append(len(name))  # Name length
    features.append(len(screen_name))  # Screen name length

    # Check if URL exists
    url = profile.get('url', '')
    features.append(1 if url and url != 'None' and url.strip() != '' else 0)

    # Ratios (with safe division)
    # Following/Follower ratio
    if followers_count > 0:
        features.append(friends_count / followers_count)
    else:
        features.append(friends_count)  # Agar followers 0 hai toh directly friends count

    # Tweets per follower
    if followers_count > 0:
        features.append(statuses_count / followers_count)
    else:
        features.append(0)

    # Follower/Friend ratio
    if friends_count > 0:
        features.append(followers_count / friends_count)
    else:
        features.append(followers_count)

    return features

# Test karo
print("Testing feature extraction...")
sample_features = extract_simple_features(train_data[0])
print(f"Sample features: {sample_features}")
print(f"Total features: {len(sample_features)}")
print("\nFeature names:")
feature_names = [
    'followers_count', 'friends_count', 'statuses_count', 'favourites_count',
    'listed_count', 'verified', 'default_profile', 'default_profile_image',
    'protected', 'description_length', 'name_length', 'screen_name_length',
    'has_url', 'friends_followers_ratio', 'tweets_per_follower', 'followers_friends_ratio'
]
for name, value in zip(feature_names, sample_features):
    print(f"  {name}: {value}")

Testing feature extraction...
Sample features: [15349596, 692, 9798, 142, 45568, 0, 0, 0, 0, 51, 5, 5, 1, 4.508261976406415e-05, 0.0006383229890871395, 22181.49710982659]
Total features: 16

Feature names:
  followers_count: 15349596
  friends_count: 692
  statuses_count: 9798
  favourites_count: 142
  listed_count: 45568
  verified: 0
  default_profile: 0
  default_profile_image: 0
  protected: 0
  description_length: 51
  name_length: 5
  screen_name_length: 5
  has_url: 1
  friends_followers_ratio: 4.508261976406415e-05
  tweets_per_follower: 0.0006383229890871395
  followers_friends_ratio: 22181.49710982659


In [11]:
def create_dataset(data_list):
    """
    TwitBot-20 list format se features aur labels banao
    """
    X = []
    y = []
    user_ids = []
    skipped = 0

    for user_info in data_list:
        try:
            features = extract_simple_features(user_info)

            # Label ko int mein convert karo
            label = user_info.get('label', 0)
            label = int(label)  # String se int

            X.append(features)
            y.append(label)  # bot=1, human=0

            # User ID
            user_id = user_info.get('ID', '')
            user_ids.append(user_id)

        except Exception as e:
            skipped += 1
            print(f"Error: {e}")
            continue

    if skipped > 0:
        print(f"⚠️ Skipped {skipped} users due to errors")

    return np.array(X), np.array(y, dtype=int), user_ids  # dtype=int important!

# Create datasets
print("Creating train dataset...")
X_train, y_train, train_ids = create_dataset(train_data)

print("\nCreating test dataset...")
X_test, y_test, test_ids = create_dataset(test_data)

print("\nCreating dev dataset...")
X_dev, y_dev, dev_ids = create_dataset(dev_data)

print(f"\n{'='*60}")
print(f"📊 DATASET SUMMARY")
print(f"{'='*60}")
print(f"Train: {X_train.shape}")
print(f"  🤖 Bots: {np.sum(y_train)} ({np.sum(y_train)/len(y_train)*100:.1f}%)")
print(f"  👤 Humans: {len(y_train) - np.sum(y_train)} ({(len(y_train)-np.sum(y_train))/len(y_train)*100:.1f}%)")

print(f"\nTest: {X_test.shape}")
print(f"  🤖 Bots: {np.sum(y_test)} ({np.sum(y_test)/len(y_test)*100:.1f}%)")
print(f"  👤 Humans: {len(y_test) - np.sum(y_test)} ({(len(y_test)-np.sum(y_test))/len(y_test)*100:.1f}%)")

print(f"\nDev: {X_dev.shape}")
print(f"  🤖 Bots: {np.sum(y_dev)} ({np.sum(y_dev)/len(y_dev)*100:.1f}%)")
print(f"  👤 Humans: {len(y_dev) - np.sum(y_dev)} ({(len(y_dev)-np.sum(y_dev))/len(y_dev)*100:.1f}%)")
print(f"{'='*60}")

Creating train dataset...

Creating test dataset...

Creating dev dataset...

📊 DATASET SUMMARY
Train: (8278, 16)
  🤖 Bots: 4646 (56.1%)
  👤 Humans: 3632 (43.9%)

Test: (1183, 16)
  🤖 Bots: 640 (54.1%)
  👤 Humans: 543 (45.9%)

Dev: (2365, 16)
  🤖 Bots: 1303 (55.1%)
  👤 Humans: 1062 (44.9%)


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Train Random Forest
print("Training Random Forest Model...")
print("="*60)

rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    max_depth=20
)
rf_model.fit(X_train, y_train)

# Predictions
print("Making predictions...")
y_pred_train = rf_model.predict(X_train)
y_pred_test = rf_model.predict(X_test)
y_pred_dev = rf_model.predict(X_dev)

# Results
print(f"\n{'='*60}")
print(f"🎯 RANDOM FOREST RESULTS")
print(f"{'='*60}")
print(f"Train Accuracy: {accuracy_score(y_train, y_pred_train):.4f} ({accuracy_score(y_train, y_pred_train)*100:.2f}%)")
print(f"Dev Accuracy:   {accuracy_score(y_dev, y_pred_dev):.4f} ({accuracy_score(y_dev, y_pred_dev)*100:.2f}%)")
print(f"Test Accuracy:  {accuracy_score(y_test, y_pred_test):.4f} ({accuracy_score(y_test, y_pred_test)*100:.2f}%)")
print(f"{'='*60}")

# Detailed metrics for test set
print("\n📊 Test Set Detailed Metrics:")
print("-"*60)
print(f"Precision: {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred_test):.4f}")
print(f"F1-Score:  {f1_score(y_test, y_pred_test):.4f}")

print("\n📋 Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test, target_names=['Human', 'Bot']))

Training Random Forest Model...
Making predictions...

🎯 RANDOM FOREST RESULTS
Train Accuracy: 0.9967 (99.67%)
Dev Accuracy:   0.7455 (74.55%)
Test Accuracy:  0.7608 (76.08%)

📊 Test Set Detailed Metrics:
------------------------------------------------------------
Precision: 0.7339
Recall:    0.8750
F1-Score:  0.7983

📋 Classification Report (Test Set):
              precision    recall  f1-score   support

       Human       0.81      0.63      0.71       543
         Bot       0.73      0.88      0.80       640

    accuracy                           0.76      1183
   macro avg       0.77      0.75      0.75      1183
weighted avg       0.77      0.76      0.76      1183

