In [1]:
import pandas as pd
import json

# ── Load all splits ───────────────────────────────────────────────────────────
def load_json(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return the decoded object."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Decode the three dataset splits, in train / dev / test order.
train_data, dev_data, test_data = (
    load_json(split_path)
    for split_path in ("train.json", "dev.json", "test.json")
)

FileNotFoundError: [Errno 2] No such file or directory: 'train.json'

In [None]:
def parse_entries(data, split_name):
    """Flatten raw account records into one row dict per account.

    Parameters
    ----------
    data : iterable of dict
        Decoded JSON records. Each record is read for the keys ``ID``,
        ``label``, ``profile``, ``tweet``, ``neighbor`` and ``domain``.
    split_name : str
        Stored verbatim in the ``split`` field of every returned row.

    Returns
    -------
    list of dict
        One dict per record, ready to be passed to ``pd.DataFrame``.

    Raises
    ------
    ValueError
        If a label is present but cannot be converted with ``int()``.
    """
    rows = []
    for entry in data:
        # `or` also covers keys that are present but explicitly null.
        profile = entry.get('profile') or {}
        tweets = entry.get('tweet') or []

        # A record without a label used to crash the whole parse with
        # `int(None)`; keep the row and mark the label as missing instead.
        raw_label = entry.get('label')
        label = int(raw_label) if raw_label is not None else None

        row = {
            # Identity
            'ID'              : entry.get('ID'),
            'split'           : split_name,
            'label'           : label,

            # Profile features
            'screen_name'     : profile.get('screen_name', ''),
            'name'            : profile.get('name', ''),
            'description'     : profile.get('description', ''),
            'location'        : profile.get('location', ''),
            'followers_count' : profile.get('followers_count', 0),
            'friends_count'   : profile.get('friends_count', 0),
            'statuses_count'  : profile.get('statuses_count', 0),
            'favourites_count': profile.get('favourites_count', 0),
            'listed_count'    : profile.get('listed_count', 0),
            'verified'        : profile.get('verified', False),
            'created_at'      : profile.get('created_at', ''),
            'default_profile' : profile.get('default_profile', False),
            'default_profile_image': profile.get('default_profile_image', False),

            # Tweet features
            # NOTE: tweet_count counts every raw item, including the
            # non-string ones that are dropped from `tweets` below.
            'tweet_count'     : len(tweets),
            'tweets'          : [t for t in tweets if isinstance(t, str)],

            # Graph features
            'neighbors'       : entry.get('neighbor') or {},
            'domain'          : entry.get('domain', ''),
        }
        rows.append(row)
    return rows


In [None]:
# Flatten every split into row dicts (train, dev, test — in that order).
train_rows, dev_rows, test_rows = (
    parse_entries(split_data, split_name)
    for split_data, split_name in (
        (train_data, 'train'),
        (dev_data, 'dev'),
        (test_data, 'test'),
    )
)

# The dev rows are appended to the train rows; the test split stays separate.
train_df = pd.DataFrame([*train_rows, *dev_rows])
test_df = pd.DataFrame(test_rows)

# Persist both frames without the positional index.
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)


In [None]:
# Reload both frames from the CSV files on disk.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)


In [None]:
## Importing the modules
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder


In [None]:
import re
# ── Feature Engineering ───────────────────────────────────────────────────────
def _parse_tweet_cell(value):
    """Turn one ``tweets`` cell into a list of tweet strings.

    A string cell is expected to hold the ``repr`` of a list (what
    ``DataFrame.to_csv`` writes for a list column). Missing cells (NaN / None)
    and blank strings yield an empty list.
    """
    import ast  # local import so this block does not rely on another cell

    if isinstance(value, str):
        if not value.strip():
            return []
        # SECURITY: the original used eval(), which executes arbitrary code
        # found in the CSV. literal_eval only accepts Python literals and
        # raises ValueError / SyntaxError on anything else.
        value = ast.literal_eval(value)
    if isinstance(value, (list, tuple)):
        return [t for t in value if isinstance(t, str)]
    return []


def create_features3(df):
    """Return a copy of ``df`` with engineered account and tweet features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``followers_count``, ``friends_count``,
        ``statuses_count``, ``listed_count``, ``description``, ``created_at``
        and ``tweets``. ``tweets`` cells may be lists or their string ``repr``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which the four count
        columns are numeric, ``tweets`` holds lists of strings, and these
        columns are added: ``follow_ratio``, ``description_length``,
        ``account_age_days``, ``tweets_per_day``, ``retweet_ratio``,
        ``url_ratio``, ``avg_tweet_length``, ``hashtag_diversity``.

    Raises
    ------
    ValueError, SyntaxError
        If a string ``tweets`` cell is not a valid Python literal.

    Notes
    -----
    Every per-tweet ratio divides by ``len(tweets) + 1``, so accounts without
    tweets get 0 instead of a division by zero. As a consequence
    ``avg_tweet_length`` is slightly below the true mean.
    """
    df = df.copy()

    # 1. Followers / friends ratio (unparseable counts become 0).
    df['followers_count'] = pd.to_numeric(df['followers_count'], errors='coerce').fillna(0)
    df['friends_count']   = pd.to_numeric(df['friends_count'],   errors='coerce').fillna(0)
    df['follow_ratio']    = df['followers_count'] / (df['friends_count'] + 1)  # +1 avoids division by zero

    # 2. Description length (missing description counts as empty).
    df['description_length'] = df['description'].fillna('').astype(str).str.len()

    # 3. Remaining numeric counters.
    df['statuses_count'] = pd.to_numeric(df['statuses_count'], errors='coerce').fillna(0)
    df['listed_count']   = pd.to_numeric(df['listed_count'],   errors='coerce').fillna(0)

    # 4. Account age in whole days. "Now" is read once so that every row is
    #    measured against the same instant.
    now_utc = pd.Timestamp.now(tz='UTC')

    def _age_in_days(created_at):
        if pd.isnull(created_at):
            return 0
        created = pd.to_datetime(created_at, errors='coerce', utc=True)
        if pd.isnull(created):
            # Unparseable date: treat like a missing one rather than emit NaN.
            return 0
        # Clamp so a future-dated account cannot make the denominator below zero.
        return max((now_utc - created).days, 0)

    df['account_age_days'] = df['created_at'].apply(_age_in_days)

    # 5. Tweets per day (activity level).
    df['tweets_per_day'] = df['statuses_count'] / (df['account_age_days'] + 1)

    # 6. Convert the tweets column from its CSV string form back to lists.
    df['tweets'] = df['tweets'].apply(_parse_tweet_cell)

    # 7. Share of retweets.
    df['retweet_ratio'] = df['tweets'].apply(
        lambda ts: sum(1 for t in ts if t.startswith('RT ')) / (len(ts) + 1)
    )
    # 8. Share of tweets containing a URL (both http:// and https:// links).
    df['url_ratio'] = df['tweets'].apply(
        lambda ts: sum(1 for t in ts if 'https://' in t or 'http://' in t) / (len(ts) + 1)
    )
    # 9. Average tweet length (smoothed, see Notes).
    df['avg_tweet_length'] = df['tweets'].apply(
        lambda ts: sum(len(t) for t in ts) / (len(ts) + 1)
    )
    # 10. Hashtag diversity: distinct hashtags relative to the number of tweets.
    df['hashtag_diversity'] = df['tweets'].apply(
        lambda ts: len(set(re.findall(r'#\w+', ' '.join(ts)))) / (len(ts) + 1)
    )

    return df

# Run the same feature engineering on the training frame, then the test frame.
new_train_df2, new_test_df2 = (
    create_features3(frame) for frame in (train_df, test_df)
)

In [None]:
# ── Define Features & Target ──────────────────────────────────────────────────
# Columns fed to the classifier; all of them must exist in the engineered frames.
features = [
    'follow_ratio',
    'default_profile',
    'default_profile_image',
    'description_length',
    'statuses_count',
    'verified',
    'listed_count',
    'account_age_days',
    'tweets_per_day',
    'retweet_ratio',
    'url_ratio',
    'avg_tweet_length',
    'hashtag_diversity',
]

# .copy() gives X_train / X_test their own data, so the boolean clean-up below
# assigns into independent frames instead of a selection of the engineered
# frames (the original raised SettingWithCopyWarning here).
X_train = new_train_df2[features].copy()
y_train = new_train_df2['label']

X_test  = new_test_df2[features].copy()
y_test  = new_test_df2['label']

# Flag columns may hold real booleans or text; normalise both to bool by
# comparing the trimmed, lower-cased text with 'true'.
bool_cols = ['default_profile', 'default_profile_image', 'verified']

for col in bool_cols:
    X_train[col] = X_train[col].astype(str).str.strip().str.lower().eq('true')
    X_test[col]  = X_test[col].astype(str).str.strip().str.lower().eq('true')

# ── Train ─────────────────────────────────────────────────────────────────────
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # handles the bot/human class imbalance
    random_state=42
)
# The original referenced X_train2 / y_train2 / X_test2 / y_test2, which are
# never defined and raise NameError on a fresh kernel; use the frames built
# earlier in this cell.
model.fit(X_train, y_train)
y_pred2 = model.predict(X_test)

# ── Evaluate ──────────────────────────────────────────────────────────────────
print("=== Classification Report ===")
print(classification_report(y_test, y_pred2, target_names=['Human', 'Bot']))

print("=== Confusion Matrix ===")
cm = confusion_matrix(y_test, y_pred2)
print(pd.DataFrame(cm,
    index=['Actual Human', 'Actual Bot'],
    columns=['Predicted Human', 'Predicted Bot']))

# Importances are paired with `features` by position, i.e. in the column order
# of the frame passed to fit().
print("\n=== Feature Importance ===")
importance_df = pd.DataFrame({
    'feature'   : features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print(importance_df.to_string(index=False))