In [1]:
import pandas as pd
from analysis_helpers import load_url_data, analyze_authors_comprehensive, add_domain_column
import nest_asyncio

nest_asyncio.apply()


ALL_USERS = 'url_stream.csv'
LABELED_USERS = 'test_data.csv'

df = load_url_data(ALL_USERS)

def to_did(url):
    return url.split('/')[-1]

labeled = pd.read_csv(LABELED_USERS)
labeled['author'] = labeled['link'].apply(to_did)

df = add_domain_column(df)
author_stats = analyze_authors_comprehensive(df, labels_df=labeled)

test_data = author_stats[author_stats['label'].notnull()]


The 'default' attribute with value None was provided to the `Field()` function, which has no effect in the context it was used. 'default' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.



In [2]:
from analysis_helpers import populate_follower_count

test_data = populate_follower_count(test_data)

In [3]:
test_data.head()

Unnamed: 0,author,label,total_posts,unique_domains,unique_urls,first_post,last_post,top_domain,top_domain_count,domain_share,duration,duration_seconds,posts_per_minute,avg_time_between_posts,posts_to_bursty_urls,followers_count,follows_count,follower_following_ratio
0,did:plc:uld74vzf773y7ovqqm2jfaft,good,288,1,288,2025-11-21 00:43:36+00:00,2025-11-22 04:28:52+00:00,kripta.biz,288,1.0,1 days 03:45:16,99916.0,0.172945,348.139373,0,111.0,0.1,1110.0
1,did:plc:msian4dqa2rqalf3biilnf3m,good,285,1,237,2025-11-21 01:00:08+00:00,2025-11-22 04:01:07+00:00,europesays.com,285,1.0,1 days 03:00:59,97259.0,0.175819,342.461268,0,2983.0,0.1,29830.0
2,did:plc:ni6cl7jipinqldoyowqlbrwp,good,152,1,152,2025-11-21 01:30:05+00:00,2025-11-23 22:20:31+00:00,newsbeep.com,152,1.0,2 days 20:50:26,247826.0,0.0368,1641.231788,0,105.0,6.0,17.5
3,did:plc:uentzwq4lz5mkfa6ffu2lodl,good,80,1,80,2025-11-21 00:53:31.589000+00:00,2025-11-21 03:22:10.661000+00:00,youtube.com,80,1.0,0 days 02:28:39.072000,8919.072,0.538169,112.899646,0,1180.0,331.0,3.564955
4,did:plc:btb6d463sylf5k5swmejfvl7,bad,59,2,53,2025-11-21 01:00:06+00:00,2025-11-22 03:54:04+00:00,amazon.com.br,46,0.779661,1 days 02:53:58,96838.0,0.036556,1669.62069,3,11.0,38.0,0.289474


In [4]:
target_column = 'label'
feature_columns = ['unique_domains', 'unique_urls', 'avg_time_between_posts', 'posts_to_bursty_urls', 'followers_count', 'follows_count', 'follower_following_ratio']

In [5]:
from analysis_helpers import augment_data

augmented_data = augment_data(test_data, feature_columns, target_column, num_synthetic_rows=150)


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Quick normalization
scaler = StandardScaler()
test_data[feature_columns] = scaler.fit_transform(test_data[feature_columns])

# Prepare features and target
X = test_data[feature_columns]
y = test_data[target_column]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create and train the classifier
rf_classifier = RandomForestClassifier(random_state=42, max_depth=5)
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         bad       0.80      0.67      0.73         6
        good       0.86      0.92      0.89        13

    accuracy                           0.84        19
   macro avg       0.83      0.79      0.81        19
weighted avg       0.84      0.84      0.84        19



In [12]:
# Report feature importance
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': dt_classifier.feature_importances_
}).sort_values('importance', ascending=False)

print("Feature Importance Rankings:")
print("="*50)
for _, row in feature_importance.iterrows():
    print(f"{row['feature']}: {row['importance']:.4f}")

Feature Importance Rankings:
follows_count: 0.3919
followers_count: 0.2970
avg_time_between_posts: 0.1818
unique_domains: 0.0727
unique_urls: 0.0566
posts_to_bursty_urls: 0.0000
follower_following_ratio: 0.0000
