In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Load datasets
# Both dumps are read as headerless, tab-separated files with the same four columns.
TWEET_COLUMNS = ["UserID", "TweetID", "Tweet", "CreatedAt"]


def _read_tweets(path, label):
    """Read one tweet dump and tag every row with the given `is_bot` label."""
    frame = pd.read_csv(path, sep='\t', header=None, names=list(TWEET_COLUMNS))
    frame['is_bot'] = label
    return frame


df_bots_tweets = _read_tweets('social_honeypot_icwsm_2011/content_polluters_tweets.txt', 1)
df_humans_tweets = _read_tweets('social_honeypot_icwsm_2011/legitimate_users_tweets.txt', 0)

# Stack bots on top of humans, then clean the text column in one pass:
#   1. drop rows whose tweet text is missing,
#   2. coerce the remaining text to str,
#   3. drop tweets that are empty once surrounding whitespace is stripped.
df_combined = (
    pd.concat([df_bots_tweets, df_humans_tweets], axis=0, ignore_index=True)
    .dropna(subset=['Tweet'])
    .assign(Tweet=lambda frame: frame['Tweet'].astype(str))
    .loc[lambda frame: frame['Tweet'].str.strip() != '']
)

# First, create train/test split while preserving user groups
# (a single shuffled split holding out 20% of the groups, seeded for repeatability).
group_split = GroupShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Tweet text is the feature, `is_bot` the label, and `UserID` the grouping key
# handed to the group-aware splitter.
X, y, groups = (df_combined[column] for column in ('Tweet', 'is_bot', 'UserID'))

# Get train/test indices: take the first (train, test) pair the splitter yields.
# The indices are positional, hence the .iloc lookups below.
train_idx, test_idx = next(group_split.split(X, y, groups=groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Only the training groups are kept; they drive the grouped cross-validation later.
groups_train = groups.iloc[train_idx]

In [4]:
# Report the split sizes, both in tweets and in distinct users per side.
print(
    f"Training set size: {len(X_train)} tweets",
    f"Test set size: {len(X_test)} tweets",
    f"Unique users in training: {len(groups_train.unique())}",
    f"Unique users in test: {len(groups.iloc[test_idx].unique())}\n",
    sep="\n",
)

Training set size: 4473428 tweets
Test set size: 1106627 tweets
Unique users in training: 31882
Unique users in test: 7971



In [5]:
# Feature extraction: TF-IDF over unigrams and bigrams, capped at 1000 features.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=1000,
    min_df=2,  # Ignore terms that appear in less than 2 documents
    stop_words='english',
    strip_accents='unicode',
)

# Classifier: logistic regression with balanced class weights and a fixed seed.
logistic_classifier = LogisticRegression(
    max_iter=200,  # Increase max iterations for convergence
    class_weight='balanced',
    random_state=42,
)

# Step names ('vectorizer', 'classifier') are part of the pipeline's public
# surface (e.g. pipeline.named_steps), so they are kept exactly as before.
pipeline = Pipeline(steps=[
    ('vectorizer', tfidf_vectorizer),
    ('classifier', logistic_classifier),
])

# Cross-validation on training data
fold_scores = []
group_kfold = GroupKFold(n_splits=5)

In [6]:
# Grouped cross-validation on the training split, followed by a final fit on all
# training data and a single evaluation on the held-out test split.

# Start from an empty score list so re-running this cell does not pile new
# scores on top of the ones left over from a previous run.
fold_scores = []

for fold, (cv_train_idx, cv_val_idx) in enumerate(group_kfold.split(X_train, y_train, groups=groups_train), 1):
    # Get fold data (indices from the splitter are positional, hence .iloc)
    X_fold_train = X_train.iloc[cv_train_idx]
    y_fold_train = y_train.iloc[cv_train_idx]
    X_fold_val = X_train.iloc[cv_val_idx]
    y_fold_val = y_train.iloc[cv_val_idx]

    # Train the model: refits the vectorizer and the classifier on this fold's
    # training rows only, so validation tweets never influence the vocabulary.
    pipeline.fit(X_fold_train, y_fold_train)

    # Make predictions
    predictions = pipeline.predict(X_fold_val)

    # Print fold results
    print(f"\nFold {fold} Results:")
    print(classification_report(y_fold_val, predictions))

    # Store fold accuracy, computed from the predictions already in hand.
    # pipeline.score() reports the same mean accuracy but would vectorize and
    # predict the whole validation fold a second time.
    fold_scores.append(float(np.mean(predictions == y_fold_val.to_numpy())))

print(f"\nAverage CV accuracy: {np.mean(fold_scores):.3f} (±{np.std(fold_scores):.3f})")

# The CV fits above were only for estimation; the model that is kept is the
# one trained on the full training split.
print("\nTraining final model on all training data...")
pipeline.fit(X_train, y_train)

# Evaluate on test set
print("\nFinal Test Set Performance:")
test_predictions = pipeline.predict(X_test)
print(classification_report(y_test, test_predictions))


Fold 1 Results:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82    518998
           1       0.74      0.77      0.75    375688

    accuracy                           0.79    894686
   macro avg       0.78      0.79      0.79    894686
weighted avg       0.79      0.79      0.79    894686


Fold 2 Results:
              precision    recall  f1-score   support

           0       0.82      0.81      0.81    523577
           1       0.73      0.76      0.75    371109

    accuracy                           0.79    894686
   macro avg       0.78      0.78      0.78    894686
weighted avg       0.79      0.79      0.79    894686


Fold 3 Results:
              precision    recall  f1-score   support

           0       0.82      0.79      0.80    519098
           1       0.72      0.77      0.74    375588

    accuracy                           0.78    894686
   macro avg       0.77      0.78      0.77    894686
weighted avg       0.78

In [7]:
def predict_tweet(tweet_text, trained_pipeline):
    """
    Predict whether a tweet is from a bot or human.

    Args:
        tweet_text (str): The text of the tweet to classify
        trained_pipeline: Fitted estimator whose ``predict`` method accepts a
            list of strings and returns one label per string

    Returns:
        int: 1 for bot, 0 for human
    """
    # predict() works on batches, so wrap the single tweet in a one-element list
    # and take the only label that comes back.
    label = trained_pipeline.predict([tweet_text])[0]
    # Convert to a built-in int so the result matches the documented return
    # type instead of leaking a NumPy scalar to the caller.
    return int(label)

# Example predictions: two hand-written sample tweets, one promotional and one everyday.
promo_tweet = "Click here to win a free iPhone! Limited time offer!"
everyday_tweet = "Just had a great coffee with friends at the local cafe."
example_tweets = [promo_tweet, everyday_tweet]

print("\nExample predictions:")


Example predictions:


In [8]:
# Classify each sample tweet with the trained pipeline and print its text
# followed by the predicted label (1 is shown as 'Bot', anything else as 'Human').
for tweet in example_tweets:
    prediction = predict_tweet(tweet, pipeline)
    print(
        f"\nTweet: {tweet}",
        f"Prediction: {'Bot' if prediction == 1 else 'Human'}",
        sep="\n",
    )


Tweet: Click here to win a free iPhone! Limited time offer!
Prediction: Bot

Tweet: Just had a great coffee with friends at the local cafe.
Prediction: Bot


In [9]:
import joblib

# Persist the fitted pipeline to disk; the call's return value (the list of
# written file names) is left as the cell's displayed output.
joblib.dump(value=pipeline, filename='bot_classifier.joblib')

['bot_classifier.joblib']

In [12]:
# Reload the saved pipeline from disk and classify a few fresh tweets with it.
# NOTE: joblib files are pickle-based; only load files you created or trust.
loaded_model = joblib.load('bot_classifier.joblib')

tweets = [
    "I actually liked that Keanu Reeves voiced Shadow in new sonic movie!",
    "Had lunch with friends",
    "Free Palestine!",
]

# One batched predict() call returns one label per tweet, in order.
predictions = loaded_model.predict(tweets)
for tweet, pred in zip(tweets, predictions):
    print(
        f"Tweet: {tweet}",
        f"Prediction: {'Bot' if pred == 1 else 'Human'}\n",
        sep="\n",
    )

Tweet: I actually liked that Keanu Reeves voiced Shadow in new sonic movie!
Prediction: Human

Tweet: Had lunch with friends
Prediction: Human

Tweet: Free Palestine!
Prediction: Bot

