In [None]:
import pandas as pd
import re

# Load the raw dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv(filepath_or_buffer="bot_detection_data.csv")

def clean_text(text):
    """Return ``text`` as a string with URLs removed and whitespace normalized.

    Missing values (``None`` or a float NaN, e.g. what pandas yields for an
    empty CSV cell) produce ``""`` rather than the literal strings ``"None"``
    or ``"nan"``, so a caller that filters out empty text also discards rows
    whose text was missing.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string: tokens starting with ``http`` or ``www`` are
        removed, runs of whitespace are collapsed to a single space, and
        leading/trailing whitespace is stripped.
    """
    # NaN is the only float value that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    text = re.sub(r"http\S+|www\S+", "", text)  # remove URLs
    text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace left behind
    return text

# Build the modelling frame: cleaned tweet text next to its bot label.
final_df = pd.DataFrame({
    "Text": df["Tweet"].map(clean_text),
    "Bot Label": df["Bot Label"],
})

# Drop rows whose text is empty after cleaning, then keep only the first
# occurrence of each distinct text. If the same text appears with different
# labels, only the first row's label survives. The index is renumbered so it
# is contiguous again.
final_df = (
    final_df.loc[final_df["Text"].str.len() > 0]
    .drop_duplicates(subset=["Text"])
    .reset_index(drop=True)
)

print(final_df.shape)
# Preview a handful of rows only; the full size is already printed above, and
# a bounded preview keeps the saved notebook output small.
final_df.head(10)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
    roc_curve,
)
from sklearn.model_selection import train_test_split

In [None]:
# Build TF-IDF features from the cleaned tweet text.
print("Creating TF-IDF features...")

# Model inputs: documents as a plain list of str, labels as a NumPy array.
y_labels = final_df["Bot Label"].to_numpy()
X_text = final_df["Text"].astype(str).tolist()

# Stratified 80/20 train/test split with a fixed seed for reproducibility.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)

# The vocabulary and IDF weights are fitted on the training documents only;
# the test documents are merely transformed with that fitted vocabulary.
# NOTE: .toarray() converts the vectorizer output to a dense array, so memory
# grows with n_samples * n_features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', min_df=2)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test_text).toarray()

# One print call, one line per message.
print(
    f"Training set: {len(X_train_text)} samples",
    f"Test set: {len(X_test_text)} samples",
    f"TF-IDF feature shape: {X_train_tfidf.shape}",
    f"Each document is represented as a single vector of {X_train_tfidf.shape[1]} dimensions",
    sep="\n",
)

In [None]:
# Fit a logistic-regression classifier on the TF-IDF training matrix.
print("=" * 60, "TRAINING: Logistic Regression with TF-IDF features", "=" * 60, sep="\n")

# fit() returns the estimator itself, so construction and training can chain.
tfidf_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)

print("Model trained successfully!")

In [None]:
# Score the trained model on the held-out test split.
# Display names passed to the classification report.
# NOTE(review): assumes the lower label value means Human and the higher one
# means Bot — confirm against the dataset's "Bot Label" encoding.
target_names = ['Human', 'Bot']

print("=" * 60, "CLASSIFICATION REPORT: Logistic Regression (TF-IDF)", "=" * 60, sep="\n")

# Hard class predictions for the test documents; reused by every report below.
y_pred = tfidf_model.predict(X_test_tfidf)

# Per-class precision/recall/F1, followed by overall accuracy.
print(classification_report(y_test, y_pred, target_names=target_names))
print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Raw confusion-matrix counts (rows: true label, columns: predicted label).
print("\n" + "=" * 60, "CONFUSION MATRIX:", "=" * 60, sep="\n")
print(confusion_matrix(y_test, y_pred))

In [None]:
# Visualize the results: confusion matrix, per-class metrics, and ROC curve.

# Display names for the classes, in the order the per-class metric arrays are
# indexed below.
# NOTE(review): assumes index 0 is Human and index 1 is Bot — confirm against
# the dataset's "Bot Label" encoding.
class_names = ['Human', 'Bot']
metric_names = ['Precision', 'Recall', 'F1-Score']

# average=None yields one value per class instead of a single aggregate.
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
metrics = {
    cls_name: [precision[idx], recall[idx], f1[idx]]
    for idx, cls_name in enumerate(class_names)
}

# One row of three panels.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
cm_ax, bar_ax, roc_ax = axes

# 1. Confusion Matrix Heatmap
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax,
            xticklabels=class_names, yticklabels=class_names)
cm_ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
cm_ax.set_ylabel('True Label', fontsize=12)
cm_ax.set_xlabel('Predicted Label', fontsize=12)

# 2. Metrics Comparison Bar Chart
x = np.arange(len(metric_names))
width = 0.35
# Each class gets a bar group shifted half a bar-width left or right of the tick.
bar_styles = {'Human': (-width / 2, 'skyblue'), 'Bot': (width / 2, 'coral')}
for cls_name, (offset, color) in bar_styles.items():
    bar_ax.bar(x + offset, metrics[cls_name], width, label=cls_name, alpha=0.8, color=color)
    # Value label just above each bar.
    for pos, value in zip(x, metrics[cls_name]):
        bar_ax.text(pos + offset, value + 0.02, f'{value:.3f}',
                    ha='center', va='bottom', fontsize=9)
bar_ax.set_xlabel('Metric', fontsize=12)
bar_ax.set_ylabel('Score', fontsize=12)
bar_ax.set_title('Classification Metrics by Class', fontsize=14, fontweight='bold')
bar_ax.set_xticks(x)
bar_ax.set_xticklabels(metric_names)
bar_ax.legend()
bar_ax.set_ylim([0, 1.1])  # headroom above 1.0 for the value labels
bar_ax.grid(axis='y', alpha=0.3)

# 3. ROC Curve
# Column 1 of predict_proba is the probability of the model's second class.
y_pred_proba = tfidf_model.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate', fontsize=12)
roc_ax.set_ylabel('True Positive Rate', fontsize=12)
roc_ax.set_title('ROC Curve', fontsize=14, fontweight='bold')
roc_ax.legend(loc="lower right")
roc_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary statistics
print("\n" + "=" * 60)
print("SUMMARY STATISTICS")
print("=" * 60)
print(f"Overall Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"\nHuman - Precision: {precision[0]:.4f}, Recall: {recall[0]:.4f}, F1: {f1[0]:.4f}")
print(f"Bot   - Precision: {precision[1]:.4f}, Recall: {recall[1]:.4f}, F1: {f1[1]:.4f}")