In [None]:
import pandas as pd
from analysis_helpers import load_url_data, analyze_authors_comprehensive, add_domain_column
import nest_asyncio

# NOTE(review): presumably required because the analysis helpers run asyncio
# code while Jupyter's own event loop is already running — confirm.
nest_asyncio.apply()


# Input CSV files, given relative to the notebook's working directory.
ALL_USERS = 'url_stream.csv'        # passed to load_url_data below
LABELED_USERS = 'train_data.csv'    # read with pandas; must provide a `link` column

# Frame produced by the project helper from the ALL_USERS file.
df = load_url_data(ALL_USERS)

def to_did(url):
    """Return the last path segment of a profile link.

    The notebook stores this value in the ``author`` column of the labeled
    data. Surrounding whitespace and trailing slashes are ignored, so a link
    such as ``'https://host/profile/abc/'`` yields ``'abc'`` instead of an
    empty string.

    Args:
        url: Profile link as a string.

    Returns:
        The text after the final ``/``, or the whole stripped string when it
        contains no ``/``.
    """
    # Strip first: a trailing '/' would otherwise make the last segment ''.
    return url.strip().rstrip('/').rsplit('/', 1)[-1]

# Labeled accounts: derive the `author` key from each row's profile link.
labeled = pd.read_csv(LABELED_USERS)
labeled['author'] = labeled['link'].map(to_did)

df = add_domain_column(df)

# Per-author statistics joined with the labels; only authors that actually
# carry a label are kept.
stats_all = analyze_authors_comprehensive(df, labels_df=labeled)
has_label = stats_all['label'].notna()
author_stats = stats_all[has_label]

In [None]:
from analysis_helpers import populate_follower_count

# Re-binds `author_stats` to the helper's return value.
# NOTE(review): presumably this adds the `followers_count` / `follows_count`
# columns that are selected as features further down — confirm in analysis_helpers.
author_stats = populate_follower_count(author_stats)

In [None]:
# Preview the first rows as a quick sanity check of the assembled frame.
author_stats.head()

In [None]:
# Column holding the class label to predict.
target_column = 'label'
# Wider candidate feature set, currently disabled:
# feature_columns = ['unique_domains', 'unique_urls', 'avg_time_between_posts', 'followers_count', 'follows_count', 'follower_following_ratio']
# Feature set in use. The order matters: it is the column order used to fit
# the scaler and models and is stored in the saved config.
feature_columns = ['unique_domains', 'unique_urls', 'followers_count', 'follows_count']

In [None]:
from analysis_helpers import augment_data

# NOTE: despite its name, `test_data` is the complete modelling frame; a later
# cell splits it into train and test portions.
# NOTE(review): presumably augment_data appends 150 synthetic rows to the
# labeled authors — confirm in analysis_helpers. If so, synthetic rows can end
# up in the held-out split, because the split happens after augmentation.
test_data = augment_data(author_stats, feature_columns, target_column, num_synthetic_rows=150)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Snapshot of the raw (unscaled) modelling frame. `test_data` itself is never
# modified: scaling it in place made this cell non-idempotent, because a second
# run refit the scaler on already-standardised values and that near-identity
# scaler is what would later be saved for inference.
test_data_original = test_data.copy()

# Raw features and target
X_raw = test_data_original[feature_columns]
y = test_data_original[target_column]

# Split BEFORE fitting the scaler so that test-set statistics do not leak into
# preprocessing. Same size and seed as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training portion only
scaler = StandardScaler()
scaler.fit(X_train_raw)


def _standardise(frame):
    """Return `frame` transformed by the fitted scaler, keeping index and column names."""
    return pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)


X_train = _standardise(X_train_raw)
X_test = _standardise(X_test_raw)
# Full scaled feature matrix; later cells use `X` together with `y` for plotting.
X = _standardise(X_raw)

# Deliberately shallow tree with balanced class weights
classifier = DecisionTreeClassifier(
    random_state=42,
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced'
)
classifier.fit(X_train, y_train)

# Class probabilities for the held-out rows; columns follow classifier.classes_
y_proba = classifier.predict_proba(X_test)

def report_at_threshold(threshold: float = 0.5):
    """Precision and recall of the 'spam' class on the test split at a cut-off.

    A row is predicted 'spam' when its spam probability is at least
    ``threshold`` and 'good' otherwise; raising the threshold generally trades
    recall for precision. Reads the notebook-level ``classifier``, ``y_proba``
    and ``y_test``.

    Args:
        threshold: Minimum spam probability for a 'spam' prediction.

    Returns:
        ``(precision, recall)`` for the 'spam' label. Both metrics are called
        with ``zero_division=1``.
    """
    # Locate the probability column that belongs to the 'spam' class.
    spam_column = list(classifier.classes_).index('spam')
    flagged = y_proba[:, spam_column] >= threshold
    predicted = np.where(flagged, 'spam', 'good')

    # Same keyword arguments for both metrics, precision first.
    return tuple(
        metric(y_test, predicted, pos_label='spam', zero_division=1)
        for metric in (precision_score, recall_score)
    )

In [None]:
from sklearn.metrics import classification_report

# Probability cut-off for calling a row 'spam' in the report below.
THRESHOLD = 0.6

# Column of y_proba that holds the 'spam' probability.
spam_idx = classifier.classes_.tolist().index('spam')
spam_scores = y_proba[:, spam_idx]
y_pred_threshold = np.where(spam_scores >= THRESHOLD, 'spam', 'good')

header = f"Classification Report (threshold = {THRESHOLD})"
print(header)
print("=" * 50)
print(classification_report(y_test, y_pred_threshold, target_names=['good', 'spam']))

## Quick digression: logistic regression comparison

In [None]:
## Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# Baseline linear model trained on the same split as the tree; `fit` returns
# the estimator itself, so construction and fitting are chained.
lr_classifier = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Hard label predictions for the held-out rows.
y_pred_lr = lr_classifier.predict(X_test)

In [None]:
# Plot decision boundaries for Logistic Regression
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.preprocessing import LabelEncoder
from itertools import combinations

# Encode labels to numeric values so they can drive the colormap
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# All unordered pairs of feature column positions
feature_pairs = list(combinations(range(len(feature_columns)), 2))

n_plots = len(feature_pairs)
if n_plots == 0:
    raise ValueError('Need at least two feature columns to plot pairwise decision boundaries')

n_cols = 3
n_rows = (n_plots + n_cols - 1) // n_cols

# squeeze=False always yields a 2-D array of Axes, so flattening works for any
# number of pairs. The previous special case wrapped the 3-element Axes array
# in a list when there was exactly one pair, which broke plotting.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for idx, (i, j) in enumerate(feature_pairs):
    X_pair = X.iloc[:, [i, j]].values

    # Train logistic regression on this feature pair
    lr_pair = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
    lr_pair.fit(X_pair, y_encoded)

    # Create mesh grid covering the data range with a margin of 1 on each side
    feature_1, feature_2 = np.meshgrid(
        np.linspace(X_pair[:, 0].min() - 1, X_pair[:, 0].max() + 1, 100),
        np.linspace(X_pair[:, 1].min() - 1, X_pair[:, 1].max() + 1, 100)
    )
    grid = np.vstack([feature_1.ravel(), feature_2.ravel()]).T
    y_grid = lr_pair.predict(grid).reshape(feature_1.shape)

    # Plot decision boundary. The variable is deliberately not called `display`:
    # at notebook level that would shadow IPython's display() for later cells.
    boundary_display = DecisionBoundaryDisplay(xx0=feature_1, xx1=feature_2, response=y_grid)
    boundary_display.plot(ax=axes[idx], alpha=0.4, cmap='RdYlGn')

    # Scatter actual points
    axes[idx].scatter(X_pair[:, 0], X_pair[:, 1], c=y_encoded,
                      cmap='RdYlGn', edgecolor='black', s=50, alpha=0.7)

    axes[idx].set_xlabel(feature_columns[i])
    axes[idx].set_ylabel(feature_columns[j])
    axes[idx].set_title(f'{feature_columns[i]} vs {feature_columns[j]}')

# Remove extra subplots
for idx in range(n_plots, len(axes)):
    fig.delaxes(axes[idx])

fig.suptitle('Logistic Regression Decision Boundaries', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## Report scores

In [None]:
## Plot precision/recall for 'spam' label at different thresholds
import matplotlib.pyplot as plt
from scipy.interpolate import PchipInterpolator

# Sweep thresholds (fine-grained) and record spam-class precision/recall at each.
# report_at_threshold applies the threshold itself, so no prediction is built here.
thresholds = np.arange(0.05, 1.0, 0.02)
precisions = []
recalls = []

for threshold in thresholds:
    precision, recall = report_at_threshold(threshold)

    precisions.append(precision)
    recalls.append(recall)

precisions = np.array(precisions)
recalls = np.array(recalls)

# Use PCHIP interpolation (monotonic, no overshoot)
thresholds_smooth = np.linspace(thresholds.min(), thresholds.max(), 300)

precision_interp = PchipInterpolator(thresholds, precisions)
recall_interp = PchipInterpolator(thresholds, recalls)

precisions_smooth = precision_interp(thresholds_smooth)
recalls_smooth = recall_interp(thresholds_smooth)

# Clip values to valid range [0, 1]
precisions_smooth = np.clip(precisions_smooth, 0, 1)
recalls_smooth = np.clip(recalls_smooth, 0, 1)

# Plot precision and recall vs threshold
fig, ax1 = plt.subplots(figsize=(10, 6))

ax1.plot(thresholds_smooth, precisions_smooth, 'b-', linewidth=2, label='Precision')
ax1.plot(thresholds_smooth, recalls_smooth, 'r-', linewidth=2, label='Recall')

# Add original points as markers
ax1.scatter(thresholds, precisions, color='blue', s=30, alpha=0.5, zorder=5)
ax1.scatter(thresholds, recalls, color='red', s=30, alpha=0.5, zorder=5)

ax1.set_xlabel('Threshold', fontsize=12)
ax1.set_ylabel('Score', fontsize=12)
ax1.set_title('Precision vs Recall at Different Thresholds (Spam Class)', fontsize=14)
ax1.grid(True, alpha=0.3)
ax1.set_xlim(0.05, 0.95)
ax1.set_ylim(0, 1.05)

# Mark default threshold (0.5)
ax1.axvline(x=0.5, color='gray', linestyle='--', alpha=0.7, label='Default (0.5)')

# Mark the threshold where the smoothed precision and recall curves are closest
# (the break-even point). This is not necessarily the F1-optimal threshold.
diff = np.abs(precisions_smooth - recalls_smooth)
optimal_idx = np.argmin(diff)
ax1.scatter([thresholds_smooth[optimal_idx]], [precisions_smooth[optimal_idx]],
            color='green', s=100, zorder=5, label=f'P=R @ {thresholds_smooth[optimal_idx]:.2f}')

# Build the legend only after every labelled artist exists; calling it earlier
# left the 0.5 line and the P=R marker out of the legend.
ax1.legend(loc='center right')

plt.tight_layout()
plt.show()

In [None]:
from sklearn.tree import export_graphviz
import graphviz

import re

# Export tree to DOT format
dot_data = export_graphviz(
    classifier,
    feature_names=feature_columns,
    class_names=['Not Spam', 'Spam'],
    filled=True,
    rounded=True,
    impurity=False,
    proportion=True,
    special_characters=True
)

# Recolour nodes by predicted class: blue for Not Spam, orange for Spam.
# The whole hex value is replaced. Swapping only the 'fillcolor="#' prefix left
# the original digits appended, producing malformed colours such as
# "#3498dbe58139".
NOT_SPAM_FILL = '#3498db'
SPAM_FILL = '#e67e22'
fill_pattern = re.compile(r'fillcolor="#[0-9a-fA-F]+"')

new_lines = []
for line in dot_data.split('\n'):
    if 'class = Not Spam' in line or "value = [1.0, 0.0]" in line:
        line = fill_pattern.sub(f'fillcolor="{NOT_SPAM_FILL}"', line)
    elif 'class = Spam' in line:
        line = fill_pattern.sub(f'fillcolor="{SPAM_FILL}"', line)
    new_lines.append(line)

dot_data = '\n'.join(new_lines)

# Display the tree (the bare last expression renders it in the notebook)
graph = graphviz.Source(dot_data)
graph

## Save the Classifier

In [None]:
# Save the classifier and threshold for inference
import joblib
from pathlib import Path

# Make sure the output directory is there before anything is written.
Path('classifier').mkdir(exist_ok=True)

# Settings inference needs in addition to the two fitted objects.
class_labels = list(classifier.classes_)
config = dict(
    threshold=THRESHOLD,
    feature_columns=feature_columns,
    spam_class_index=class_labels.index('spam'),
    classes=class_labels,
)

# Persist the model, the scaler (needed to normalize new data) and the config
# side by side.
joblib.dump(classifier, 'classifier/spam_classifier.joblib')
joblib.dump(scaler, 'classifier/feature_scaler.joblib')
joblib.dump(config, 'classifier/classifier_config.joblib')