In [1]:
from datasketch.hnsw import HNSW
from datasketch import MinHash
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Build {user_id: [timestamp, ...]} from a sample of the CSV. Each timestamp is
# stored as pandas' Timestamp.value (integer nanoseconds since the Unix epoch).
users_timestamps = {}
chunks = pd.read_csv("../data.csv", chunksize=400_000)
for i, chunk in enumerate(chunks):
    # Sample roughly 10% of the data: keep the first 10 chunks of every 100.
    if i % 100 >= 10:
        continue
    print(f"Processing chunk {i}")

    chunk['timestamp'] = pd.to_datetime(chunk['timestamp'], format='mixed')
    for user_id, group in chunk.groupby('user_id'):
        # Iterate the timestamp column directly; iterrows() would build a
        # whole row Series per record only to read this single field.
        users_timestamps.setdefault(user_id, []).extend(
            ts.value for ts in group['timestamp']
        )

In [11]:
import numpy as np

def detect_bots(user_activity, cv_threshold=0.05, min_intervals=5):
    """
    Flag users whose activity is spaced at near-constant intervals.

    For each user the timestamps are sorted, the gaps between consecutive
    timestamps are computed, and the coefficient of variation (standard
    deviation divided by mean) of those gaps is compared with cv_threshold.

    Parameters:
    - user_activity: Dict {user: list_of_numeric_timestamps}
    - cv_threshold: Users whose coefficient of variation is strictly below
      this value are flagged (lower = more strict)
    - min_intervals: Users with fewer gaps than this are not analysed

    Returns:
    - List of flagged users, in the iteration order of user_activity
    """
    flagged = []

    for user, timestamps in user_activity.items():
        # A single timestamp (or none) yields no gaps at all.
        if len(timestamps) < 2:
            continue

        gaps = np.diff(sorted(timestamps))

        # Too few gaps to say anything about regularity.
        if len(gaps) < min_intervals:
            continue

        avg_gap = gaps.mean()

        # A zero mean means every timestamp is identical; flag the user
        # directly, which also avoids dividing by zero below.
        is_regular = avg_gap == 0 or gaps.std() / avg_gap < cv_threshold
        if is_regular:
            flagged.append(user)

    return flagged

In [None]:
# Flag users with highly regular posting intervals. The coefficient-of-variation
# cutoff used here (0.01) is stricter than detect_bots' default (0.05).
bots = detect_bots(users_timestamps, cv_threshold=0.01, min_intervals=5)
print(f"Detected {len(bots)} bots")

In [48]:
def get_pixel_placement(user_ids, csv_path="../data.csv", chunksize=10_000):
    """
    Collect the coordinates and colors of every pixel placed by the given users.

    Parameters:
    - user_ids: Iterable of user ids to look up (duplicates are ignored)
    - csv_path: CSV file to scan; the 'user_id', 'coordinate' and
      'pixel_color' columns are read from it
    - chunksize: Number of rows loaded per chunk while scanning

    Returns:
    - Dict {user_id: {"coordinates": [...], "colors": [...]}} with one entry
      per requested user. Users with no rows in the file keep empty lists.
      The two lists are index-aligned and follow file order.
    """
    # Deduplicate user_ids and initialize dictionary with empty lists
    unique_users = list(set(user_ids))
    user_data = {uid: {"coordinates": [], "colors": []} for uid in unique_users}

    # Nothing to look up: skip the full scan of the file.
    if not unique_users:
        return user_data

    # Process data in chunks so the whole file never has to fit in memory
    for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=chunksize)):
        if i % 100 == 0:
            print(f"Processing chunk {i}")

        # Keep only rows from the requested users, then aggregate per user
        filtered = chunk[chunk['user_id'].isin(unique_users)]
        for user_id, group in filtered.groupby('user_id'):
            user_data[user_id]["coordinates"].extend(group['coordinate'].tolist())
            user_data[user_id]["colors"].extend(group['pixel_color'].tolist())

    return user_data

In [None]:
# Gather the coordinates and colors of every pixel placed by the flagged users.
# Note: this scans ../data.csv again, chunk by chunk.
user_data = get_pixel_placement(bots)

In [50]:
def bucket_coordinates(coord, grid_size=100):
    """
    Map a comma-separated coordinate string to its grid cell.

    Parameters:
    - coord: String whose first two comma-separated fields are integers
      (any further fields are ignored)
    - grid_size: Side length of one grid cell

    Returns:
    - Tuple (first_field // grid_size, second_field // grid_size)
    """
    fields = coord.split(",")
    cell_x = int(fields[0]) // grid_size
    cell_y = int(fields[1]) // grid_size
    return (cell_x, cell_y)

In [None]:
def jaccard_distance(x, y):
    """Distance between two MinHash sketches: 1 minus their estimated Jaccard similarity."""
    return 1 - x.jaccard(y)


# Summarise each user as a MinHash over "<grid cell>_<color>" features, so that
# users who paint the same colors in the same regions get similar signatures.
user_minhashes = {}
for user_id, placements in user_data.items():
    signature = MinHash(num_perm=128)
    for coord, color in zip(placements["coordinates"], placements["colors"]):
        feature = f"{bucket_coordinates(coord)}_{color}"
        signature.update(feature.encode('utf-8'))
    user_minhashes[user_id] = signature

# Index every signature in an HNSW graph for nearest-neighbour lookups.
index = HNSW(distance_func=jaccard_distance)
user_ids = list(user_minhashes)
for user_id in user_ids:
    index.insert(user_id, user_minhashes[user_id])

In [None]:
def find_similar_users(user_id, top_n=5, threshold=0.5):
    """
    Return the indexed users whose MinHash signature is close to user_id's.

    Parameters:
    - user_id: Key into user_minhashes of the user to search around
    - top_n: Number of neighbours requested from the index. The query user
      itself may be among them, so fewer than top_n other users can come back
    - threshold: Only neighbours with distance strictly below this are kept

    Returns:
    - List of (user, distance) pairs as produced by index.query, without the
      query user itself
    """
    query = user_minhashes[user_id]
    neighbours = index.query(query, top_n)
    # Drop the query user by key rather than by matching the exact tuple
    # (user_id, 0), which would miss it if its self-distance were not exactly 0.
    return [
        hit for hit in neighbours
        if hit[0] != user_id and hit[1] < threshold
    ]

In [None]:
# Find the flagged user with the most close neighbours (distance below 0.5
# among up to 100 index hits). Ties keep the earliest user; max_user stays
# None if nobody has any neighbour.
max_similar_users, max_user = 0, None
for candidate in bots:
    neighbour_count = len(find_similar_users(candidate, top_n=100, threshold=0.5))
    if neighbour_count > max_similar_users:
        max_similar_users, max_user = neighbour_count, candidate
print(max_similar_users, max_user)

In [60]:
def plot_coords(coords, colors=None, width=2000, height=2000, fig=None, ax=None):
    """
    Draw pixel placements as colored dots, optionally onto existing axes.

    Parameters:
    - coords: Iterable of "x,y" strings (exactly two integer fields each)
    - colors: Colors paired with coords; defaults to "blue" for every point.
      If the two differ in length, the extra entries are ignored
    - width, height: Axis limits, applied only when new axes are created
    - fig, ax: Existing figure/axes to draw on. New ones are created when ax
      is None; when only ax is given, its own figure is used

    Returns:
    - Tuple (fig, ax) that was drawn on, so calls can be chained

    The y axis is left un-inverted; callers may invert it themselves.
    """
    if colors is None:
        colors = ["blue"] * len(coords)

    if ax is None:
        # No axes to reuse: build and decorate a fresh figure.
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.set_xlim(0, width)
        ax.set_ylim(0, height)
        ax.set_xlabel("X")
        ax.set_ylabel("Y")
        ax.set_title("Pixel Coordinates Plot")
        ax.grid(True, linestyle='--', alpha=0.4)
    elif fig is None:
        # Respect the caller's axes instead of discarding them.
        fig = ax.figure

    # Parse everything first, then draw all points with one scatter call
    # instead of creating a separate line artist per pixel.
    xs, ys, point_colors = [], [], []
    for coord, color in zip(coords, colors):
        x_str, y_str = coord.split(',')
        xs.append(int(x_str))
        ys.append(int(y_str))
        point_colors.append(color)

    if xs:
        # s is an area in points^2, so 25 matches the former markersize=5.
        ax.scatter(xs, ys, c=point_colors, s=25, marker='o')
    return fig, ax

In [None]:
# Overlay the pixels of every user found near max_user (distance below 0.9)
# on one shared set of axes; the first plot_coords call creates the figure.
fig = ax = None

for neighbour_id, _distance in find_similar_users(max_user, top_n=100, threshold=0.9):
    placements = user_data[neighbour_id]
    fig, ax = plot_coords(
        placements['coordinates'],
        placements['colors'],
        fig=fig,
        ax=ax,
    )

# Invert the y axis of the current axes, then render the figure.
plt.gca().invert_yaxis()
plt.show()
