# Reddit Bot Commenters <a class="tocSkip">
Identifies likely bot commenters on Reddit using Benford's Law. See [original blog post](https://diybigdata.net/2020/03/using-benfords-law-to-identify-bots-on-reddit/) for a discussion on this technique.

The core of this code is the `generateBenfordsLawAnalysis()` function, which takes a user event log data frame that must have a user ID column and an event timestamp column, and returns a chi squared score of how close each user's activity is to the ideal Benford's Law distribution. Scores closer to zero mean the user's activity more closely adheres to the ideal distribution.

In [None]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window as W

import pandas as pd

# SparkSession is not brought into scope by the imports above; import it here
# so the builder call below does not raise NameError on a plain Python kernel.
from pyspark.sql import SparkSession

# Do not truncate column contents when results are displayed through pandas.
pd.set_option('display.max_colwidth', None)

# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("RedditBotCommenters")
    .getOrCreate()
)

In [None]:
# Remember the session's current shuffle partition setting, then set it to 500
# for the cells that follow.
_shuffle_partitions_key = "spark.sql.shuffle.partitions"
orig_suffle_partitions = spark.conf.get(_shuffle_partitions_key)
spark.conf.set(_shuffle_partitions_key, 500)

In [None]:
# Load the processed Reddit comments, dropping rows authored by the
# AutoModerator account or by deleted authors.
reddit_df = (
    spark.read
    .format('parquet')
    .load('qfs:///data/reddit/comments/processed')
    .where(~F.col('author').isin(['[deleted]', 'AutoModerator']))
)

reddit_df.printSchema()

In [None]:
# Load the processed Reddit submissions and print their schema.
submissions_df = (
    spark.read
    .format('parquet')
    .load('qfs:///data/reddit/submissions/processed')
)
submissions_df.printSchema()

In [None]:
# Stack the (author, created_utc) pairs of comments and submissions into one
# activity log. union() matches columns by position, so both selections list
# the columns in the same order. Rows with a null or empty author are dropped,
# and the result is partitioned by author.
combined_df = (
    reddit_df.select('author', 'created_utc')
    .union(submissions_df.select('author', 'created_utc'))
    .where(
        F.col('author').isNotNull()
        & (F.length('author') > 0)
    )
    .repartition('author')
)

`generateBenfordsLawAnalysis`

A function to perform Benford's Law analysis against a data frame of user activities in order to determine which users' activities best (or least) adhere to the Benford's Law distribution. The data frame is ostensibly an event log keyed by a user ID and has a timestamp for each event row. Only the user ID and timestamp columns are used for analysis.

### Arguments <a class="tocSkip">
* `df` - The data frame with the timestamped user activity to be analyzed
* `user_col` - a string identifying the name of the column of df that contains the user IDs
* `timestamp_col` - a string identifying the name of the column of df that contains the event timestamps. Must be `T.LongType()`.
* `event_threshold` - the number of events a user must exceed for the Benford's Law analysis to be performed on that user. Defaults to 100.

### Returns <a class="tocSkip">
A dataframe with the following columns:
* `user_col` - The user IDs. The column name will be the same as the original dataframe.
* `frequency_count` - the number of the user's inter-event time deltas (those with a first digit of 1 through 9) that were analyzed
* `chi_squared` - the chi squared score indicating how similar the user's activity is to the ideal Benford's Law distribution.
* `digit_share` - A list containing the relative share each first digit has among the user's activity. The list is ordered from digit 1 to digit 9.



In [None]:
from math import log10, sqrt

def _getUsersAndDigit(df, user_col, event_threshold):
    """Pair every sufficiently active user with each first digit 1 through 9.

    Args:
        df: Event log data frame containing the column named by ``user_col``.
        user_col: Name of the column holding the user IDs.
        event_threshold: Only users with strictly more than this many rows
            in ``df`` are kept.

    Returns:
        A data frame with columns ``user_col`` and ``first_digit`` holding
        nine rows (digits 1..9) for every user that passed the threshold.
    """
    digit_schema = T.StructType([
        T.StructField("first_digit", T.IntegerType()),
    ])
    digit_rows = [[digit] for digit in range(1, 10)]
    # Only nine rows, so collapse them into a single partition.
    all_digits_df = (
        spark
        .createDataFrame(digit_rows, schema=digit_schema)
        .coalesce(1)
    )

    events_per_user = df.groupBy(user_col).agg(F.count('*').alias('count'))
    active_users = (
        events_per_user
        .where(F.col('count') > event_threshold)
        .select(user_col)
        .repartition(user_col)
    )
    return active_users.crossJoin(all_digits_df)

def _generateFirstDigitShare(df, user_col, timestamp_col):
    """Compute, per user, the share of inter-event time deltas led by each digit.

    For every user the events are ordered by ``timestamp_col`` and the
    difference between each event and the previous one is taken. The first
    character of that difference (as a decimal string) is its first digit.

    Only strictly positive deltas are analyzed. A zero delta (two events with
    the same timestamp) has no leading significant digit; keeping it would
    leave it in the denominator of every share while its own digit-0 row is
    later discarded, so the shares of digits 1..9 would not sum to 1.

    Args:
        df: Event log data frame.
        user_col: Name of the column holding the user IDs.
        timestamp_col: Name of the column holding the event timestamps.
            The first-digit extraction relies on the integer string form of
            the delta.

    Returns:
        A data frame, partitioned by ``user_col``, with one row per user and
        observed first digit and the columns ``user_col``, ``first_digit``,
        ``first_digit_cum_dist``, ``frequency_count`` and
        ``first_digit_share``. Digits a user never produced have no row.
    """
    user_event_window = W.partitionBy(user_col).orderBy(timestamp_col)
    user_cum_dist_window = W.partitionBy(user_col).orderBy('first_digit')

    # Gap to the same user's previous event; null for the user's first event.
    event_time_delta = (
        F.col(timestamp_col) - F.lag(F.col(timestamp_col)).over(user_event_window)
    )

    first_digit_share = (
        df
        .select(
            user_col,
            timestamp_col,
            event_time_delta.alias('time_delta'),
        )
        # `> 0` drops both the null delta of each user's first event (the
        # comparison evaluates to null) and zero deltas (see docstring).
        .filter(F.col('time_delta') > 0)
        .withColumn(
            'first_digit',
            # Spark's substring() is 1-based: position 1 is the first character.
            F.substring(F.col('time_delta').cast(T.StringType()), 1, 1).cast(T.IntegerType())
        )
        .withColumn(
            'first_digit_cum_dist',
            # Fraction of the user's deltas whose first digit is <= this one.
            F.cume_dist().over(user_cum_dist_window)
        )
        # The cumulative value is identical for all rows sharing a digit, so
        # this yields exactly one row per (user, digit).
        .groupBy(user_col, 'first_digit', 'first_digit_cum_dist')
        .agg(
            F.count(timestamp_col).alias('frequency_count')
        )
        .withColumn(
            'first_digit_share',
            # Share of a digit = its cumulative value minus that of the
            # previous observed digit (0 for the smallest observed digit).
            F.col('first_digit_cum_dist')
                - F.coalesce(
                    F.lag('first_digit_cum_dist').over(user_cum_dist_window),
                    F.lit(0)
                )
        )
        .repartition(user_col)
    )
    return first_digit_share

def _expectedBenfordsShare():
    """Build the data frame of first-digit shares predicted by Benford's Law.

    Returns:
        A single-partition data frame with columns ``first_digit`` (integers
        1..9) and ``expected_share`` (``log10(d + 1) - log10(d)``).
    """
    share_schema = T.StructType([
        T.StructField('first_digit', T.IntegerType()),
        T.StructField('expected_share', T.DoubleType()),
    ])
    benford_rows = [
        (digit, log10(digit + 1) - log10(digit))
        for digit in range(1, 10)
    ]
    # Only nine rows, so collapse them into a single partition.
    return (
        spark
        .createDataFrame(benford_rows, schema=share_schema)
        .coalesce(1)
    )

def generateBenfordsLawAnalysis(df, user_col, timestamp_col, event_threshold = 100):
    """Score how closely each user's event timing follows Benford's Law.

    Args:
        df: Event log data frame with one row per user event.
        user_col: Name of the column in ``df`` holding the user IDs.
        timestamp_col: Name of the column in ``df`` holding the event
            timestamps.
        event_threshold: Only users with more than this many events are
            analyzed. Defaults to 100.

    Returns:
        A data frame with one row per analyzed user and the columns:

        * ``user_col`` - the user ID, under its original column name.
        * ``frequency_count`` - number of the user's time deltas counted
          across digits 1..9.
        * ``chi_squared`` - sum over digits of
          ``(observed_share - expected_share)^2 / expected_share``; values
          closer to zero are closer to the Benford distribution.
        * ``digit_share`` - list of the observed shares ordered from digit 1
          to digit 9.
    """
    user_digits_df = _getUsersAndDigit(df, user_col, event_threshold)
    first_digit_share_df = _generateFirstDigitShare(df, user_col, timestamp_col)
    expected_share_df = _expectedBenfordsShare()

    # The right join keeps all nine digit rows for every qualifying user (and
    # drops users below the threshold); digits a user never produced arrive as
    # nulls and are turned into zero counts/shares.
    finalized_first_digit_share_df = (
        first_digit_share_df
        .join(
            user_digits_df,
            on=[user_col, 'first_digit'],
            how='right'
        )
        .na.fill(0)
        .cache()
    )

    chi_squared_addend = (
        F.pow(
            (F.col('first_digit_share') - F.col('expected_share')),
            F.lit(2)
        ) / F.col('expected_share')
    )

    user_benford_distances = (
        finalized_first_digit_share_df
        .join(
            F.broadcast(expected_share_df),
            on='first_digit',
            how='inner'
        )
        .withColumn('chi_squared_addends', chi_squared_addend)
        .groupBy(user_col)
        .agg(
            F.sum('frequency_count').alias('frequency_count'),
            F.sum('chi_squared_addends').alias('chi_squared'),
            # collect_list() gives no ordering guarantee after the shuffle
            # that groupBy introduces, so a preceding orderBy cannot be
            # relied on. Collect (digit, share) pairs and sort them by digit
            # instead; the shares are projected out just below.
            F.sort_array(
                F.collect_list(F.struct('first_digit', 'first_digit_share'))
            ).alias('digit_share')
        )
        # Selecting a field of an array of structs yields the array of that
        # field's values, leaving the shares ordered from digit 1 to digit 9.
        .withColumn('digit_share', F.col('digit_share.first_digit_share'))
    )
    return user_benford_distances

In [None]:
# Score every comment author using the default event threshold.
# NOTE(review): this runs on reddit_df (comments only); combined_df built in an
# earlier cell is not used by any cell visible here - confirm that is intended.
new_df = generateBenfordsLawAnalysis(
    df=reddit_df,
    user_col='author',
    timestamp_col='created_utc',
)

# Preview the 50 authors with the highest chi squared score, i.e. those whose
# activity deviates most from the Benford's Law distribution.
new_df.sort(F.desc('chi_squared')).limit(50).toPandas()

In [None]:
# Persist the per-author scores as parquet, replacing any existing output.
(
    new_df.write
    .mode('overwrite')
    .parquet('qfs:///user/spark/reddit/author_bot_chi_squared_score/')
)