In [1]:
import random
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# List of top 25 subreddit names to read files
subredditNames = ["AmItheAsshole", "AskReddit", "Damnthatsinteresting", "DestinyTheGame", 
                  "Home", "LivestreamFail", "NoStupidQuestions", "PublicFreakout", "Unexpected", 
                  "WhitePeopleTwitter", "antiwork", "diablo4", "explainlikeimfive", "facepalm", 
                  "funny", "gaming", "interestingasfuck", "leagueoflegends", "mildlyinfuriating", 
                  "movies", "pcmasterrace", "pics", "therewasanattempt", "videos", "worldnews"]

In [3]:
# Updates the true count of tokens in subreddit
def UpdateTrueCount(trueCount, token):
    if token in trueCount:
        trueCount[token] += 1
    else:
        trueCount[token] = 1

In [4]:
def FalsePositiveRate(buffer, trueCount, m, k):
    fpr = 0
    for key in buffer.keys():
        if trueCount[key] < (m / k):
            fpr += 1
        
    return fpr / len(buffer)

In [9]:
# Prints out the first n tokens with the highest count in a subreddit
def nHighestCount(trueCount, n):
    # Sort the trueCount
    sortedDict = {k: v for k, v in sorted(trueCount.items(), key=lambda item: item[1], reverse=True)}
    
    # Print first n values
    print({k: sortedDict[k] for k in list(sortedDict)[:n]})

In [5]:
# Basic Misra-Gries algorithm
def MisraGries(token, buffer, k): 
    if token in buffer:
        buffer[token] = buffer[token] + 1
    elif len(buffer) < k-1:
        buffer[token] = 1
    else:
        keys = list(buffer.keys())
        for key in keys:
            buffer[key] -= 1
            if buffer[key] == 0:
                del buffer[key]

In [6]:
# Needs uncommenting if not downloaded
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('wordnet')

wordnet_lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')
stop.append("like")
stop.append("get")

# Checks if a word is a common word, url, and lemmatizes the word
def CommonWord(word):
    isCommonWord = False

    if len(word) <= 1 or word in stop: isCommonWord = True
    elif word.startswith('https://') or word.startswith('http://'): isCommonWord = True

    # Performs Lemmatization
    word = wordnet_lemmatizer.lemmatize(word)
    
    return word, isCommonWord

In [7]:
# Buffer information for all subreddits combined
fullBuffer = {}
fullCount = {}
fullK = 801
fullM = 0

# Perform Misra-Gries on each subreddit separately
for subreddit in subredditNames:
    print(subreddit)
    with open('Reddit_Comments/' + subreddit + '.txt') as f:
        buffer = {}
        trueCount = {}
        k = 801 # Buffer size of 800 chosen after testing false-positive rates
        m = 0
        for line in f:
            tokens = line.split()
            random.shuffle(tokens) # Randomizes the tokens for each reply
            for token in tokens:
                token, isCommonWord = CommonWord(token)
                if isCommonWord: continue # Don't run through algorithm if word is "common"
                
                m += 1
                MisraGries(token, buffer, k)
                
                fullM += 1
                MisraGries(token, fullBuffer, fullK)
                
                UpdateTrueCount(trueCount, token)
                UpdateTrueCount(fullCount, token)
            
#     print(buffer)
#     nHighestCount(trueCount, 10)
    print("False Positive Rate: ", FalsePositiveRate(buffer, trueCount, m, k), '\n')

AmItheAsshole
False Positive Rate:  0.7090909090909091 

AskReddit
False Positive Rate:  0.8105436573311368 

Damnthatsinteresting
False Positive Rate:  0.8230088495575221 

DestinyTheGame
False Positive Rate:  0.7767722473604827 

Home
False Positive Rate:  0.8127544097693351 

LivestreamFail
False Positive Rate:  0.6882217090069284 

NoStupidQuestions
False Positive Rate:  0.7216035634743875 

PublicFreakout
False Positive Rate:  0.7712177121771218 

Unexpected
False Positive Rate:  0.8215712383488681 

WhitePeopleTwitter
False Positive Rate:  0.7218390804597701 

antiwork
False Positive Rate:  0.7293577981651376 

diablo4
False Positive Rate:  0.7670250896057348 

explainlikeimfive
False Positive Rate:  0.8360655737704918 

facepalm
False Positive Rate:  0.6588235294117647 

funny
False Positive Rate:  0.6917808219178082 

gaming
False Positive Rate:  0.779467680608365 

interestingasfuck
False Positive Rate:  0.7711538461538462 

leagueoflegends
False Positive Rate:  0.836998706338

In [10]:
# Print out the FPR and top 25 highest used words across all 25 subreddits
print("False Positive Rate: ", FalsePositiveRate(fullBuffer, fullCount, fullM, fullK), '\n')
nHighestCount(fullCount, 25)

False Positive Rate:  0.7670588235294118 

{'people': 9077, 'one': 8795, 'would': 8517, 'time': 7365, 'make': 5857, 'think': 5836, 'game': 5663, 'know': 5379, 'even': 5345, 'thing': 5261, 'want': 4798, 'good': 4769, 'need': 4705, 'go': 4628, 'year': 4605, 'also': 4472, 'really': 4361, 'way': 4350, 'see': 4332, 'look': 3915, 'still': 3879, 'much': 3873, 'work': 3817, 'could': 3787, 'going': 3735}
