# In [21]:  (Jupyter notebook cell marker)
#!/usr/bin/env python3
from datetime import datetime as dt
import csv
import time
import json
import emoji
import re
import urllib.request
from emoji.unicode_codes import UNICODE_EMOJI
from collections import Counter
import timeit

# Record the wall-clock start so the total run time can be printed at the end of the script.
start_time = time.time()

# Start-up marker printed before any work is done.
print("hello")
    
def extract_emojis(a_string):
    """Collects the UNICODE_EMOJI entry for every emoji character in a string

    The string is scanned one character at a time, so only characters that are
    themselves keys of UNICODE_EMOJI are matched.

    Parameters
    ----------
    a_string: str
        string that is searched for emojis

    Returns
    -------
    list
        value stored in UNICODE_EMOJI for each matching character, in order of
        appearance (one entry per occurrence); empty when nothing matches
    """
    return [UNICODE_EMOJI[char] for char in a_string if char in emoji.UNICODE_EMOJI]

def extract_profanity(a_string):
    """Counts how often each tracked swear word occurs in a string

    Matching is case-insensitive and substring based (str.count), so a tracked
    word embedded inside a longer word is counted as well.

    Parameters
    ----------
    a_string: str
        string that is searched for profanity

    Returns
    -------
    list
        one count per tracked word, in the fixed order of the word list below
    """
    # lower-case once so the search ignores capitalisation
    lowered = a_string.lower()
    # words to look for; the position in this tuple is the position in the result
    tracked_words = ('fuck', 'shit', 'bitch', 'dick')
    return [lowered.count(word) for word in tracked_words]


def extract_commentData(comments, limit, subreddit):
    """Builds a balanced sample of per-comment rows from one batch of PushShift comments

    Every comment containing an emoji is kept. A comment with profanity but no
    emoji is kept only while fewer profanity comments than emoji comments have
    been kept, and a comment with neither is kept only while fewer such comments
    than emoji comments have been kept. Each comment lands in at most one of the
    three groups, so a row is never appended twice. The balancing counters are
    local to this call, i.e. they restart for every batch.

    Parameters
    ----------
    comments: list
        list containing a dictionary for each comment; the keys 'body', 'score'
        and 'created_utc' are read
    limit: int
        number of rows after which the batch stops being read
    subreddit: string
        subreddit the comments originated from (copied into every row)

    Returns
    -------
    commentData: list
        list of rows [score, emoji count, total profanity count, word count, subreddit]
    lastTime: int or None
        'created_utc' of the last comment the loop reached (when the limit stops
        the loop, this is the comment at which it stopped); None if comments is empty
    totalEmojis: list
        UNICODE_EMOJI entries of all emojis found in every comment examined,
        whether or not the comment was kept
    totalProfanity: list
        per-word profanity totals over every comment examined; empty if no
        comment was examined
    commentCount: int
        number of comments examined. Will likely be larger than the number of
        rows returned, unless the subreddit has a very high frequency of emojis
    """
    # initialize storage
    commentData = []
    emojiCount = 0
    profanityCount = 0
    normalCount = 0
    commentCount = 0
    totalEmojis = []
    totalProfanity = []
    # stays None when comments is empty, so lastTime can be reported as None
    # instead of failing on an unbound loop variable
    comment = None

    for comment in comments:
        # stop once enough rows have been collected
        if len(commentData) >= limit:
            break

        body = comment['body']

        # emojis are accumulated for every examined comment, kept or not
        emojis = extract_emojis(body)
        totalEmojis += emojis

        # same for the per-word profanity counts
        profanityArray = extract_profanity(body)
        totalProfanity.append(profanityArray)
        profanitySum = sum(profanityArray)

        individualCommentData = [
            comment['score'],
            len(emojis),
            profanitySum,
            len(body.split()),
            subreddit,
        ]

        # the three groups are mutually exclusive (elif), so a profane comment
        # can no longer also be appended a second time as a "normal" comment
        if emojis:
            commentData.append(individualCommentData)
            emojiCount += 1
        elif profanitySum > 0:
            # keep profanity comments only while they are outnumbered by emoji comments
            if profanityCount < emojiCount:
                commentData.append(individualCommentData)
                profanityCount += 1
        elif normalCount < emojiCount:
            # neither emojis nor profanity: keep only while outnumbered by emoji comments
            commentData.append(individualCommentData)
            normalCount += 1

        # counts every comment examined, kept or not
        commentCount += 1

    lastTime = comment['created_utc'] if comment is not None else None
    # column-wise sum of the per-comment profanity counts
    totalProfanity = [sum(counts) for counts in zip(*totalProfanity)]

    return commentData, lastTime, totalEmojis, totalProfanity, commentCount

def extract_all_commentData(after, before, subreddit, limit):
    """Queries PushShift in batches and accumulates the rows produced by extract_commentData

    Each request asks for 1000 comments. Batches are requested until the
    requested number of rows has been collected or PushShift returns no more
    comments. No request is made once the limit is reached (or when limit is
    not positive to begin with).

    Parameters
    ----------
    after: int or str
        starting time to query comments from; after the first batch it is
        replaced by the lastTime value returned by extract_commentData
    before: int or str
        ending time to query comments from
    subreddit: string
        subreddit to pull comments from
    limit: int
        number of rows to collect

    Returns
    -------
    allCommentData: list
        CSV header row followed by the rows collected from all batches
    totalCommentCount: int
        total comments examined. Will likely be larger than limit, unless the
        subreddit has a very high frequency of emojis
    totalEmojis: list
        UNICODE_EMOJI entries of all emojis found in all comments examined
    totalProfanity: list
        per-word profanity totals over all comments examined
    """
    # local import keeps this function self-contained; urlencode escapes the query values
    from urllib.parse import urlencode

    # first line of csv
    allCommentData = [['score', 'emojiCount', 'profanityCount', 'wordCount', 'subreddit']]
    # initialize storage
    totalCommentCount = 0
    totalEmojis = []
    totalProfanity = [0, 0, 0, 0]

    # checking the limit before requesting avoids one wasted request per subreddit
    while limit > 0:
        query = urlencode({'subreddit': subreddit, 'size': 1000, 'after': after, 'before': before})
        with urllib.request.urlopen("https://api.pushshift.io/reddit/comment/search/?" + query) as url:
            pushShiftData = json.loads(url.read().decode())

        comments = pushShiftData["data"]
        # an empty batch means there is nothing left in the time window
        if not comments:
            break

        # `after` moves forward so the next request continues where this batch stopped
        commentData, after, emojis, profanity, commentCount = extract_commentData(comments, limit, subreddit)
        limit -= len(commentData)
        allCommentData += commentData
        totalEmojis += emojis
        totalProfanity = [running + batch for running, batch in zip(totalProfanity, profanity)]
        totalCommentCount += commentCount

    return allCommentData, totalCommentCount, totalEmojis, totalProfanity

def writeData(after, before, subredditArray, limit):
    """Runs extract_all_commentData for each subreddit and writes the results to disk

    One CSV per subreddit is written to ../analysis/data/<subreddit>.csv (an
    empty subreddit name is written as 'all'). A summary per subreddit (total
    comments, emoji count, profanity counts and the 3 most common emojis) is
    collected and all summaries are written to ../analysis/data/countData.json.

    Parameters
    ----------
    after: int
        starting time to query comments from. Passed to extract_all_commentData
    before: int
        ending time to query comments from. Passed to extract_all_commentData
    subredditArray: list
        subreddits to pull comments from. Each is passed to extract_all_commentData
    limit: int
        limit of comments to retrieve and save. Passed to extract_all_commentData
    """
    # censored labels for the summary; positions line up with the profanity totals
    censoredLabels = ['f---', 's---', 'b----', 'd---']
    summaries = []

    for subreddit in subredditArray:
        rows, commentTotal, emojiNames, profanityTotals = extract_all_commentData(after, before, subreddit, limit)

        # an empty name is stored under 'all' in the file name and the summary
        if len(subreddit) == 0:
            subreddit = 'all'

        # per-subreddit CSV, named after the subreddit
        csvPath = '../analysis/data/' + subreddit + '.csv'
        with open(csvPath, 'w') as outcsv:
            writer = csv.writer(outcsv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            writer.writerows(rows)

        # 3 most common emojis in this subreddit
        topEmojis = dict(Counter(emojiNames).most_common(3))

        # key order: totals, then per-word profanity, then top emojis, then subreddit name
        summary = {
            'totalComments': commentTotal,
            'emojiCount': len(emojiNames),
            'profanityCount': sum(profanityTotals),
        }
        for position, label in enumerate(censoredLabels):
            summary[label] = profanityTotals[position]
        summary.update(topEmojis)
        summary['subreddit'] = subreddit
        summaries.append(summary)

        print(subreddit + " finished")

    # one JSON file holding the summaries of all subreddits
    with open('../analysis/data/countData.json', 'w') as outjson:
        json.dump(summaries, outjson)
        
        
#########################################################################################################################

# Alternative, larger set of subreddits kept for reference (currently disabled):
# subreddits = ['funny', 'changemyview', 'dataisbeautiful', 'nba', 'emojipasta']
subreddits = ['dataisbeautiful']
# Collect comment rows for each subreddit with a target of 1500 rows, then write
# the per-subreddit CSV and the combined countData.json.
# NOTE(review): the time window is passed as date strings and inserted into the
# PushShift query as-is — confirm the API accepts this format.
writeData('2019-04-27', '2019-06-28', subreddits, 1500)

# Report the elapsed wall-clock time since start_time was recorded.
print("")
print("--- %s seconds ---" % (time.time() - start_time))
print("")
print('done')


# Output of the run above:
# hello
# dataisbeautiful finished
#
# --- 89.21494483947754 seconds ---
#
# done
