# Exploring alcohol-related content on Reddit

In [4]:
# Subreddits to analyse, in processing order. Each name is substituted into
# the per-subreddit input file name by the main analysis loop.
subreddits = [
    "alcohol",
    "stopdrinking",
    "cripplingalcoholism",
    "drugscirclejerk",
    "Drugs",
    "AmItheAsshole",
    "beer",
    "unpopularopinion",
    "drunk",
    "Alcoholism_Medication",
    "dryalcoholics",
    "AlcoholGifRecipes",
    "alcoholism",
    "alcoholicsanonymous",
    "Sober",
    "ShittyLifeProTips",
    "AlAnon",
    "AskReddit",
    "todayilearned",
    "Showerthoughts",
    "trees",
    "exmormon",
    "teenagers",
    "NoStupidQuestions",
    "explainlikeimfive",
    "science",
    "funny",
    "worldnews",
    "Art",
    "keto",
    "news",
    "politics",
    "askscience",
    "memes",
    "pics",
    "SkincareAddiction",
    "ireland",
    "australia",
    "Marijuana",
    "AskHistorians",
    "LifeProTips",
    "mildlyinteresting",
    "india",
    "kratom",
    "Jokes",
    "AdviceAnimals",
    "Homebrewing",
    "conspiracy",
    "Health",
    "loseit",
    "canada",
    "alcoholic",
    "alcoholabuse",
    "alcoholfreebeer",
    "alcoholismprotips",
    "AlcoholInkArt",
]

In [12]:
import json
import pandas as pd
from langdetect import detect

from wordcloud import WordCloud
import matplotlib.pyplot as plt

from nltk.probability import FreqDist

# Helper functions to clean up text by removing punctuation, stopwords, @mentions, URLs and &amp entities
from nltk.corpus import stopwords
from nltk.tokenize.casual import TweetTokenizer
import string

# Tokens starting with any of these prefixes (mentions, links, escaped
# ampersand entities) are dropped by clean_text.
STOP_PREFIXES = ("@", "http", "&amp")

# Built once here rather than once per token / per comment.
_PUNCTUATION = frozenset(string.punctuation)
_TOKENIZER = TweetTokenizer()
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it once."""
    if "english" not in _STOPWORD_CACHE:
        # stopwords.words() re-reads the corpus on every call, so keep the
        # result around; loading lazily preserves the original behaviour of
        # only touching the corpus when clean_text is first used.
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def clean_text(comment):
    """Return ``comment`` with noise tokens removed.

    The comment is split with NLTK's ``TweetTokenizer``. A token is dropped
    when it is a single ``string.punctuation`` character, appears in NLTK's
    English stopword list, or starts with one of ``STOP_PREFIXES``. The
    remaining tokens are joined with single spaces.

    Matching is exact and case-sensitive, so a capitalised token is kept even
    if its lowercase form is in the stopword list.

    Args:
        comment: The raw text to clean.

    Returns:
        The kept tokens joined by ``" "`` (an empty string if none are kept).
    """
    stop_words = _english_stopwords()
    kept = [
        word
        for word in _TOKENIZER.tokenize(comment)
        if word not in _PUNCTUATION
        and word not in stop_words
        and not word.startswith(STOP_PREFIXES)
    ]
    return " ".join(kept)

#helper functions to get sentiment scores
from nltk.sentiment import SentimentIntensityAnalyzer
# Single analyzer instance shared by all of the find_* helpers.
sia = SentimentIntensityAnalyzer()


def find_positive(comment):
    """Return the "pos" entry of the analyzer's polarity scores for ``comment``."""
    scores = sia.polarity_scores(comment)
    return scores["pos"]

def find_neutral(comment):
    """Return the "neu" entry of the analyzer's polarity scores for ``comment``."""
    scores = sia.polarity_scores(comment)
    return scores["neu"]

def find_negative(comment):
    """Return the "neg" entry of the analyzer's polarity scores for ``comment``."""
    scores = sia.polarity_scores(comment)
    return scores["neg"]

def find_compound(comment):
    """Return the "compound" entry of the analyzer's polarity scores for ``comment``."""
    scores = sia.polarity_scores(comment)
    return scores["compound"]

#main analyses
# For every subreddit: load its scraped records, keep the English comments and
# submission titles, clean them, report mean sentiment scores, and plot a word
# cloud plus a word-frequency chart.

# Maps each record type handled here to the field that carries its text.
TEXT_FIELD_BY_TYPE = {
    "snscrape.modules.reddit.Comment": "body",
    "snscrape.modules.reddit.Submission": "title",
}

for subreddit in subreddits:
    # The input file holds one JSON record per line; the context manager
    # makes sure the handle is closed before moving on.
    with open('reddit-@{}.json'.format(subreddit), 'r') as infile:
        records = [json.loads(line) for line in infile if line.strip()]

    reddit_text = []
    reddit_author = []

    for record in records:
        text_field = TEXT_FIELD_BY_TYPE.get(record["_type"])
        if text_field is None:
            # Record types other than comments/submissions are ignored.
            continue

        try:
            # Read both fields before appending anything so the text and
            # author lists can never end up with different lengths.
            body = record[text_field]
            author = record["author"]
            is_english = detect(body) == "en"
        except Exception:
            # Best-effort filter: records with missing fields or text that
            # language detection cannot handle are skipped. Unlike a bare
            # ``except``, this still lets KeyboardInterrupt through.
            continue

        if is_english:
            reddit_text.append(body)
            reddit_author.append(author)

    if not reddit_text:
        # Nothing to score or plot; the word cloud cannot be generated from
        # empty input.
        print("No English text found for {}; skipping".format(subreddit))
        continue

    data = pd.DataFrame({"text": reddit_text, "author": reddit_author})
    data["text"] = data["text"].apply(clean_text)

    #sentiment analysis
    # NOTE(review): scores are computed on the cleaned (stopword-stripped)
    # text, not the raw text — confirm this is intended.
    data["Positive"] = data["text"].apply(find_positive)
    data["Neutral"] = data["text"].apply(find_neutral)
    data["Negative"] = data["text"].apply(find_negative)
    data["Compound"] = data["text"].apply(find_compound)

    print("The mean positive sentiment score for {}: {}".format(subreddit, data["Positive"].mean()))
    print("The mean negative sentiment score for {}: {}".format(subreddit, data["Negative"].mean()))
    print("The mean neutral sentiment score for {}: {}".format(subreddit, data["Neutral"].mean()))

    #most frequent words
    words = " ".join(data["text"].values)
    word_cloud = WordCloud().generate(words)
    plt.imshow(word_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(subreddit)
    plt.show()

    # split() without an argument drops the empty tokens produced by
    # comments that were cleaned down to an empty string.
    freq = FreqDist(words.split())
    freq.plot(15, cumulative=False, title=subreddit)
    plt.show()
    # most_common() makes the descending-frequency ordering explicit.
    top_words = [word for word, _ in freq.most_common(20)]
    print("The most frequent 20 words in {}: {}".format(subreddit, str(top_words)))

            
        