In [139]:
import pandas as pd
import datetime as dt
import praw
from collections import Counter

import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.utils.extmath import randomized_svd
from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jongbusherpa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


To set up the Reddit API credentials used in the cells below, follow the directions at https://www.storybench.org/how-to-scrape-reddit-with-python/

In [141]:
import os

# Reddit API credentials are read from environment variables so that no secret
# is stored in (or shared along with) the notebook. Export these before
# starting Jupyter:
#   REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD
# A variable that is not set yields None here.
# NOTE(review): any credential that was ever hard-coded in this notebook should
# be treated as leaked and rotated.
PERSONAL_USE_SCRIPT_14_CHARS = os.environ.get('REDDIT_CLIENT_ID')
SECRET_KEY_27_CHARS = os.environ.get('REDDIT_CLIENT_SECRET')
YOUR_APP_NAME = 'NoSleepRecommender'  # not a secret; used as the user agent
YOUR_REDDIT_USER_NAME = os.environ.get('REDDIT_USERNAME')
YOUR_REDDIT_LOGIN_PASSWORD = os.environ.get('REDDIT_PASSWORD')

In [142]:
# Build the PRAW client from the credential constants defined in the
# previous cell. All arguments are passed by keyword, so their order is
# irrelevant; they are grouped here as app credentials, account, user agent.
reddit = praw.Reddit(
    client_id=PERSONAL_USE_SCRIPT_14_CHARS,
    client_secret=SECRET_KEY_27_CHARS,
    username=YOUR_REDDIT_USER_NAME,
    password=YOUR_REDDIT_LOGIN_PASSWORD,
    user_agent=YOUR_APP_NAME,
)

In [143]:
# Sanity check that authentication worked: look up the account the client is
# logged in as and print it (the output will contain the Reddit username).
current_user = reddit.user.me()
print(current_user)

josephs7


In [144]:
# Handle for the 'nosleep' subreddit; the stories scraped below come from it.
subreddit = reddit.subreddit('nosleep')

# Getting Stories and Comments

In [145]:
# Column-oriented accumulator for the scraped posts: one independent, initially
# empty list per field. The key order fixes the column order of the DataFrame
# built from it later.
stories_dict = {column: [] for column in ("story_id", "title", "author", "body")}

In [146]:
# Request up to 1000 posts from the subreddit's "hot" listing and record the
# fields we need, one list entry per post.
# NOTE: re-running this cell appends to the existing lists; re-run the cell
# that creates `stories_dict` first to start from empty lists.
my_subreddit = subreddit.hot(limit=1000)
for submission in my_subreddit:
    # submission.author is a PRAW Redditor object, or None when the account
    # has been deleted. Store the plain username so the DataFrame holds
    # ordinary strings instead of live API objects (which carry a reference
    # to the authenticated client).
    redditor = submission.author
    author_name = redditor.name if redditor is not None else None

    stories_dict["title"].append(submission.title)
    stories_dict["body"].append(submission.selftext)
    stories_dict["author"].append(author_name)
    stories_dict["story_id"].append(submission.id)

In [147]:
# Turn the scraped columns into a DataFrame, keeping only the first 500 posts,
# then preview the first few rows.
story_df = pd.DataFrame(stories_dict).head(500)
story_df.head()

Unnamed: 0,story_id,title,author,body
0,fecu80,January 2020 Winners!,poppy_moonray,
1,fmxcnp,February 2020 Voting Thread,TheCusterWolf,
2,fmz97y,"I’m a leaker, but the document I’m disclosing ...",TheVaticanArchivist,*The document that I’m about to leak contains ...
3,fmzcxo,JIM'S NOT HERE,Max-Voynich,"I leave dinner to pick up the phone, cupping m..."
4,fmtlbu,How to Survive Camping: the town should fear me,fainting--goat,I run a private campground. Last time I told ...


In [148]:
# Rows 0 and 1 are the contest winners / voting threads shown in the preview
# above; their body is empty, so they are removed before modelling.
# errors='ignore' keeps this cell idempotent: once the labels are gone,
# re-running it is a no-op instead of raising KeyError.
story_df = story_df.drop(index=[0, 1], errors='ignore')
story_df.head(100)

Unnamed: 0,story_id,title,author,body
2,fmz97y,"I’m a leaker, but the document I’m disclosing ...",TheVaticanArchivist,*The document that I’m about to leak contains ...
3,fmzcxo,JIM'S NOT HERE,Max-Voynich,"I leave dinner to pick up the phone, cupping m..."
4,fmtlbu,How to Survive Camping: the town should fear me,fainting--goat,I run a private campground. Last time I told ...
5,fn12rb,I’m a Retired Priest. Over the years I’ve hear...,Colourblindness,| [First Sin](https://www.reddit.com/r/nosleep...
6,fn577x,I work at an amusement park. Only half of the ...,girl_from_the_crypt,I should start off by explaining a couple thin...
...,...,...,...,...
97,fmg86g,Every times I punish my son for watching TV or...,Limited_Life,As the title says. Every times I punish my son...
98,fmg5fa,Something's wrong and for once it isn't the pe...,ArgonSteel,"The time is 06:50, March 21st of 2020...\n\nYo..."
99,fm5p1x,"I tried lucid dreaming, it wasn't worth it.",Teffler,I had heard about lucid dreaming through a fri...
100,fm648g,It Creeps,BLACKMASS81,\nI did not believe in the supernatural. Ghos...


# Testing LDA

In [150]:
# Clean the story text into a new 'processed' column:
#   1. strip basic punctuation (comma, period, '!' and '?'),
#   2. convert everything to lower case.
# The pattern is a raw string, and '.' needs no escaping inside a character
# class; the previous '[,\.!?]' in a plain string was an invalid escape
# sequence, which newer Python versions warn about.
story_df['processed'] = (
    story_df['body']
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)

story_df.head(100)

Unnamed: 0,story_id,title,author,body,processed
2,fmz97y,"I’m a leaker, but the document I’m disclosing ...",TheVaticanArchivist,*The document that I’m about to leak contains ...,*the document that i’m about to leak contains ...
3,fmzcxo,JIM'S NOT HERE,Max-Voynich,"I leave dinner to pick up the phone, cupping m...",i leave dinner to pick up the phone cupping my...
4,fmtlbu,How to Survive Camping: the town should fear me,fainting--goat,I run a private campground. Last time I told ...,i run a private campground last time i told y...
5,fn12rb,I’m a Retired Priest. Over the years I’ve hear...,Colourblindness,| [First Sin](https://www.reddit.com/r/nosleep...,| [first sin](https://wwwredditcom/r/nosleep/c...
6,fn577x,I work at an amusement park. Only half of the ...,girl_from_the_crypt,I should start off by explaining a couple thin...,i should start off by explaining a couple thin...
...,...,...,...,...,...
97,fmg86g,Every times I punish my son for watching TV or...,Limited_Life,As the title says. Every times I punish my son...,as the title says every times i punish my son ...
98,fmg5fa,Something's wrong and for once it isn't the pe...,ArgonSteel,"The time is 06:50, March 21st of 2020...\n\nYo...",the time is 06:50 march 21st of 2020\n\nyou've...
99,fm5p1x,"I tried lucid dreaming, it wasn't worth it.",Teffler,I had heard about lucid dreaming through a fri...,i had heard about lucid dreaming through a fri...
100,fm648g,It Creeps,BLACKMASS81,\nI did not believe in the supernatural. Ghos...,\ni did not believe in the supernatural ghost...


# Creating vocabulary of all words in our data

In [158]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix with scikit-learn's CountVectorizer.
#   * stop_words='english' - drop English stop words, which carry little
#                            topical signal;
#   * min_df=3             - ignore terms found in fewer than 3 documents;
#   * max_df=0.8           - ignore terms found in more than 80% of documents.
count_vect = CountVectorizer(stop_words='english', min_df=3, max_df=0.8)

# The raw 'body' column (not 'processed') is what gets vectorized; it is cast
# to a unicode array first.
body_texts = story_df['body'].values.astype('U')
doc_term_matrix = count_vect.fit_transform(body_texts)

In [152]:
# Display the sparse document-term matrix summary (shape and stored elements).
doc_term_matrix

<498x13606 sparse matrix of type '<class 'numpy.int64'>'
	with 212038 stored elements in Compressed Sparse Row format>

# Creating topics with probability distribution for each word in our vocabulary

In [164]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a Latent Dirichlet Allocation model on the document-term matrix.
#   * n_components - number of topics the stories are divided into;
#   * random_state - seed, so repeated runs give the same topics.
# NOTE: the name `LDA` is also bound to the class alias in the imports cell;
# from here on it refers to this fitted model instance, which the later cells
# rely on.
LDA = LatentDirichletAllocation(random_state=80, n_components=10).fit(doc_term_matrix)
LDA

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=80, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

# Randomly fetching words from our vocabulary

In [165]:
import random

# Peek at ten randomly chosen vocabulary entries.
# The vocabulary list is built once and random.choice picks a valid element
# directly. random.randint(0, len(...)) includes its upper bound, so it could
# produce an index one past the end of the list and raise IndexError.
feature_names = count_vect.get_feature_names()
for _ in range(10):
    print(random.choice(feature_names))

efforts
truly
circles
bowl
loved
tree
illuminating
disinfectant
trapdoor
agreement


# Find 10 words with the highest probability for the first topic

In [166]:
# Row 0 of components_ holds the per-word values for the first topic.
first_topic = LDA.components_[0]

# argsort() orders the word indexes by ascending value, so the final ten
# entries are the indexes of the ten highest-valued words (largest one last).
top_topic_words = first_topic.argsort()[-10:]

# Translate the indexes into words and print them one per line.
vocabulary = count_vect.get_feature_names()
print('\n'.join(vocabulary[word_idx] for word_idx in top_topic_words))

door
went
day
told
asked
got
did
looked
don
said


# Printing 10 words with highest probabilities for all ten topics

In [167]:
# For every topic, list its ten highest-valued words (ascending, so the
# strongest word comes last). The vocabulary is fetched once up front, and the
# comprehension uses its own index name rather than reusing the loop's `i`.
vocabulary = count_vect.get_feature_names()
for i, topic in enumerate(LDA.components_):
    top_word_indexes = topic.argsort()[-10:]
    print(f'Top 10 words for topic #{i}:')
    print([vocabulary[word_idx] for word_idx in top_word_indexes])
    print('\n')

Top 10 words for topic #0:
['door', 'went', 'day', 'told', 'asked', 'got', 'did', 'looked', 'don', 'said']


Top 10 words for topic #1:
['eyes', 'sam', 'felt', 'room', 'way', 'door', 'head', 'looked', 'mark', 'frankie']


Top 10 words for topic #2:
['long', 'knife', 'did', 'voice', 'head', 'eyes', 've', 'cheryl', 'man', 'michael']


Top 10 words for topic #3:
['ray', 'tim', 'got', 'beth', 'james', 'room', 'looked', 'asked', 'door', 'said']


Top 10 words for topic #4:
['eyes', 'man', 'town', 'don', 'people', 'roger', 'said', 'old', 'house', 'felt']


Top 10 words for topic #5:
['security', 'night', 'room', 'thing', 'minutes', 'creature', 'school', 'rules', 'said', 'door']


Top 10 words for topic #6:
['eyes', 'looked', 'got', 'cole', 'thing', 'said', 'kyle', 'girlfriend', 'grandma', 'dog']


Top 10 words for topic #7:
['home', 'went', 'got', 'night', 'really', 've', 'house', 'door', 'don', 'room']


Top 10 words for topic #8:
['body', 'night', 'looked', 'way', 'head', 'felt', 'said', '

# Adding topic to the stories

In [168]:
# Run the fitted model on the document-term matrix: one row per story, one
# column per topic.
topic_values = LDA.transform(doc_term_matrix)

# Label each story with the index of its highest-scoring topic and store it in
# a new 'Topic' column.
dominant_topic = topic_values.argmax(axis=1)
story_df['Topic'] = dominant_topic
story_df.head(100)

Unnamed: 0,story_id,title,author,body,processed,Topic
2,fmz97y,"I’m a leaker, but the document I’m disclosing ...",TheVaticanArchivist,*The document that I’m about to leak contains ...,*the document that i’m about to leak contains ...,8
3,fmzcxo,JIM'S NOT HERE,Max-Voynich,"I leave dinner to pick up the phone, cupping m...",i leave dinner to pick up the phone cupping my...,9
4,fmtlbu,How to Survive Camping: the town should fear me,fainting--goat,I run a private campground. Last time I told ...,i run a private campground last time i told y...,4
5,fn12rb,I’m a Retired Priest. Over the years I’ve hear...,Colourblindness,| [First Sin](https://www.reddit.com/r/nosleep...,| [first sin](https://wwwredditcom/r/nosleep/c...,0
6,fn577x,I work at an amusement park. Only half of the ...,girl_from_the_crypt,I should start off by explaining a couple thin...,i should start off by explaining a couple thin...,6
...,...,...,...,...,...,...
97,fmg86g,Every times I punish my son for watching TV or...,Limited_Life,As the title says. Every times I punish my son...,as the title says every times i punish my son ...,5
98,fmg5fa,Something's wrong and for once it isn't the pe...,ArgonSteel,"The time is 06:50, March 21st of 2020...\n\nYo...",the time is 06:50 march 21st of 2020\n\nyou've...,0
99,fm5p1x,"I tried lucid dreaming, it wasn't worth it.",Teffler,I had heard about lucid dreaming through a fri...,i had heard about lucid dreaming through a fri...,8
100,fm648g,It Creeps,BLACKMASS81,\nI did not believe in the supernatural. Ghos...,\ni did not believe in the supernatural ghost...,9
