###  1. Imports

In [3]:
import os
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np
import praw
from dotenv import load_dotenv

load_dotenv()

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
ps = nltk.PorterStemmer()
from nltk.tokenize import word_tokenize

import re

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aligo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aligo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Define functions to get subreddit posts data from Reddit and write it to files

In [2]:
user_agent = "Scraper 1.0 by /u/Alliix"
reddit = praw.Reddit(
  client_id=os.environ['CLIENT_ID'],
  client_secret=os.environ['CLIENT_SECRET'],
  user_agent=user_agent
)

In [39]:
def cleanPostsData(postsFromCsv):
#     clean posts data
    postsProcessed = []

#     RegEx
    zeroSpaceWidth = re.compile(r'&#x200B')
    urls = re.compile(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', re.IGNORECASE)
    numbers = re.compile(r'\d+(\.\d+)?')
    punctuation = re.compile(r'[^\w\d\s]')
    whitespaces = re.compile(r'\s+')
    leadTrailWhitespace = re.compile(r'^\s+|\s+?$')

    for d in postsFromCsv.Post:
        if(type(d)==str):
            x = d
            # Replace zero width space with ' '
            x = zeroSpaceWidth.sub(' ',x)
            # Replace URLs with 'url'
            x = urls.sub('url',x)
            # Replace numbers with 'nmbr'
            x = numbers.sub('nmbr',x)
            # Remove punctuation
            x = punctuation.sub(' ',x)
            # Replace whitespace between terms with ' '
            x = whitespaces.sub(' ',x)
            # Remove leading and trailing whitespace
            x = leadTrailWhitespace.sub(' ',x)

            # remove stop words from text messages
            text_tokens = word_tokenize(x)
            tokens_without_sw = [word for word in text_tokens if not word in stop_words]

            # Remove word stems using a Porter stemmer
            tokens_without_ws = [ps.stem(word) for word in tokens_without_sw]

            x = (" ").join(tokens_without_ws)

            postsProcessed.append(x)
        else: 
            postsProcessed.append(d)
            
    return postsProcessed


In [40]:
def getAndCleanPostsData(subreddit, postOutputFile, processedPostOutputFile):
#     get
    posts = set()
    for submission in reddit.subreddit(subreddit).new(limit=None):
        posts.add(submission)
    postsLength = len(posts)
    
#     write posts to file
    data = []
    for post in posts:
        data.append({'Author': post.author, 'Subreddit': post.subreddit, 'Date': post.created_utc, 'Title': post.title, 'Post': post.selftext})    
    postsData = pd.DataFrame(data, columns=['Author', 'Subreddit', 'Date', 'Title', 'Post'])
    postsData.to_csv(postOutputFile, index=False)
    
    postsFromCsv = pd.read_csv(postOutputFile)
    postsProcessed = cleanPostsData(postsFromCsv)
    
#     write processed posts data to file
    processedData = []
    for i in range(0, postsLength):
        processedData.append({'Author': postsFromCsv.Author[i], 'Subreddit': postsFromCsv.Subreddit[i], 'Date': postsFromCsv.Date[i], 'Title': postsFromCsv.Title[i], 'Post': postsProcessed[i]})

    postsData = pd.DataFrame(processedData, columns=['Author', 'Subreddit', 'Date', 'Title', 'Post'])
    postsData.to_csv(processedPostOutputFile, index=False)
    
    return postsLength

### 3. Run the functions to get data from subreddits

In [41]:
# r/depression
depressedPostsLength = getAndCleanPostsData('depression', 'depressed_posts.csv', 'depressed_posts_processed.csv')
print(depressedPostsLength)

1000


In [42]:
# r/unpopularopinion
unpopularopinionPostsLength = getAndCleanPostsData('unpopularopinion', 'unpopularopinion_posts.csv', 'unpopularopinion_posts_processed.csv')
print(unpopularopinionPostsLength)

989


In [43]:
# r/lonely
lonelyPostsLength = getAndCleanPostsData('lonely', 'lonely_posts.csv', 'lonely_posts_processed.csv')
print(lonelyPostsLength)

979


In [44]:
# r/MachineLearning
machinelearningPostsLength = getAndCleanPostsData('machinelearning', 'machinelearning_posts.csv', 'machinelearning_posts_processed.csv')
print(machinelearningPostsLength)

979


### 4. Generating Features function
Features will be the most common words in posts.

In [45]:
def getWordFrequency(processedPostOutputFile):
    posts = pd.read_csv(processedPostOutputFile)

    allPostsConcat = ''
    for post in posts.Post:
        if(type(post)==str):
            allPostsConcat+=post

    # create bag-of-words
    all_words = []

    words = word_tokenize(allPostsConcat)
    for word in words:
        all_words.append(word)

    all_words = nltk.FreqDist(all_words)
    
    return all_words

### 5. Generate Word Frequency for Subreddits

In [46]:
# r/depression
depressionFreqWords = getWordFrequency('depressed_posts_processed.csv')

# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(depressionFreqWords)))
print('Most common words: {}'.format(depressionFreqWords.most_common(15)))

Number of words: 6235
Most common words: [('i', 11562), ('feel', 1547), ('like', 1268), ('nmbr', 973), ('want', 937), ('get', 886), ('know', 842), ('life', 734), ('depress', 683), ('even', 668), ('time', 634), ('go', 588), ('think', 525), ('year', 522), ('day', 515)]


In [47]:
# r/unpopularopinion
unpopularopinionFreqWords = getWordFrequency('unpopularopinion_posts_processed.csv')

# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(unpopularopinionFreqWords)))
print('Most common words: {}'.format(unpopularopinionFreqWords.most_common(15)))

Number of words: 8396
Most common words: [('i', 2799), ('peopl', 939), ('like', 915), ('nmbr', 726), ('get', 603), ('it', 454), ('make', 443), ('think', 421), ('one', 394), ('want', 358), ('thing', 358), ('say', 345), ('time', 343), ('even', 337), ('the', 323)]


In [48]:
# r/lonely
lonelyFreqWords = getWordFrequency('lonely_posts_processed.csv')

# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(lonelyFreqWords)))
print('Most common words: {}'.format(lonelyFreqWords.most_common(15)))

Number of words: 5676
Most common words: [('i', 8824), ('feel', 1158), ('like', 1097), ('friend', 1015), ('nmbr', 848), ('want', 806), ('peopl', 758), ('get', 653), ('know', 606), ('talk', 600), ('one', 599), ('time', 585), ('go', 569), ('life', 552), ('even', 532)]


In [49]:
# r/machinelearning
machinelearningFreqWords = getWordFrequency('machinelearning_posts_processed.csv')

# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(machinelearningFreqWords)))
print('Most common words: {}'.format(machinelearningFreqWords.most_common(15)))

Number of words: 9351
Most common words: [('nmbr', 4430), ('http', 2531), ('i', 2283), ('com', 1581), ('model', 980), ('use', 916), ('learn', 740), ('data', 602), ('the', 525), ('train', 521), ('reddit', 509), ('week', 485), ('paper', 466), ('www', 446), ('github', 434)]


### 6. Generate features for all posts
We will tokenize each word and will use the 2000 most common words as features.

In [50]:
# Save all posts to one file

depressedData = pd.read_csv('depressed_posts_processed.csv')
unpopularopinionData = pd.read_csv('unpopularopinion_posts_processed.csv')
lonelyData = pd.read_csv('lonely_posts_processed.csv')
machinelearningData = pd.read_csv('machinelearning_posts_processed.csv')

allPosts = depressedData.append(unpopularopinionData).append(lonelyData).append(machinelearningData)

allPosts.to_csv('all_posts_processed.csv', index=False)

In [51]:
# check subreddit distribution

subreddits = allPosts.Subreddit
print(subreddits.value_counts())

depression          1000
unpopularopinion     989
lonely               979
MachineLearning      979
Name: Subreddit, dtype: int64


In [53]:
all_words_combined = getWordFrequency('all_posts_processed.csv')

# print the total number of words and the 100 most common words
print('Number of words: {}'.format(len(all_words_combined)))
print('Most common words: {}'.format(all_words_combined.most_common(100)))

Number of words: 18226
Most common words: [('i', 25466), ('nmbr', 6977), ('like', 3684), ('feel', 3050), ('http', 2595), ('peopl', 2340), ('get', 2325), ('want', 2295), ('know', 1932), ('time', 1850), ('one', 1749), ('make', 1722), ('even', 1658), ('com', 1625), ('friend', 1585), ('go', 1571), ('use', 1520), ('would', 1506), ('life', 1489), ('think', 1488), ('it', 1452), ('thing', 1354), ('realli', 1320), ('tri', 1251), ('the', 1251), ('year', 1242), ('work', 1165), ('day', 1161), ('talk', 1084), ('much', 1004), ('model', 994), ('never', 973), ('someon', 963), ('say', 960), ('way', 916), ('see', 914), ('good', 898), ('need', 888), ('learn', 884), ('depress', 851), ('help', 823), ('also', 823), ('person', 809), ('someth', 806), ('week', 795), ('love', 792), ('start', 786), ('look', 786), ('but', 785), ('live', 770), ('could', 763), ('alway', 746), ('take', 737), ('still', 735), ('better', 696), ('anyon', 690), ('got', 657), ('find', 657), ('and', 652), ('my', 648), ('thought', 638), ('b

In [59]:
# we will use the 2000 most common words as features

word_features = list(all_words_combined.keys())[:2000]

### 7. FindFeatures fuction
function will determine which of the 2000 word features are contained in the post

In [62]:
def find_features(post):
    words = word_tokenize(post)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

In [63]:
# example
posts = pd.read_csv('all_posts_processed.csv')

features = find_features(posts.Post[1])
for key, value in features.items():
    if value == True:
        print(key)

like
go
back
keep
life
felt
top
world
friday
night
everyth
now
i
feel
low
put
act
whenev
peopl
let
get
walk
care
ask
would
come
home
drink
stop
make
parti
hour
small
but
alon
thought
start


### 6. Save training, testing data

In [67]:
Y = posts.Subreddit
postsStr = ''
for post in posts.Post:
    if(type(post)==str):
        postsStr+=post
    
# Now find features for all posts
posts_all = list(zip(postsStr, Y))

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(posts_all)

# call find_features function for each post
featuresets = [(find_features(text), label) for (text, label) in posts_all]

In [68]:
# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

print(len(posts))
print(len(training))
print(len(testing))

3947
2960
987


### 4. Scikit-Learn Classifiers with NLTK
Now that we have our dataset, we can start building algorithms! Let's start with a simple linear support vector classifier, then expand to other algorithms. We'll need to import each algorithm we plan on using from sklearn. We also need to import some performance metrics, such as accuracy_score and classification_report.

In [69]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 22.99898682877406


In [70]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 24.113475177304963
Decision Tree Accuracy: 23.10030395136778
Random Forest Accuracy: 23.10030395136778
Logistic Regression Accuracy: 22.99898682877406
SGD Classifier Accuracy: 22.391084093211752
Naive Bayes Accuracy: 22.99898682877406
SVM Linear Accuracy: 22.99898682877406


In [71]:
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 22.99898682877406


In [72]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [74]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual','actual', 'actual'], ['depression', 'unpopularopinion', 'lonely', 'MachineLearning']],
    columns = [['predicted', 'predicted','predicted', 'predicted'], ['depression', 'unpopularopinion', 'lonely', 'MachineLearning']])

                  precision    recall  f1-score   support

 MachineLearning       0.22      0.07      0.11       248
      depression       0.31      0.10      0.15       281
          lonely       0.12      0.02      0.03       231
unpopularopinion       0.23      0.78      0.35       227

        accuracy                           0.23       987
       macro avg       0.22      0.24      0.16       987
    weighted avg       0.22      0.23      0.16       987



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,depression,unpopularopinion,lonely,MachineLearning
actual,depression,18,25,11,194
actual,unpopularopinion,20,29,11,221
actual,lonely,21,23,4,183
actual,MachineLearning,24,18,8,177
