###  1. Imports

In [85]:
import os
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np
import praw
from dotenv import load_dotenv

load_dotenv()

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
ps = nltk.PorterStemmer()
from nltk.tokenize import word_tokenize

import re

stop_words = set(stopwords.words('english'))

print(stop_words)

{'am', 'only', 'them', 'at', 'mightn', "shan't", 'themselves', 'its', 'this', 'was', 'other', 'these', 'yourselves', 'here', 'being', 'again', 'those', 'needn', 'm', "she's", 'are', 'once', 'more', 'because', "you'd", 'own', 'while', 'down', 'as', 're', 'above', 'ours', 'you', 'himself', 'against', 'our', 'by', 'hers', 'what', 'during', 'him', 'whom', 'very', 'wouldn', 'over', 's', 'yours', 'into', 'yourself', 'it', 'from', 'we', 'who', 'does', "aren't", 'up', 'hasn', 'won', 'any', "couldn't", "you've", 'don', 'their', 'than', 'will', 'doing', 'which', 'me', 'my', "doesn't", 'didn', 't', 'were', 'out', 'until', 'off', 'couldn', 'she', "that'll", 'that', 'there', 'the', 'but', "isn't", 'some', 'having', "haven't", 'd', 'o', 'herself', 'about', 'between', 'then', "you'll", 'be', 'no', 'i', 'an', 'all', 'your', 'nor', 'under', "mustn't", 'weren', 'll', 'for', "you're", "should've", 'ma', 'same', 'and', 'before', 'how', "it's", 'ourselves', 'been', 'most', 'theirs', "won't", 'her', 'can', 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aligo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aligo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Define functions to get subreddit posts data from Reddit and write it to files

In [2]:
user_agent = "Scraper 1.0 by /u/Alliix"
reddit = praw.Reddit(
  client_id=os.environ['CLIENT_ID'],
  client_secret=os.environ['CLIENT_SECRET'],
  user_agent=user_agent
)

In [101]:
def cleanPostsData(postsFromCsv):
#     clean posts data
    postsProcessed = []

#     RegEx
    zeroSpaceWidth = re.compile(r'&#x200B')
    urls = re.compile(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', re.IGNORECASE)
    numbers = re.compile(r'\d+(\.\d+)?')
    punctuation = re.compile(r'[^\w\d\s]')
    whitespaces = re.compile(r'\s+')
    leadTrailWhitespace = re.compile(r'^\s+|\s+?$')

    for d in postsFromCsv.Post:
        if(type(d)==str):
            x = d
            # Replace zero width space with ' '
            x = zeroSpaceWidth.sub(' ',x)
            # Replace URLs with 'url'
            x = urls.sub('url',x)
            # Replace numbers with 'nmbr'
            x = numbers.sub('nmbr',x)
            # Remove punctuation
            x = punctuation.sub(' ',x)
            # Replace whitespace between terms with ' '
            x = whitespaces.sub(' ',x)
            # Remove leading and trailing whitespace
            x = leadTrailWhitespace.sub(' ',x)

            text_tokens = word_tokenize(x)
            # Remove word stems using a Porter stemmer
            tokens_without_ws = [ps.stem(word) for word in text_tokens]
            x = (" ").join(tokens_without_ws)
            
            # remove stop words from posts
            text_tokens = word_tokenize(x)
            tokens_without_sw = [word for word in text_tokens if not word in stop_words]
            x = (" ").join(tokens_without_sw)
            postsProcessed.append(x)
        else: 
            postsProcessed.append(d)
            
    return postsProcessed


In [102]:
def getAndCleanPostsData(subreddit, postOutputFile, processedPostOutputFile):
#     get
    posts = set()
    for submission in reddit.subreddit(subreddit).new(limit=None):
        posts.add(submission)
    postsLength = len(posts)
    
#     write posts to file
    data = []
    for post in posts:
        data.append({'Author': post.author, 'Subreddit': post.subreddit, 'Date': post.created_utc, 'Title': post.title, 'Post': post.selftext})    
    postsData = pd.DataFrame(data, columns=['Author', 'Subreddit', 'Date', 'Title', 'Post'])
    postsData.to_csv(postOutputFile, index=False)
    
    postsFromCsv = pd.read_csv(postOutputFile)
    postsProcessed = cleanPostsData(postsFromCsv)
    
#     write processed posts data to file
    processedData = []
    for i in range(0, postsLength):
        processedData.append({'Author': postsFromCsv.Author[i], 'Subreddit': postsFromCsv.Subreddit[i], 'Date': postsFromCsv.Date[i], 'Title': postsFromCsv.Title[i], 'Post': postsProcessed[i]})

    postsData = pd.DataFrame(processedData, columns=['Author', 'Subreddit', 'Date', 'Title', 'Post'])
    postsData.to_csv(processedPostOutputFile, index=False)
    
    return postsLength

### 3. Run the functions to get data from subreddits

In [103]:
# r/depression
depressedPostsLength = getAndCleanPostsData('depression', 'depressed_posts.csv', 'depressed_posts_processed.csv')
print(depressedPostsLength)

1000


In [107]:
# r/unpopularopinion
unpopularopinionPostsLength = getAndCleanPostsData('unpopularopinion', 'unpopularopinion_posts.csv', 'unpopularopinion_posts_processed.csv')
print(unpopularopinionPostsLength)

989


In [108]:
# r/lonely
lonelyPostsLength = getAndCleanPostsData('lonely', 'lonely_posts.csv', 'lonely_posts_processed.csv')
print(lonelyPostsLength)

977


In [109]:
# r/MachineLearning
machinelearningPostsLength = getAndCleanPostsData('machinelearning', 'machinelearning_posts.csv', 'machinelearning_posts_processed.csv')
print(machinelearningPostsLength)

979


### 4. Generating Features function
Features will be the most common words in posts.

In [110]:
def getWordFrequency(processedPostOutputFile):
    posts = pd.read_csv(processedPostOutputFile)

    allPostsConcat = ''
    for post in posts.Post:
        if(type(post)==str):
            allPostsConcat+=post

    # create bag-of-words
    all_words = []

    words = word_tokenize(allPostsConcat)
    for word in words:
        all_words.append(word)

    all_words = nltk.FreqDist(all_words)
    
    return all_words

### 5. Generate Word Frequency for Subreddits

In [111]:
# r/depression
depressionFreqWords = getWordFrequency('depressed_posts_processed.csv')

# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(depressionFreqWords)))
print('Most common words: {}'.format(depressionFreqWords.most_common(15)))

Number of words: 6316
Most common words: [('feel', 1522), ('wa', 1427), ('thi', 1309), ('like', 1274), ('nmbr', 949), ('want', 923), ('get', 881), ('know', 819), ('becaus', 783), ('life', 736), ('depress', 679), ('even', 658), ('time', 642), ('go', 590), ('year', 524)]


In [112]:
# r/unpopularopinion
unpopularopinionFreqWords = getWordFrequency('unpopularopinion_posts_processed.csv')

# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(unpopularopinionFreqWords)))
print('Most common words: {}'.format(unpopularopinionFreqWords.most_common(15)))

Number of words: 8299
Most common words: [('peopl', 924), ('like', 898), ('thi', 787), ('nmbr', 729), ('get', 580), ('becaus', 486), ('make', 441), ('wa', 429), ('think', 404), ('one', 390), ('want', 353), ('time', 346), ('say', 345), ('thing', 342), ('even', 329)]


In [113]:
# r/lonely
lonelyFreqWords = getWordFrequency('lonely_posts_processed.csv')

# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(lonelyFreqWords)))
print('Most common words: {}'.format(lonelyFreqWords.most_common(15)))

Number of words: 5659
Most common words: [('feel', 1129), ('like', 1096), ('wa', 1054), ('thi', 1003), ('friend', 999), ('nmbr', 809), ('want', 787), ('peopl', 757), ('get', 647), ('talk', 597), ('one', 589), ('know', 583), ('time', 577), ('go', 564), ('life', 546)]


In [114]:
# r/machinelearning
machinelearningFreqWords = getWordFrequency('machinelearning_posts_processed.csv')

# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(machinelearningFreqWords)))
print('Most common words: {}'.format(machinelearningFreqWords.most_common(15)))

Number of words: 9335
Most common words: [('nmbr', 4434), ('http', 2534), ('com', 1582), ('thi', 1138), ('model', 983), ('use', 913), ('learn', 741), ('data', 604), ('train', 518), ('reddit', 509), ('week', 485), ('paper', 464), ('www', 447), ('github', 434), ('org', 423)]


### 6. Generate features for all posts
We will tokenize each word and will use the 2000 most common words as features.

In [115]:
# Save all posts to one file

depressedData = pd.read_csv('depressed_posts_processed.csv')
unpopularopinionData = pd.read_csv('unpopularopinion_posts_processed.csv')
lonelyData = pd.read_csv('lonely_posts_processed.csv')
machinelearningData = pd.read_csv('machinelearning_posts_processed.csv')

allPosts = depressedData.append(unpopularopinionData).append(lonelyData).append(machinelearningData)

allPosts.to_csv('all_posts_processed.csv', index=False)

In [116]:
# check subreddit distribution

subreddits = allPosts.Subreddit
print(subreddits.value_counts())

depression          1000
unpopularopinion     989
MachineLearning      979
lonely               977
Name: Subreddit, dtype: int64


In [117]:
all_words_combined = getWordFrequency('all_posts_processed.csv')

# print the total number of words and the 100 most common words
print('Number of words: {}'.format(len(all_words_combined)))
print('Most common words: {}'.format(all_words_combined.most_common(100)))

Number of words: 18667
Most common words: [('nmbr', 6921), ('thi', 4237), ('like', 3675), ('wa', 3178), ('feel', 2992), ('http', 2593), ('peopl', 2323), ('get', 2292), ('want', 2249), ('becaus', 1909), ('time', 1852), ('know', 1845), ('one', 1728), ('make', 1721), ('even', 1638), ('com', 1625), ('go', 1564), ('friend', 1564), ('use', 1502), ('would', 1494), ('life', 1488), ('think', 1451), ('thing', 1340), ('realli', 1298), ('year', 1244), ('tri', 1224), ('ha', 1164), ('day', 1159), ('work', 1138), ('talk', 1073), ('ani', 1072), ('onli', 1040), ('model', 998), ('much', 991), ('say', 962), ('someon', 962), ('never', 951), ('way', 923), ('see', 890), ('good', 886), ('learn', 886), ('need', 881), ('depress', 845), ('whi', 827), ('also', 825), ('help', 823), ('someth', 804), ('week', 801), ('person', 793), ('look', 783), ('love', 782), ('start', 780), ('live', 774), ('could', 760), ('still', 734), ('take', 731), ('alway', 718), ('anyon', 696), ('better', 689), ('veri', 649), ('find', 647),

### 7. FindFeatures fuction
function will determine which of the 1500 word features are contained in the post

In [118]:
# we will use the 1500 most common words as features

word_features = list(all_words_combined.keys())[:1500]

In [119]:
def find_features(post):
    words = word_tokenize(post)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

In [120]:
# example
posts = pd.read_csv('all_posts_processed.csv')

features = find_features(posts.Post[1])
for key, value in features.items():
    if value == True:
        print(key)

like
go
back
keep
life
top
world
friday
night
everyth
wa
feel
low
put
act
whenev
peopl
let
get
walk
care
ask
would
come
home
drink
stop
becaus
make
parti
hour
small
alon
thought
start
felt


### 6. Save training, testing data

In [144]:
Y = []
postsArr = []
i = 0 
for post in posts.Post:
    if(type(post)==str):
        postsArr.append(post)
        Y.append(posts.Subreddit[i])
    i+=1
    
# Now find features for all posts
posts_all = list(zip(postsArr, Y))

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(posts_all)

# call find_features function for each post
featuresets = [(find_features(text), label) for (text, label) in posts_all]

In [145]:
# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

print(len(posts))
print(len(training))
print(len(testing))

3945
2853
952


### 4. Scikit-Learn Classifiers with NLTK
Now that we have our dataset, we can start building algorithms! Let's start with a simple linear support vector classifier, then expand to other algorithms. We'll need to import each algorithm we plan on using from sklearn. We also need to import some performance metrics, such as accuracy_score and classification_report.

In [146]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 74.78991596638656


In [147]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 41.28151260504202
Decision Tree Accuracy: 62.81512605042017
Random Forest Accuracy: 77.83613445378151


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 79.30672268907563
SGD Classifier Accuracy: 76.47058823529412
Naive Bayes Accuracy: 82.56302521008404
SVM Linear Accuracy: 74.78991596638656


In [148]:
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 74.78991596638656


In [149]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [150]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual','actual', 'actual'], ['depression', 'unpopularopinion', 'lonely', 'MachineLearning']],
    columns = [['predicted', 'predicted','predicted', 'predicted'], ['depression', 'unpopularopinion', 'lonely', 'MachineLearning']])

                  precision    recall  f1-score   support

 MachineLearning       0.92      0.96      0.94       213
      depression       0.69      0.69      0.69       231
          lonely       0.71      0.77      0.74       248
unpopularopinion       0.90      0.80      0.84       260

        accuracy                           0.80       952
       macro avg       0.80      0.80      0.80       952
    weighted avg       0.80      0.80      0.80       952



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,depression,unpopularopinion,lonely,MachineLearning
actual,depression,204,3,5,1
actual,unpopularopinion,5,159,53,14
actual,lonely,5,44,190,9
actual,MachineLearning,8,24,21,207
