In [1]:
# Loading Dependencies
from path import Path
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#load VADER
analyzer = SentimentIntensityAnalyzer()

In [2]:
# Loading Data
data = Path('Resources/reddit.csv')
reddit_df = pd.read_csv(data)
reddit_df.head()

Unnamed: 0,title,score,subreddit,url,num_comments,body,date
0,UPVOTE so everyone sees we got SUPPORT,265029,wallstreetbets,https://i.redd.it/sgoqy8nyt2e61.png,11825,,2021-01-29 00:40:34
1,GME YOLO update — Jan 28 2021,230844,wallstreetbets,https://i.redd.it/opzucppb15e61.png,23532,,2021-01-29 08:06:23
2,CLASS ACTION AGAINST ROBINHOOD. Allowing peopl...,204920,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,18318,LEAVE ROBINHOOD. They dont deserve to make mon...,2021-01-29 00:49:11
3,GME YOLO update — Jan 27 2021 ----------------...,185949,wallstreetbets,https://i.redd.it/a309gkm5yxd61.png,15495,,2021-01-28 08:15:35
4,Can we all take a moment and appreciate the Mo...,184517,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,7105,,2021-01-28 11:57:32


In [3]:
# Checking DTypes
reddit_df.dtypes

title           object
score            int64
subreddit       object
url             object
num_comments     int64
body            object
date            object
dtype: object

In [4]:
# Look at subreddit value counts
subreddit = reddit_df.subreddit.value_counts()
subreddit

investing         987
stocks            985
wallstreetbets    953
Name: subreddit, dtype: int64

In [5]:
# Look at body value counts
body = reddit_df.body.value_counts()
body

Please use this thread to discuss your portfolio, learn of other stock tickers, and help out users by giving constructive criticism.\r\n\r\nWhy quarterly?  Public companies report earnings quarterly; many investors take this as an opportunity to rebalance their portfolios.  We highly recommend you do some reading:  A list of [relevant posts & book recommendations.](https://www.reddit.com/r/stocks/wiki/index#wiki_relevant_posts_.26amp.3B_book_recommendations)\r\n\r\nYou can find stocks on your own by using a scanner like your broker's or [Finviz.](https://finviz.com/screener.ashx)  To help further, here's a list of [relevant websites.](https://www.reddit.com/r/stocks/wiki/index#wiki_relevant_websites.2Fapps)\r\n\r\nIf you don't have a broker yet, see our [list of brokers](https://www.reddit.com/r/stocks/wiki/index#wiki_brokers_for_investing) or search old posts.  If you haven't started investing or trading yet, then setup your [paper trading.](https://www.reddit.com/r/stocks/wiki/index#

In [6]:
# Remove the 'body' column.
reddit_df.drop(['body'], axis=1, inplace=True)
reddit_df.head()

Unnamed: 0,title,score,subreddit,url,num_comments,date
0,UPVOTE so everyone sees we got SUPPORT,265029,wallstreetbets,https://i.redd.it/sgoqy8nyt2e61.png,11825,2021-01-29 00:40:34
1,GME YOLO update — Jan 28 2021,230844,wallstreetbets,https://i.redd.it/opzucppb15e61.png,23532,2021-01-29 08:06:23
2,CLASS ACTION AGAINST ROBINHOOD. Allowing peopl...,204920,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,18318,2021-01-29 00:49:11
3,GME YOLO update — Jan 27 2021 ----------------...,185949,wallstreetbets,https://i.redd.it/a309gkm5yxd61.png,15495,2021-01-28 08:15:35
4,Can we all take a moment and appreciate the Mo...,184517,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,7105,2021-01-28 11:57:32


In [7]:
# Remove rows that have at least 1 null value.
reddit_df.dropna()

Unnamed: 0,title,score,subreddit,url,num_comments,date
0,UPVOTE so everyone sees we got SUPPORT,265029,wallstreetbets,https://i.redd.it/sgoqy8nyt2e61.png,11825,2021-01-29 00:40:34
1,GME YOLO update — Jan 28 2021,230844,wallstreetbets,https://i.redd.it/opzucppb15e61.png,23532,2021-01-29 08:06:23
2,CLASS ACTION AGAINST ROBINHOOD. Allowing peopl...,204920,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,18318,2021-01-29 00:49:11
3,GME YOLO update — Jan 27 2021 ----------------...,185949,wallstreetbets,https://i.redd.it/a309gkm5yxd61.png,15495,2021-01-28 08:15:35
4,Can we all take a moment and appreciate the Mo...,184517,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,7105,2021-01-28 11:57:32
...,...,...,...,...,...,...
2920,DID WE MISS THE BOTTOM?! How are people this i...,348,stocks,https://www.reddit.com/r/stocks/comments/g1m6u...,283,2020-04-15 17:03:23
2921,Favorite Solar Stock(s)?,348,stocks,https://www.reddit.com/r/stocks/comments/jmct3...,176,2020-11-02 10:46:49
2922,"""NIO forms battery asset company""",352,stocks,https://www.reddit.com/r/stocks/comments/id8z3...,102,2020-08-20 23:02:48
2923,Amazon is building a $1.5 billion hub for its ...,349,stocks,https://www.reddit.com/r/stocks/comments/5rghg...,145,2017-02-02 03:35:04


In [8]:
# Checking Sentiment Scores
def sentiment_analyzer_scores(sentence):
    score = analyzer.polarity_scores(sentence)
    print("{:-<40} {}".format(sentence, str(score)))
    
sentiment_analyzer_scores('UPVOTE so everyone sees we got SUPPORT')

UPVOTE so everyone sees we got SUPPORT-- {'neg': 0.0, 'neu': 0.636, 'pos': 0.364, 'compound': 0.5319}


In [9]:
#Add VADER metrics to dataframe
reddit_df['compound'] = [analyzer.polarity_scores(v)['compound'] for v in reddit_df['title']]

reddit_df['neg'] = [analyzer.polarity_scores(v)['neg'] for v in reddit_df['title']]

reddit_df['neu'] = [analyzer.polarity_scores(v)['neu'] for v in reddit_df['title']]

reddit_df['pos'] = [analyzer.polarity_scores(v)['pos'] for v in reddit_df['title']]

reddit_df.head()

Unnamed: 0,title,score,subreddit,url,num_comments,date,compound,neg,neu,pos
0,UPVOTE so everyone sees we got SUPPORT,265029,wallstreetbets,https://i.redd.it/sgoqy8nyt2e61.png,11825,2021-01-29 00:40:34,0.5319,0.0,0.636,0.364
1,GME YOLO update — Jan 28 2021,230844,wallstreetbets,https://i.redd.it/opzucppb15e61.png,23532,2021-01-29 08:06:23,0.4278,0.0,0.679,0.321
2,CLASS ACTION AGAINST ROBINHOOD. Allowing peopl...,204920,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,18318,2021-01-29 00:49:11,-0.5994,0.107,0.893,0.0
3,GME YOLO update — Jan 27 2021 ----------------...,185949,wallstreetbets,https://i.redd.it/a309gkm5yxd61.png,15495,2021-01-28 08:15:35,0.4278,0.0,0.841,0.159
4,Can we all take a moment and appreciate the Mo...,184517,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,7105,2021-01-28 11:57:32,0.6369,0.0,0.794,0.206


In [10]:
reddit_groups = reddit_df.groupby("subreddit")

In [11]:
# Grouping Vader Scores for each Subreddit
reddit_groups.mean()

Unnamed: 0_level_0,score,num_comments,compound,neg,neu,pos
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
investing,1779.078014,424.946302,-0.009373,0.080139,0.842621,0.077234
stocks,1405.035533,311.643655,0.038948,0.060024,0.852797,0.087179
wallstreetbets,29348.439664,2753.270724,0.046143,0.066779,0.834423,0.098805


In [12]:
reddit_df = reddit_df[reddit_df["subreddit"]!="stocks"]
reddit_df.tail()

Unnamed: 0,title,score,subreddit,url,num_comments,date,compound,neg,neu,pos
1935,does anyone here have friends and family still...,898,investing,https://www.reddit.com/r/investing/comments/ef...,609,2019-12-26 05:47:26,-0.0258,0.143,0.717,0.139
1936,Jobs growth soars in November as payrolls surg...,905,investing,https://www.reddit.com/r/investing/comments/e6...,496,2019-12-07 00:41:05,0.3818,0.0,0.776,0.224
1937,"Yale economists argue that ""the most financial...",894,investing,https://www.reddit.com/r/investing/comments/en...,400,2020-01-13 05:35:24,0.046,0.1,0.792,0.108
1938,"Amazon earnings beat: $6.04 per share, vs. $5....",897,investing,https://www.reddit.com/r/investing/comments/al...,152,2019-02-01 08:04:51,0.4404,0.0,0.642,0.358
1939,U.S. stocks plunge after report that former na...,890,investing,https://www.reddit.com/r/investing/comments/7g...,377,2017-12-02 03:34:52,0.34,0.0,0.893,0.107


In [13]:
# Changing Subreddit values to be 1 if from WSB and 0 if not.
reddit_df['subreddit'] = reddit_df['subreddit'].apply(lambda x:1 if x == "wallstreetbets" else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
reddit_df.head()

Unnamed: 0,title,score,subreddit,url,num_comments,date,compound,neg,neu,pos
0,UPVOTE so everyone sees we got SUPPORT,265029,1,https://i.redd.it/sgoqy8nyt2e61.png,11825,2021-01-29 00:40:34,0.5319,0.0,0.636,0.364
1,GME YOLO update — Jan 28 2021,230844,1,https://i.redd.it/opzucppb15e61.png,23532,2021-01-29 08:06:23,0.4278,0.0,0.679,0.321
2,CLASS ACTION AGAINST ROBINHOOD. Allowing peopl...,204920,1,https://www.reddit.com/r/wallstreetbets/commen...,18318,2021-01-29 00:49:11,-0.5994,0.107,0.893,0.0
3,GME YOLO update — Jan 27 2021 ----------------...,185949,1,https://i.redd.it/a309gkm5yxd61.png,15495,2021-01-28 08:15:35,0.4278,0.0,0.841,0.159
4,Can we all take a moment and appreciate the Mo...,184517,1,https://www.reddit.com/r/wallstreetbets/commen...,7105,2021-01-28 11:57:32,0.6369,0.0,0.794,0.206


In [15]:
reddit_df.tail()

Unnamed: 0,title,score,subreddit,url,num_comments,date,compound,neg,neu,pos
1935,does anyone here have friends and family still...,898,0,https://www.reddit.com/r/investing/comments/ef...,609,2019-12-26 05:47:26,-0.0258,0.143,0.717,0.139
1936,Jobs growth soars in November as payrolls surg...,905,0,https://www.reddit.com/r/investing/comments/e6...,496,2019-12-07 00:41:05,0.3818,0.0,0.776,0.224
1937,"Yale economists argue that ""the most financial...",894,0,https://www.reddit.com/r/investing/comments/en...,400,2020-01-13 05:35:24,0.046,0.1,0.792,0.108
1938,"Amazon earnings beat: $6.04 per share, vs. $5....",897,0,https://www.reddit.com/r/investing/comments/al...,152,2019-02-01 08:04:51,0.4404,0.0,0.642,0.358
1939,U.S. stocks plunge after report that former na...,890,0,https://www.reddit.com/r/investing/comments/7g...,377,2017-12-02 03:34:52,0.34,0.0,0.893,0.107


In [16]:
# Creating X and Y sets
y = reddit_df["subreddit"]
X = reddit_df.drop(columns=["subreddit","title","url","date","num_comments","score"])

In [17]:
# Breaking sets into train and test.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(1455, 4)

In [18]:
y.value_counts()

0    987
1    953
Name: subreddit, dtype: int64

In [19]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [20]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [21]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,1,1
4,0,0


In [22]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.5092783505154639
