## Reddit Data 

In [1]:
import pandas as pd
import numpy as np
import nltk
import pickle
import praw
import psaw
import datetime as dt
import re 
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
from textblob import TextBlob
import operator
import matplotlib.pyplot as plt
import tqdm as tqdm

## Pushshift API

In [2]:
def push_shift(subreddit):
    """
    Function that runs pushshift API to get post between certain period of time for reddit
    """
    api = psaw.PushshiftAPI()

    result = api.search_submissions(
    after=int(dt.datetime(2019,1,1).timestamp()),
    before=int(dt.datetime(2019,4,1).timestamp()),
    subreddit=subreddit)

    posts = list(result)

    return posts
    #print('Retrieved {} posts'.format(len(posts)))

## Reddit API

In [4]:
# Intializing reddit API
reddit = praw.Reddit(client_id = 'uAnf2MO4t3y5Sg',
                     client_secret = 'kKf41wK8vOGMv3-YUKls5iqhrqE',
                     username = 'white_wary_whale',
                     password  = 'smcmtogo',
                     user_agent = 'stock_subreddit_analysis')

## Dataframe Creation 

In [5]:
def subreddit_df(posts):
    """
   
    Function that creates a dataframe from the information from pushshift and Reddit API
    """
    sub_list = []
    for i in range(len(posts)):
        title = posts[i].title
        post_id = posts[i].id
        com_num = posts[i].num_comments
        date_posted =  dt.datetime.fromtimestamp(int(posts[i].created)).strftime('%Y-%m-%d')
        subm = reddit.submission(posts[i].id)
        upvote_ratio = subm.upvote_ratio
        score = subm.score
        day_of_week = dt.datetime.fromtimestamp(int(posts[i].created)).weekday()
        comb_list = [title,post_id, com_num, date_posted, upvote_ratio, score, day_of_week]
        sub_list.append(comb_list)
        
    sub_df = pd.DataFrame(sub_list, columns = ['Title','ID','Number_of_Comments','Date_Posted','Upvote_ratio','Score', 'Day_of_Week'])
    
    return sub_df

In [6]:
# Uses subrredit_df func to create dataframe for apple subreddit
apple_df = subreddit_df('apple')

AttributeError: 'str' object has no attribute 'id'

In [7]:
# took data in chunks this is concating all df
# appl1 = pd.read_csv('Apple_2017_0.csv')
# appl2 = pd.read_csv('Apple_2017_1.csv')
# appl3 = pd.read_csv('Apple_2017_2.csv')
# appl4 = pd.read_csv('Apple_2017_3.csv')
# appl5 = pd.read_csv('Apple_2018_2.csv')
# appl6 = pd.read_csv('Apple_2018_1_0.csv')
# appl7 = pd.read_csv('Apple_2018_1_1.csv')
# apple1 = pd.read_csv('Apple_2019_1.csv')
# apple2 = pd.read_csv('Apple_2019_2.csv')
# apple3 = pd.read_csv('Apple_2019_3.csv')
# apple4 = pd.read_csv('Apple_2019_4.csv')
# apple5 = pd.read_csv('Apple_6.csv')

# apple_df = pd.concat([apple5, apple4, apple3, apple2, apple1, appl7, appl6,appl5,appl4, appl3,appl2,appl1])

In [13]:
# Sorts of post that had some interations and isolates top 10 daily posts with most interaction
holder_df = apple_df
holder_df['Total_Interactions'] = holder_df.Number_of_Comments + (holder_df.Score/ holder_df.Upvote_ratio)
df_apple1 = holder_df[holder_df['Total_Interactions'] >= 1]
df_apple1 = df_apple1.sort_values(['Date_Posted','Total_Interactions'], ascending=False, inplace=False).groupby('Date_Posted').head(10)
df_apple1.reset_index(inplace = True, drop = True)

In [18]:
#creates csv for topic moeling notebook
df_apple1.to_csv('Apple_com_df.csv', index= False)

### All Code after this point is unused. I just kept it for later reference

## Text Cleaning 

In [71]:
def text_cleaning_list(list_of_text):
    text = []
    for i in range(len(list_of_text)):
        cleaning = re.sub('[%s]' % re.escape(string.punctuation),' ', list_of_text[i])
        cleaning = cleaning.lower()
        cleaning = re.sub('\w*\d\w*', ' ', cleaning)
        text.append(cleaning)
        
    return text

In [72]:
def text_cleaning_string(strings):
    cleaning = re.sub('[%s]' % re.escape(string.punctuation),' ', strings)
    cleaning = cleaning.lower()
    cleaning = re.sub('\w*\d\w*', ' ', cleaning)

    return cleaning

## Topic Analysis of titles

In [73]:
def lsa_topic(list_of_text, topic_number, sub_dataframe):
    
    text = text_cleaning_list(list_of_text)
    
    cv = CountVectorizer(stop_words='english')
    
    X = cv.fit_transform(text)
    vec_df = pd.DataFrame(X.toarray(), index=text, columns = cv.get_feature_names())
    
    lsa = TruncatedSVD(topic_number)
    doc_topic = lsa.fit_transform(X)
    
    Vt = pd.DataFrame(doc_topic.round(5),
             index = text)#,
             #columns = ["component_1","component_2","component_3","component_4" ])
   
    topics = Vt.idxmax(axis=1)
    topics.reset_index(drop = True, inplace = True)
    sub_dataframe['Topic'] = topics.tolist()
    
    return sub_dataframe

In [53]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [259]:
def topic_top(list_of_text):
    text = text_cleaning(list_of_text)
    
    cv = CountVectorizer(stop_words='english')
    X = cv.fit_transform(text)
    
    lsa = TruncatedSVD(10)
    lsa.fit_transform(X)
    
    output = display_topics(lsa, cv.get_feature_names(), 10)
    
    return output

In [262]:
apple_df_titles = apple_df.Title
title_list = apple_df_titles.tolist()
apple_df_topic = lsa_topic(title_list, 10, apple_df)
topic_viz = topic_top(title_list)


Topic  0
apple, iphone, pro, new, macbook, ipad, watch, app, ios, tv

Topic  1
pro, iphone, macbook, ipad, new, se, air, airpods, vs, mac

Topic  2
iphone, se, xr, xs, ios, phone, max, battery, plus, camera

Topic  3
app, ipad, new, ios, store, mac, free, keyboard, apps, magic

Topic  4
macbook, app, new, air, ios, store, free, help, just, download

Topic  5
new, ipad, air, macbook, keyboard, se, ios, magic, year, features

Topic  6
airpods, new, pro, mac, ios, max, case, phone, airpod, free

Topic  7
ios, mac, apps, update, beta, support, macos, screen, video, help

Topic  8
mac, help, video, free, download, need, mini, converter, os, photo

Topic  9
help, airpods, need, ipad, air, phone, case, just, macbook, icloud


In [263]:
apple_df_topic

Unnamed: 0,Title,ID,Number_of_Comments,Date_Posted,Upvote_ratio,Score,Day_of_Week,Total_Interactions,Topic
0,After over 10 years on Google/Android... I'm c...,hj57ne,0,2020-07-01,1.0,1,2,1.0,7
1,Can't Transfer PDF Files To iPad Using iTunes,hj57du,0,2020-07-01,1.0,1,2,1.0,3
2,How do I share individual songs from my librar...,hj52l4,0,2020-07-01,1.0,1,2,1.0,3
3,I created a DIY iPhone box wireless charger! T...,hj4rhb,5,2020-07-01,0.3,0,2,5.0,0
4,"I need help, urgently.",hj4obh,0,2020-07-01,1.0,1,2,1.0,9
...,...,...,...,...,...,...,...,...,...
32186,Please help meh,eifz2w,0,2020-01-01,1.0,1,2,1.0,9
32187,Buy Apple All-New Airpods in very Cheap Price,eifywu,1,2020-01-01,1.0,1,2,2.0,0
32188,iPhone 11 Pro 4K - low quality,eifysv,0,2020-01-01,1.0,1,2,1.0,1
32189,Question About apples Trade In,eifyrj,0,2020-01-01,1.0,1,2,1.0,9


In [264]:
gol = apple_df_topic
df_apple1 = gol[gol['Total_Interactions'] >= 1]
df_apple1 = df_apple1.sort_values(['Date_Posted','Number_of_Comments'], ascending=False, inplace=False).groupby('Date_Posted').head(5)
df_apple1.reset_index(inplace = True, drop = True)
df_apple1

Unnamed: 0,Title,ID,Number_of_Comments,Date_Posted,Upvote_ratio,Score,Day_of_Week,Total_Interactions,Topic
0,Here is everything Apple killed — or tried to ...,hj37hv,90,2020-07-01,0.66,44,2,156.666667,0
1,Apple closes all retail stores in the Dallas a...,hj06l7,8,2020-07-01,0.89,75,2,92.269663,0
2,I created a DIY iPhone box wireless charger! T...,hj4rhb,5,2020-07-01,0.30,0,2,5.000000,0
3,iPhone 11 setup help,hizv3f,3,2020-07-01,0.75,4,2,8.333333,2
4,After over 10 years on Google/Android... I'm c...,hj57ne,0,2020-07-01,1.00,1,2,1.000000,7
...,...,...,...,...,...,...,...,...,...
910,Tim Cook on Twitter: “There is opportunity in ...,eil3x7,507,2020-01-01,0.87,2733,2,3648.379310,5
911,"Diet tracking app ‘Calory’ now tracks water, s...",eii22x,123,2020-01-01,0.90,610,2,800.777778,3
912,Apple should Group verification code messages ...,eil373,63,2020-01-01,0.94,625,2,727.893617,0
913,Daily Tech Support Thread - [January 01],eijav3,63,2020-01-01,0.80,9,2,74.250000,7


## Sentiment Analysis

In [58]:
def text_blob(text):
    
    read = TextBlob(text)
    sentiment = read.sentiment
    pos_neg = sentiment[0]
    return pos_neg

In [77]:
def sentiment_analysis(id_list, df):
    
    comments_list = []
    for ids in id_list:
        submission = reddit.submission(ids) #'eityg8'
        submission.comments.replace_more()
        comments = submission.comments.list()
        comments_list.append(comments)
    
    sums = []
    for comments in comments_list:
        com = []
        sens = []
        for count, comment in enumerate(comments):
            clean_string = text_cleaning_string(comments[count].body)
            com.append(clean_string)
            sens.append(text_blob(clean_string))
            #sens1.append(sentiment_scores(comments[count].body))
        
        summed = sum(sens)/len(comments)
        sums.append(summed)
    
    df['Sentiment'] = sums
    return df

In [99]:
id_df = apple_df_topic.ID
id_list = id_df.tolist()
apple_df_topic_sent = sentiment_analysis(id_list, apple_df_topic)

In [79]:
apple_df_topic_sent

Unnamed: 0,Title,ID,Number_of_Comments,Date_Posted,Score,Day_of_Week,Topic,Sentiment
0,Dell will soon let you interact with your iPho...,ej8dyw,261,2020-01-03,580,4,2,0.112162
1,If Apple released a classic OS with new featur...,ejb475,40,2020-01-03,0,4,2,-0.006666
2,iMessage from a PC? Yup. If you have a Dell,ejb7kk,8,2020-01-03,15,4,2,-0.133333
3,"Apple rumored to be making $5,000 gaming PC",ejb3xp,8,2020-01-03,0,4,2,-0.068125
4,Upcoming iPhones to be powered by A14 5nm Proc...,ejbrux,2,2020-01-03,1,4,3,-0.125
5,"Siri has started saying “Hong Kong SAR, China”...",eixt7h,473,2020-01-02,1,3,1,0.044043
6,"AAPL closes in on $300, 1/3 of the way to its ...",ej3k6y,279,2020-01-02,472,3,0,0.175411
7,Apple is likely to launch as many as 6 iPhone ...,eixav8,165,2020-01-02,1,3,2,0.114795
8,App Store dominates Google Play on Christmas D...,ej3rew,114,2020-01-02,188,3,3,0.149033
9,I’m glad Apple doesn’t allow full screen ads f...,eiyhty,109,2020-01-02,1,3,2,0.037037
