In [1]:
#import necessary packages
import pandas as pd
import numpy as np
import tensorflow as tf
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
#load in dataset
doctopics = pd.read_csv('./doc-topics.csv')

In [3]:
#convert dataset to numerical data
doctopics['docname'] = doctopics['docname'].apply(lambda x: int(x.split(':')[-1]))
doctopics['topic'] = doctopics['topic'].apply(lambda x: int(x))
doctopics['proportion'] = doctopics['proportion'].apply(lambda x: float(x))
doctopics.head()

Unnamed: 0,docname,topic,proportion
0,14,7,0.358951
1,14,52,0.209518
2,14,0,0.183793
3,14,20,0.077304
4,14,38,0.071314


In [5]:
#group topic and proportion data under one docname
dt = doctopics.groupby('docname', as_index = False).agg(lambda x: list(x))

In [6]:
dt.head()

Unnamed: 0,docname,topic,proportion
0,0,[0],[1.0]
1,1,"[44, 45, 0, 15, 19, 3, 18, 4, 17]","[0.279039, 0.15739, 0.119999, 0.113728, 0.1103..."
2,2,[18],[1.0]
3,3,"[6, 4, 9, 2, 30, 28, 52]","[0.283012, 0.19584400000000002, 0.187111, 0.16..."
4,4,"[66, 0, 65, 37, 45]","[0.309297, 0.25146599999999997, 0.198040000000..."


In [7]:
#upload articles dataset
articles = pd.read_csv('./all_apple_news.csv')

In [11]:
#rename headers
article1 = [articles.columns[1]]+list(articles.iloc[:,1])
article0 = [articles.columns[0]]+list(articles.iloc[:,0])
d = {'article':article1, 'date':article0}
apple_filtered = pd.DataFrame(data = d)
apple_filtered = apple_filtered.drop(axis=0, labels=0)
apple_filtered.head()

Unnamed: 0,article,date
1,Apple shouldn't worry about Google's $1.1B HTC...,2017-09-21 00:00:00
2,"Apple education event: How to watch live, star...",2018-03-27 10:15:01
3,Apple adds the word 'Services' to its official...,2017-08-03 14:41:40
4,Apple refurbished iPhone: How to get a deal on...,2016-11-08 20:06:23
5,Apple's iMac Pro accessories surface on eBay w...,2017-12-26 19:30:47


In [17]:
analyzer = SentimentIntensityAnalyzer()

In [18]:
numTopics = 100

def getText(docname):
    '''
    input - docname from a row of dt
    output - text from that row of the other pd
    '''
    text = (apple_filtered.iloc[docname])['article']
    return(text)
    
def weights(docname, topics, proportions):
    '''
    input - docname, topic, proportion from a row
    output - a dictionary containing all 100 topics as keys and sentiments as values
    '''
    output_dict = {}
    sentiment = analyzer.polarity_scores(getText(docname))
    for i in topics:
        output_dict[i] = sentiment['compound']*proportions[topics.index(i)]
    for n in range(0,numTopics):
        if not n in output_dict:
            output_dict[n] = 0
    return(output_dict)

In [19]:
output_d = {}
dates = []
docname_l = list(dt.docname)
topic_l = list(dt.topic)
proportion_l = list(dt.proportion)

In [20]:
print(len(topic_l))
print(len(apple_filtered.date))

10490
10383


In [21]:
#iterate through dataframe rows, calculate text sentiments of each article, multiply by weights
for i in range(len(apple_filtered.date)):
    try:
        dates.append((apple_filtered.iloc[docname_l[i]])['date'])
        w = weights(docname_l[i], topic_l[i], proportion_l[i])
        for j in range(numTopics):
            try:
                output_d[j].append(w[j])
            except KeyError:
                output_d[j] = [w[j]]
                
    except Exception as e:
        print(e)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000


In [22]:
for i in output_d.keys():
    dt[i] = pd.Series(output_d[i])
dt.head()

Unnamed: 0,docname,topic,proportion,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
0,0,[0],[1.0],0.3412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,"[44, 45, 0, 15, 19, 3, 18, 4, 17]","[0.279039, 0.15739, 0.119999, 0.113728, 0.1103...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,[18],[1.0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,"[6, 4, 9, 2, 30, 28, 52]","[0.283012, 0.19584400000000002, 0.187111, 0.16...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,"[66, 0, 65, 37, 45]","[0.309297, 0.25146599999999997, 0.198040000000...",-0.073529,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
from datetime import datetime, date

In [24]:
for i in range(len(dates)):
    if isinstance(dates[i], datetime):
        dates[i] = dates[i].date()
    else:
        dates[i] = datetime.strptime(dates[i][:10], '%Y-%m-%d').date()

In [25]:
dt['date'] = pd.Series(dates)
dt.head()

Unnamed: 0,docname,topic,proportion,0,1,2,3,4,5,6,...,91,92,93,94,95,96,97,98,99,date
0,0,[0],[1.0],0.3412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-09-21
1,1,"[44, 45, 0, 15, 19, 3, 18, 4, 17]","[0.279039, 0.15739, 0.119999, 0.113728, 0.1103...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-03-27
2,2,[18],[1.0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-08-03
3,3,"[6, 4, 9, 2, 30, 28, 52]","[0.283012, 0.19584400000000002, 0.187111, 0.16...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-11-08
4,4,"[66, 0, 65, 37, 45]","[0.309297, 0.25146599999999997, 0.198040000000...",-0.073529,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-12-26


In [27]:
dt1 = dt.copy(deep=True)
dt1.head()

Unnamed: 0,docname,topic,proportion,0,1,2,3,4,5,6,...,91,92,93,94,95,96,97,98,99,date
0,0,[0],[1.0],0.3412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-09-21
1,1,"[44, 45, 0, 15, 19, 3, 18, 4, 17]","[0.279039, 0.15739, 0.119999, 0.113728, 0.1103...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-03-27
2,2,[18],[1.0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-08-03
3,3,"[6, 4, 9, 2, 30, 28, 52]","[0.283012, 0.19584400000000002, 0.187111, 0.16...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-11-08
4,4,"[66, 0, 65, 37, 45]","[0.309297, 0.25146599999999997, 0.198040000000...",-0.073529,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-12-26


In [28]:
dt1 = dt1.drop('proportion', axis=1)
dt1 = dt1.drop('topic',axis=1)
dt1 = dt1.drop('docname', axis=1)
dt1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,date
0,0.3412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-09-21
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-03-27
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-08-03
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-11-08
4,-0.073529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-12-26


In [29]:
dt2 = dt1.groupby('date', as_index=False).agg(lambda x: sum(x))

In [30]:
dt2.head()

Unnamed: 0,date,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,2016-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2016-01-04,0.034895,0.006244,0.0,0.0,0.0,0.019073,0.0,0.007601,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2016-01-05,0.0,0.0,0.0,-0.049886,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2016-01-06,0.00347,0.0,0.00342,0.095066,0.0,0.076383,0.024897,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2016-01-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
dt2.size

136552

In [32]:
dt2.to_csv('./textsent.csv')