In [None]:
# Has all the sentiment functionality. If reading the scraped data from the csv file, use get_all_sentiments_csv else to scrape and generate sentiments, use get_all_sentiments()

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from pathlib import Path

# Import al lthe supporting functions from sentiment_functions.py
from sentiment_functions import score_flair, get_news, get_gnews_article, get_sentiments, getTwitterData,get_flair_score, get_news_flair_score

In [6]:

## Scrape the web for Google news headlies and tweets for a 'key' for 'no_days' days and generate the corresponding sentiment scores
def get_all_sentiments(key, no_days):
    
    # A list of dates from today going back 'no_days' in ascending order
    datelist = pd.bdate_range(end = dt.date(2021, 9, 3), periods = int(no_days))

    # Google News Analysis
    
    summary_df = []

    # iterate through the list of dates to get the headlines for each date
    for date in datelist:
        df = pd.DataFrame(get_news(key, date)) # get_news function reads the RSS feed for google news for the specified date and extracts all the headlines for that date
        
        # We just want to analyze the top 20 headlines for a particular date
        if(len(df) > 20):
            article_df = get_gnews_article(df[:20]) # Using the newspaper package, extract the article, its summary and the keywords   
        else:
            article_df = get_gnews_article(df)
        
        #append the extracted article to a list
        
        summary_df.append(article_df) 
        
    # To convert the list of articles, into a dateframe and set the index to the Date column
    df2 = pd.concat(summary_df).set_index('Date')
    df2 = df2.dropna()
    df2 = df2.reset_index()
    
    # Group the articles by date - The resulting dataframe consists of all the extracted articles for a particular date, in seperate columns
    dic = {}
    for i in range(len(df2)):
        date = df2['Date'][i]
        summary = df2['Article'][i]
        if date in dic:
            dic[date].append(summary)
        else:
            dic[date] = [summary]

    news_df = pd.DataFrame.from_dict(dic, orient = 'index')
    news_df = news_df.reset_index()
    news_df['index'] = pd.to_datetime(news_df['index'])
    
    news_df = news_df.replace(np.nan, 'none')
    news_df = news_df.replace('', 'none')

    # Get sentiment analysis score using vader and textblob for all the articles, agreggated and grouped by the date
    news_sentiment = get_sentiments(news_df)
    
    # Get the flair analysis score for all the articles agreggated and grouped by date
    news_flair_list = get_news_flair_score(news_df)
    
    # concat the sentiments in a new dataframe
    news_sentiment['scores_flair'] = news_flair_list
    
    
    # Twitter Analysis
    
    # Get tweets for the hashtag for no_days days
    tweets_df = getTwitterData(key, no_days) 
    tweets_df = tweets_df.reset_index()
    
    # Convert the tweets dataframe into a new dataframe grouped by date where all the tweets for a particular date and in seperate columns
    tweet_dic = {}
    for i in range(len(tweets_df)):
        date = tweets_df['Date'][i]
        summary = tweets_df['full_text'][i]
        if date in tweet_dic:
            tweet_dic[date].append(summary)
        else:
            tweet_dic[date] = [summary]
        
    twitter_df = pd.DataFrame.from_dict(tweet_dic, orient = 'index')
    twitter_df = twitter_df.reset_index()
    twitter_df['index'] = pd.to_datetime(twitter_df['index'])
    
    twitter_df = twitter_df.replace(np.nan, 'none')
    twitter_df = twitter_df.replace('', 'none')
    
    # Perform Vader and Textblob sentiment analysis on the tweets    
    tweets_sentiment = get_sentiments(twitter_df)    
    tweets_sentiment.columns = ['index', 'tw_subj_score', 'tw_simi_score', 'tw_vader_score']
    
    # Perform flair analysis on the tweets
    flair_list = get_flair_score(twitter_df)
    
    # Concat all the sentiments
    tweets_sentiment['tw_scores_flair'] = flair_list
    
    # Join the google news and twitter sentiments in to new dataframe
    sentiment_df = news_sentiment.join(tweets_sentiment.set_index('index'), on = 'index')
    sentiment_df = sentiment_df.dropna()
    
    # return the generated dataframe with the required signals
    return sentiment_df
    

In [10]:
## Read the already scrapped google news articles and tweets that are stored in csv files to generate the sentiments for a 'key' for 'no_days' days

def get_all_sentiments_csv(key, no_days):
    
    # A list of dates from today going back 'no_days' in ascending order
    datelist = pd.bdate_range(end = dt.date(2021, 9, 3), periods = int(no_days))
    
    # Convert the key to uppercase to match the csv file naming convention
    if key.isupper() == False:
        key = key.upper()

    # Define a path to read the file
    csvpath = Path("Resources/news_data/{}_200.csv".format(key))
    
    #read the file 
    df2 = pd.read_csv(csvpath)
    # df2 = df2.drop(columns = ['Unnamed: 0'])
 
    # Group the read dataframe by date so that each row will contain all the articles for a particular date in different columns
    dic = {}
    for i in range(len(df2)):
        date = df2['Date'][i]
        summary = df2['Article'][i]
        if date in dic:
            dic[date].append(summary)
        else:
            dic[date] = [summary]

    news_df = pd.DataFrame.from_dict(dic, orient = 'index')
    news_df = news_df.reset_index()
    news_df['index'] = pd.to_datetime(news_df['index'])
    
    news_df = news_df.replace(np.nan, 'none')
    news_df = news_df.replace('', 'none')

    
    # Get the vader and textblob sentiments using the entire article
    news_sentiment = get_sentiments(news_df)
    
    # To get the flair score we are just using the summary of the article, so create another dataframe whose rows contain the summary of the articles for a particular date in its columns
    dic2 = {}
    for i in range(len(df2)):
        date = df2['Date'][i]
        summary = df2['Summary'][i]
        if date in dic:
            dic[date].append(summary)
        else:
            dic[date] = [summary]

    news_summary_df = pd.DataFrame.from_dict(dic, orient = 'index')
    news_summary_df = news_summary_df.reset_index()
    news_summary_df['index'] = pd.to_datetime(news_summary_df['index'])
    
    news_summary_df = news_summary_df.replace(np.nan, 'none')
    news_summary_df = news_summary_df.replace('', 'none')
    
    # Get flair scores for the news articcle summary
    news_flair_list = get_news_flair_score(news_summary_df)
    news_sentiment['scores_flair'] = news_flair_list
    
      
    # Read the csv file containg the tweets
    csvpath2 = Path("Resources/news_data/{}_twt_200.csv".format(key))
    tweets_df = pd.read_csv(csvpath2)
    
    tweet_dic = {}
    for i in range(len(tweets_df)):
        date = tweets_df['Date'][i]
        summary = tweets_df['full_text'][i]
        if date in tweet_dic:
            tweet_dic[date].append(summary)
        else:
            tweet_dic[date] = [summary]
        
    twitter_df = pd.DataFrame.from_dict(tweet_dic, orient = 'index')
    twitter_df = twitter_df.reset_index()
    twitter_df['index'] = pd.to_datetime(twitter_df['index'])
    
    twitter_df = twitter_df.replace(np.nan, 'none')
    twitter_df = twitter_df.replace('', 'none')
    
    
    # Get the sentiment data - vader and textblob score for Tweets
    
    tweets_sentiment = get_sentiments(twitter_df)    
    tweets_sentiment.columns = ['index', 'tw_subj_score', 'tw_simi_score', 'tw_vader_score']
    
    # Get the flair score for the tweets
    flair_list = get_flair_score(twitter_df)
    tweets_sentiment['tw_scores_flair'] = flair_list
    
    # Join the two sentiment blocks and return the final sentiment data dataframe.
    sentiment_df = news_sentiment.join(tweets_sentiment.set_index('index'), on = 'index')
    sentiment_df = sentiment_df.dropna()
    
    return sentiment_df
    

In [13]:
%%time

# Example function impementation fro generating the data for a 100 days for the stocks listed in the list.

list = ['AAPL', 'BAC','CRM','GOOG','INTC', 'MSFT','NVDA','PYPL','TSLA']

for i in range(len(list)):
    s_df = get_all_sentiments_csv(list[i],100)
    s_df.to_csv('Resources/Combined Sentiment signals/{}.csv'.format(list(i)))
    s_df

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_sentiment_df['subj_score'] = news_weighted_subj_senti_df.mean(axis = 1, skipna = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_sentiment_df['simi_score'] = news_weighted_simi_senti

Wall time: 28min 53s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,index,subj_score,simi_score,vader_score,scores_flair,tw_subj_score,tw_simi_score,tw_vader_score,tw_scores_flair
0,2020-11-30,0.155822,0.033062,0.428187,0.763038,0.109633,0.178726,0.251724,0.852676
1,2020-12-01,0.079965,-0.092097,0.148340,0.698710,0.107722,0.176904,0.358940,0.903324
2,2020-12-02,0.106608,-0.057984,0.247887,0.711171,0.053968,0.079193,0.136816,0.844240
3,2020-12-03,0.064682,-0.085852,0.156960,0.724906,0.143816,0.306866,0.320792,0.834476
4,2020-12-04,0.118216,-0.116589,0.241673,0.707307,0.049216,-0.048452,0.088508,0.761727
...,...,...,...,...,...,...,...,...,...
193,2021-08-27,0.414973,1.012966,0.958113,0.946642,0.071646,-0.026065,0.169872,0.729967
194,2021-08-30,0.342376,1.026747,0.930040,0.932698,0.070490,0.165825,0.275548,0.847707
195,2021-08-31,0.430080,0.977830,0.964060,0.964923,0.148670,0.220250,0.320300,0.868084
196,2021-09-01,0.396297,0.840783,0.925427,0.934073,0.149433,0.216940,0.347460,0.878803
