In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from pathlib import Path
from sentiment_functions import score_flair, get_news, get_gnews_article, get_sentiments, getTwitterData,get_flair_score, get_news_flair_score

2022-01-19 20:23:41,245 loading file C:\Users\abhis\.flair\models\sentiment-en-mix-distillbert_4.pt


In [2]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

def show_wordcloud(data):
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=100,
        max_font_size=30,
        scale=3,
        random_state=1)
   
    wordcloud=wordcloud.generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')

    plt.imshow(wordcloud)
    plt.show()


In [3]:
def get_all_sentiments(key, no_days):
    
    # A list of dates from today going back 'no_days' in ascending order
    datelist = pd.bdate_range(end = dt.date.today(), periods = int(no_days))

    summary_df = []

    # iterate through the list of dates to get the headlines for each date
    for date in datelist:
        df = pd.DataFrame(get_news(key, date))
        if(len(df) > 20):
            article_df = get_gnews_article(df[:20]) 
        else:
            article_df = get_gnews_article(df)
        summary_df.append(article_df)
    
    df2 = pd.concat(summary_df).set_index('Date')
    df2 = df2.dropna()
    df2 = df2.reset_index()
    
    #show_wordcloud(df2['Key_words'])
    
    dic = {}
    for i in range(len(df2)):
        date = df2['Date'][i]
        summary = df2['Article'][i]
        if date in dic:
            dic[date].append(summary)
        else:
            dic[date] = [summary]

    news_df = pd.DataFrame.from_dict(dic, orient = 'index')
    news_df = news_df.reset_index()
    news_df['index'] = pd.to_datetime(news_df['index'])
    
    news_df = news_df.replace(np.nan, 'none')
    news_df = news_df.replace('', 'none')

    news_sentiment = get_sentiments(news_df)
    news_flair_list = get_news_flair_score(news_df)
    news_sentiment['scores_flair'] = news_flair_list
    
    
    
    
    
    tweets_df = getTwitterData(key, no_days) 
    tweets_df = tweets_df.reset_index()
    
    tweet_dic = {}
    for i in range(len(tweets_df)):
        date = tweets_df['Date'][i]
        summary = tweets_df['full_text'][i]
        if date in tweet_dic:
            tweet_dic[date].append(summary)
        else:
            tweet_dic[date] = [summary]
        
    twitter_df = pd.DataFrame.from_dict(tweet_dic, orient = 'index')
    twitter_df = twitter_df.reset_index()
    twitter_df['index'] = pd.to_datetime(twitter_df['index'])
    
    twitter_df = twitter_df.replace(np.nan, 'none')
    twitter_df = twitter_df.replace('', 'none')
    
    
    
    tweets_sentiment = get_sentiments(twitter_df)    
    tweets_sentiment.columns = ['index', 'tw_subj_score', 'tw_simi_score', 'tw_vader_score']
    
    flair_list = get_flair_score(twitter_df)
    tweets_sentiment['tw_scores_flair'] = flair_list
    
    sentiment_df = news_sentiment.join(tweets_sentiment.set_index('index'), on = 'index')
    sentiment_df = sentiment_df.dropna()
    return sentiment_df
    

In [2]:
def get_all_sentiments_csv(key, no_days):
    
    # A list of dates from today going back 'no_days' in ascending order
    datelist = pd.bdate_range(end = dt.date.today(), periods = int(no_days))
    
    if key.isupper() == False:
        key = key.upper()

    csvpath = Path("Resources/news_data/{}.csv".format(key))
    
    df2 = pd.read_csv(csvpath)
    df2 = df2.drop(columns = ['Unnamed: 0'])
 
    dic = {}
    for i in range(len(df2)):
        date = df2['Date'][i]
        summary = df2['Article'][i]
        if date in dic:
            dic[date].append(summary)
        else:
            dic[date] = [summary]

    news_df = pd.DataFrame.from_dict(dic, orient = 'index')
    news_df = news_df.reset_index()
    news_df['index'] = pd.to_datetime(news_df['index'])
    
    news_df = news_df.replace(np.nan, 'none')
    news_df = news_df.replace('', 'none')

    news_sentiment = get_sentiments(news_df)
    
    dic2 = {}
    for i in range(len(df2)):
        date = df2['Date'][i]
        summary = df2['Summary'][i]
        if date in dic:
            dic[date].append(summary)
        else:
            dic[date] = [summary]

    news_summary_df = pd.DataFrame.from_dict(dic, orient = 'index')
    news_summary_df = news_summary_df.reset_index()
    news_summary_df['index'] = pd.to_datetime(news_summary_df['index'])
    
    news_summary_df = news_summary_df.replace(np.nan, 'none')
    news_summary_df = news_summary_df.replace('', 'none')
    
    news_flair_list = get_news_flair_score(news_summary_df)
    news_sentiment['scores_flair'] = news_flair_list
    
    
    
    
    
    csvpath2 = Path("Resources/news_data/{}_twt.csv".format(key))
    tweets_df = pd.read_csv(csvpath2)
    
    tweet_dic = {}
    for i in range(len(tweets_df)):
        date = tweets_df['Date'][i]
        summary = tweets_df['full_text'][i]
        if date in tweet_dic:
            tweet_dic[date].append(summary)
        else:
            tweet_dic[date] = [summary]
        
    twitter_df = pd.DataFrame.from_dict(tweet_dic, orient = 'index')
    twitter_df = twitter_df.reset_index()
    twitter_df['index'] = pd.to_datetime(twitter_df['index'])
    
    twitter_df = twitter_df.replace(np.nan, 'none')
    twitter_df = twitter_df.replace('', 'none')
    
    
    
    tweets_sentiment = get_sentiments(twitter_df)    
    tweets_sentiment.columns = ['index', 'tw_subj_score', 'tw_simi_score', 'tw_vader_score']
    
    flair_list = get_flair_score(twitter_df)
    tweets_sentiment['tw_scores_flair'] = flair_list
    
    sentiment_df = news_sentiment.join(tweets_sentiment.set_index('index'), on = 'index')
    sentiment_df = sentiment_df.dropna()
    
    return sentiment_df
    

In [3]:
%%time
s_df = get_all_sentiments_csv('tsla', 100)
s_df

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_sentiment_df['subj_score'] = news_weighted_subj_senti_df.mean(axis = 1, skipna = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_sentiment_df['simi_score'] = news_weighted_simi_senti

Wall time: 21min 34s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,index,subj_score,simi_score,vader_score,scores_flair,tw_subj_score,tw_simi_score,tw_vader_score,tw_scores_flair
0,2021-08-30,0.220619,0.131578,0.468430,0.829304,0.000481,-0.040284,-0.031200,0.650652
1,2021-08-31,0.298812,0.314914,0.682320,0.850441,0.002748,-0.042609,-0.025293,0.650252
3,2021-09-02,0.228122,0.167491,0.574075,0.833552,0.172077,0.308425,0.351011,0.912994
4,2021-09-03,0.231336,0.112461,0.515755,0.792154,0.076777,-0.045359,0.099300,0.743949
5,2021-09-06,0.087362,-0.097166,0.220845,0.725678,0.107624,0.227190,0.336719,0.890710
...,...,...,...,...,...,...,...,...,...
95,2022-01-10,0.307966,0.518752,0.687940,0.887810,0.190400,0.261481,0.378578,0.825717
96,2022-01-11,0.331322,0.660487,0.769870,0.915610,0.110099,0.199910,0.291567,0.828167
97,2022-01-12,0.348533,0.724604,0.849560,0.903498,0.100748,0.185814,0.255470,0.887907
98,2022-01-13,0.353543,0.655489,0.798300,0.903648,0.131662,0.110299,0.258759,0.872830


In [4]:
s_df.to_csv('Resources/Combined Sentiment signals/{}.csv'.format('TSLA'))