## Install / Import Libraries

In [None]:
!pip install yfinance
! pip install -U spacy
! python -m spacy download en_core_web_lg

In [None]:
import os
import re
import warnings
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import yfinance
from tqdm.auto import tqdm

## Helper Functions

### load news data file

In [None]:
def _read_json_either_layout(file_path):
  """Read a JSON file, retrying as JSON-lines when the first parse fails."""
  try:
    return pd.read_json(file_path)
  except Exception:
    return pd.read_json(file_path, lines = True)


def _read_txt_expression(file_path):
  """Evaluate a text file as a Python expression and build a transposed frame."""
  with open(file_path, 'r') as f:
    # SECURITY: eval() executes whatever the file contains. Only point this at
    # trusted files; ast.literal_eval would be the safe choice if the files
    # are known to hold plain literals only.
    data = eval(f.read())
  return pd.DataFrame(data).T


def load_news_data(directory_files: str):
  """Load every supported data file found in a directory into DataFrames.

  Files are dispatched on their extension (case-insensitive):

  * ``.csv``  -> ``pd.read_csv``
  * ``.json`` -> ``pd.read_json``, retried with ``lines=True`` on failure
  * ``.txt``  -> content is evaluated as a Python expression, passed to
    ``pd.DataFrame`` and transposed

  Entries with any other extension (or none) are ignored. A file that cannot
  be parsed is skipped and reported through a ``RuntimeWarning`` instead of
  being dropped silently.

  Args:
    directory_files: path of the directory to scan (not recursive).

  Returns:
    dict mapping each loaded file name (including extension) to its DataFrame.
  """
  readers = {
    '.csv': pd.read_csv,
    '.json': _read_json_either_layout,
    '.txt': _read_txt_expression,
  }
  directory_path = Path(directory_files)

  file_dict = {}
  for file_name in os.listdir(directory_path):
    file_path = directory_path / file_name
    reader = readers.get(file_path.suffix.lower())
    if reader is None:
      continue
    try:
      file_dict[file_name] = reader(file_path)
    except Exception as err:
      # Best effort: keep loading the remaining files, but say what was lost.
      warnings.warn(
        f"load_news_data: skipped '{file_name}' ({type(err).__name__}: {err})",
        RuntimeWarning,
        stacklevel = 2,
      )
  return file_dict

### load sp500 data

In [None]:
def load_sp500_data(start_date = '2014-12-31', end_date = '2021-01-05'):
  """Download S&P 500 (``^GSPC``) price history and label next-day direction.

  Args:
    start_date: passed as ``start`` to ``yfinance.Ticker.history``.
    end_date: passed as ``end`` to ``yfinance.Ticker.history``.

  Returns:
    DataFrame with columns ``Date``, ``Open``, ``High``, ``Low``, ``Volume``,
    ``Close``, ``Close+1day`` (the ``Close`` of the following row) and
    ``up_down`` (1 when ``Close+1day`` is greater than ``Close``, else 0).
    Rows containing any missing value are dropped; this removes the final
    row, which has no following close.
  """
  sp_history = yfinance.Ticker('^GSPC').history(start = start_date, end = end_date).reset_index()
  # Reduce the index timestamps to plain calendar dates (datetime64, midnight).
  sp_history['Date'] = pd.to_datetime(sp_history['Date'].dt.date)
  # Selecting the wanted columns already discards 'Dividends'/'Stock Splits',
  # so no separate drop (which fails when those columns are absent) is needed.
  sp_history = sp_history.loc[:, ['Date', 'Open', 'High', 'Low', 'Volume', 'Close']].copy()
  sp_history['Close+1day'] = sp_history['Close'].shift(-1)
  sp_history = sp_history.dropna()
  # Vectorised label instead of a row-wise apply.
  sp_history['up_down'] = (sp_history['Close+1day'] > sp_history['Close']).astype(int)
  return sp_history

### tokenize & clean data

In [None]:
def tokenization_clean(text: str, nlp_model):
  """Lemmatise ``text`` and return the cleaned tokens joined by single spaces.

  A token is discarded when any of these token attributes is true: stop word,
  punctuation (including left/right punctuation), number-like, digit,
  URL-like, e-mail-like, whitespace or currency symbol. Each remaining
  token's lemma is lower-cased and stripped of every character that is
  neither a word character nor whitespace. Tokens left empty by that
  stripping (for example a lone ``+``) are dropped, so the result never
  contains consecutive spaces.

  Args:
    text: raw text to process.
    nlp_model: callable returning an iterable of tokens exposing ``lemma_``
      and the boolean flags listed above (e.g. a loaded spaCy pipeline).

  Returns:
    The cleaned tokens joined with ``' '``; an empty string if none survive.
  """
  cleaned_tokens = []
  for tok in nlp_model(text):
    if (tok.is_stop
        or tok.is_punct
        or tok.like_num
        or tok.like_url
        or tok.is_space
        or tok.like_email
        or tok.is_left_punct
        or tok.is_right_punct
        or tok.is_digit
        or tok.is_currency):
      continue
    cleaned = re.sub(r'[^\w\s]', '', tok.lemma_.lower())
    # A symbol-only lemma collapses to '' here; keeping it would add a stray
    # space to the joined text and inflate space-based word counts.
    if cleaned.strip():
      cleaned_tokens.append(cleaned)

  return ' '.join(cleaned_tokens)

### distribution of word counts in each news article

In [None]:
def distribution_word_count(load_news_data_fn: 'function', directory, file_name, tokenization_clean_fn: 'function', nlp_model):
  """Plot how many articles fall into each word-count bin, before and after cleaning.

  Word counts are taken on the raw ``text`` column and on the output of
  ``tokenization_clean_fn``; both are counted as whitespace-separated words,
  so an empty string counts as 0. Counts are grouped into 20-word bins from
  0 to 400; articles with more than 400 words are not shown in the plot.

  Args:
    load_news_data_fn: loader called as ``load_news_data_fn(directory_files=directory)``;
      must return a dict of DataFrames keyed by file name.
    directory: directory handed to the loader.
    file_name: key of the DataFrame to analyse; it needs a ``text`` column.
    tokenization_clean_fn: called as ``tokenization_clean_fn(text, nlp_model)``
      for every article and expected to return a string.
    nlp_model: forwarded unchanged to ``tokenization_clean_fn``.

  Returns:
    None. Shows the figure and prints the minimum word count before and
    after processing.
  """
  tqdm.pandas()  # registers Series.progress_apply
  news_data = load_news_data_fn(directory_files = directory)[file_name]

  tokenized_text = news_data['text'].progress_apply(lambda x: tokenization_clean_fn(x, nlp_model))
  # str.split() without a separator ignores repeated whitespace and gives 0
  # words for an empty string (split(' ') would report 1).
  word_count_text = news_data['text'].str.split().str.len()
  word_count_tokenized = tokenized_text.str.split().str.len()

  # Evenly spaced 20-word bins (0, 20, ..., 400); every bin has equal width.
  bin_edges = list(range(0, 401, 20))
  bin_labels = [f'({lo}, {hi}]' for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]
  bin_labels[0] = f'[{bin_edges[0]}, {bin_edges[1]}]'  # first bin also holds 0-word articles

  def _count_per_bin(word_counts):
    # Number of articles per bin, in bin order, with empty bins reported as 0.
    binned = pd.cut(word_counts, bin_edges, labels = bin_labels, include_lowest = True)
    return binned.value_counts(sort = False).reindex(bin_labels, fill_value = 0).to_numpy()

  fig, ax = plt.subplots(figsize = (10, 5))
  ax.plot(bin_labels, _count_per_bin(word_count_text), linewidth = 0.3, color = 'black', label = 'word count before processing')
  ax.plot(bin_labels, _count_per_bin(word_count_tokenized), linewidth = 0.3, color = 'red', label = 'word count after processing')
  ax.set_xlabel('bin of word count')
  ax.tick_params(axis = 'x', labelrotation = 90)
  ax.set_ylabel('number of news article')
  ax.set_title('distribution of words in each article')
  ax.legend()
  plt.show()

  print(f"min word count before processing: {word_count_text.min()}, min word count after processing: {word_count_tokenized.min()}")


### distribution of news articles by date (line chart)

In [None]:
def distribution_news_date(load_news_data_fn: 'function', directory, file_name):
  """Plot and return the number of articles per distinct timestamp.

  NOTE(review): ``load_news_data_fn`` is accepted but not used; the data is
  always read through the module-level ``load_news_data``.

  Args:
    load_news_data_fn: unused (see note above).
    directory: directory passed to ``load_news_data``.
    file_name: key of the DataFrame to analyse; it needs ``text`` and
      ``timestamp`` columns.

  Returns:
    DataFrame indexed by parsed ``timestamp`` with one column,
    ``'daily count of news'``, holding the count of non-null ``text`` values.
    Rows are grouped by exact timestamp value — presumably one value per
    calendar day; verify against the data file.
  """
  articles = load_news_data(directory_files = directory)[file_name]
  articles['timestamp'] = pd.to_datetime(articles['timestamp'])

  news_count_groupby = (
    articles.groupby('timestamp')[['text']]
    .count()
    .rename(columns = {'text': 'daily count of news'})
  )

  news_count_groupby.plot(figsize = (16, 6), color = 'black', linewidth = 0.2, label = 'daily count of news')
  plt.ylabel('count of news')
  plt.legend()
  plt.show()

  return news_count_groupby

### distribution of news sentiment (bar chart)

In [None]:
def distribution_news_sent(load_news_data_fn: 'function', directory, file_name):
  """Draw a bar chart of the number of articles per sentiment label.

  NOTE(review): ``load_news_data_fn`` is accepted but not used; the data is
  always read through the module-level ``load_news_data``.

  Args:
    load_news_data_fn: unused (see note above).
    directory: directory passed to ``load_news_data``.
    file_name: key of the DataFrame to analyse; it needs ``text`` and
      ``sentiment`` columns.

  Returns:
    None. Shows the figure.
  """
  news_data = load_news_data(directory_files = directory)[file_name]
  # Count of non-null texts per sentiment label.
  sentiment_counts = news_data.groupby('sentiment')['text'].count()

  fig, ax = plt.subplots(figsize = (6, 6))
  bars = ax.bar(sentiment_counts.index, sentiment_counts.to_numpy(), width = 0.5)

  # zip stops at the shorter sequence, so a single-class file no longer fails;
  # bars beyond the listed colours keep matplotlib's default colour.
  for bar, color in zip(bars, ['#AED6F1', '#2E86C1']):
    bar.set_color(color)

  for idx, (_, count) in enumerate(sentiment_counts.items()):
    ax.annotate('{:0,}'.format(int(count)), xy = (idx - 0.05, count + 1000))

  # Keep the original 50,000 ceiling, but grow it when a bar (plus its label)
  # would otherwise be clipped.
  tallest = int(sentiment_counts.max()) if len(sentiment_counts) else 0
  ax.set_ylim(0, max(50000, int(tallest * 1.1)))
  ax.set_title('Distribution of News Sentiment')
  plt.show()


### distribution of news topics (bar chart)

In [None]:
def distribution_news_topic(load_news_data_fn: 'function', directory, file_name):
  """Draw a bar chart of the number of articles per topic.

  NOTE(review): ``load_news_data_fn`` is accepted but not used; the data is
  always read through the module-level ``load_news_data``.

  Args:
    load_news_data_fn: unused (see note above).
    directory: directory passed to ``load_news_data``.
    file_name: key of the DataFrame to analyse; it needs ``text`` and
      ``topics`` columns.

  Returns:
    None. Shows the figure.
  """
  news_data = load_news_data(directory_files = directory)[file_name]
  # Count of non-null texts per topic.
  topic_counts = news_data.groupby('topics')['text'].count()

  fig, ax = plt.subplots(figsize = (12, 6))
  bars = ax.bar(topic_counts.index, topic_counts.to_numpy(), width = 0.7)

  # zip stops at the shorter sequence, so fewer than four topics no longer
  # fails; bars beyond the listed colours keep matplotlib's default colour.
  for bar, color in zip(bars, ['#D6EAF8', '#85C1E9', '#3498DB', '#21618C']):
    bar.set_color(color)

  for idx, (_, count) in enumerate(topic_counts.items()):
    ax.annotate('{:0,}'.format(int(count)), xy = (idx - 0.08, count + 1000))

  # Keep the original 50,000 ceiling, but grow it when a bar (plus its label)
  # would otherwise be clipped.
  tallest = int(topic_counts.max()) if len(topic_counts) else 0
  ax.set_ylim(0, max(50000, int(tallest * 1.1)))
  ax.set_title('Distribution of News Topics')
  plt.show()

### trend of daily overall sentiment & sp500 close price (line chart)

In [None]:
def distribution_overall_sent(load_news_data_fn: 'function', directory, file_name, sp500_fn: 'function',
                              sp500_end_date = '2019-12-31', ma_window = 90):
  """Plot a moving average of overall news sentiment against the S&P 500 close.

  Per timestamp, the counts of ``'negative'`` and ``'positive'`` articles are
  each min-max scaled over the whole period (``*_norm``) and also expressed
  as a share of that timestamp's negative + positive total (``*_weight``).
  The overall sentiment is
  ``positive_weight * positive_norm - negative_weight * negative_norm``,
  smoothed with a rolling mean over ``ma_window`` rows (rows, not calendar
  days).

  NOTE(review): ``load_news_data_fn`` is accepted but not used; the data is
  always read through the module-level ``load_news_data``.

  Args:
    load_news_data_fn: unused (see note above).
    directory: directory passed to ``load_news_data``.
    file_name: key of the DataFrame to analyse; it needs ``timestamp``,
      ``text`` and ``sentiment`` columns.
    sp500_fn: zero-argument callable returning a DataFrame with ``Date`` and
      ``Close`` columns.
    sp500_end_date: last ``Date`` (inclusive) of the price series to plot.
    ma_window: number of rows in the rolling mean.

  Returns:
    None. Shows the figure.
  """
  def _min_max_scale(counts):
    # Scale to [0, 1]; a constant series maps to 0 rather than dividing by zero.
    span = counts.max() - counts.min()
    return (counts - counts.min()) / (span if span else 1)

  news_data = load_news_data(directory_files = directory)[file_name]
  news_data['timestamp'] = pd.to_datetime(news_data['timestamp'])

  sp500 = sp500_fn()
  sp500 = sp500[sp500['Date'] <= sp500_end_date].copy()

  # One row per timestamp, one column per sentiment class. The reindex
  # guarantees both columns exist even if a class never occurs in the data.
  daily = (
    news_data.groupby(['timestamp', 'sentiment'])['text']
    .count()
    .unstack(fill_value = 0)
    .reindex(columns = ['negative', 'positive'], fill_value = 0)
    .reset_index()
  )

  negative, positive = daily['negative'], daily['positive']
  total = negative + positive

  daily['negative_norm'] = _min_max_scale(negative)
  daily['positive_norm'] = _min_max_scale(positive)
  daily['negative_weight'] = negative / total
  daily['positive_weight'] = positive / total
  daily['overall_sentiment'] = (daily['positive_weight'] * daily['positive_norm']
                                - daily['negative_weight'] * daily['negative_norm'])
  daily['overall_sentiment_ma'] = daily['overall_sentiment'].rolling(ma_window).mean()

  fig, ax = plt.subplots(figsize = (12, 6))
  ax2 = ax.twinx()

  sentiment_line, = ax.plot(daily['timestamp'], daily['overall_sentiment_ma'],
                            label = 'overall_sentiment (moving average)', color = 'black', linewidth = 0.2)
  ax.set_ylabel('overall sentiment (moving average)')
  # twinx() hides the twin's x-axis, so the x label and tick rotation have to
  # be set on the primary axes to be visible.
  ax.set_xlabel('date')
  ax.tick_params(axis = 'x', labelrotation = 90)

  price_line, = ax2.plot(sp500['Date'], sp500['Close'], label = 'SP500 Close Price', color = 'red', linewidth = 0.2)
  ax2.set_ylabel('sp500 close price')
  # One combined legend on the top-most axes, so the two entries cannot overlap.
  ax2.legend(handles = [sentiment_line, price_line])

  plt.show()

## EDA: News Articles

### initialize spacy model

In [None]:
# Load the spaCy pipeline once; the same object is passed to the word-count analysis below.
nlp_model = spacy.load('en_core_web_lg')

### distribution of word counts in each article

In [None]:
# Tokenises every article with the spaCy model, then plots and prints word-count statistics.
distribution_word_count(load_news_data_fn = load_news_data, 
                        directory = '../data/EDA/News Article Text File', 
                        file_name = 'articles_2015_2019.csv', 
                        tokenization_clean_fn = tokenization_clean, 
                        nlp_model = nlp_model)

### distribution of daily news article count

In [None]:
# load_news_data_fn must be the news-file loader, not the S&P 500 downloader.
output = distribution_news_date(load_news_data_fn = load_news_data,
                                directory = '../data/EDA/News Article Text File',
                                file_name = 'articles_2015_2019.csv')

### distribution of news sentiment

In [None]:
# load_news_data_fn must be the news-file loader, not the S&P 500 downloader.
distribution_news_sent(load_news_data_fn = load_news_data,
                       directory = '../data/EDA/News Article Text File',
                       file_name = 'articles_2015_2019.csv')

### distribution of topics

In [None]:
# load_news_data_fn must be the news-file loader, not the S&P 500 downloader.
distribution_news_topic(load_news_data_fn = load_news_data,
                        directory = '../data/EDA/News Article Text File',
                        file_name = 'articles_2015_2019.csv')

### trend of overall sentiment & sp500 close price

In [None]:
# load_news_data_fn is the news-file loader; sp500_fn supplies the index prices.
distribution_overall_sent(load_news_data_fn = load_news_data,
                          directory = '../data/EDA/News Article Text File',
                          file_name = 'articles_2015_2019.csv',
                          sp500_fn = load_sp500_data)