In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np

In [2]:
def tokenize(s):
    """
    Break a string into tokens on runs of whitespace.

    Input:
        s: String
    Output:
        List of Strings (empty when s contains no non-whitespace characters)
    """
    tokens = s.split()
    return tokens

def preprocess(s, lower=True, strip_punc=True):
    '''
    Turn a headline into a list of normalized tokens.

    Input:
        s: String (split on whitespace via tokenize) or an already
           tokenized List of Strings
        lower(Bool): lower-case every token
        strip_punc(Bool): strip leading/trailing punctuation from every
            token and drop tokens that end up empty
    Output: List of Strings
    '''
    punc = '.-,?<>:;"\'!%'
    tokens = tokenize(s) if isinstance(s, str) else s
    if lower:
        tokens = [t.lower() for t in tokens]
    if strip_punc:
        # Only the ends of a token are stripped, so inner punctuation (the
        # hyphen in 10-year, the apostrophe in it's) is kept. A token made up
        # solely of punctuation strips down to '' and is dropped here so it
        # does not appear as an empty token in later counts.
        stripped = (t.strip(punc) for t in tokens)
        tokens = [t for t in stripped if t]

    return tokens

def token_frequency(tokens, tf=None, relative=False):
    """
    Count token occurrences, optionally on top of existing counts.

    Inputs:
        tokens = List of Strings or None (None adds nothing)
        tf = dict mapping token -> count to keep accumulating into, or None
             to start from an empty dict; a passed-in dict is updated in place
        relative = Boolean; when True the result is a new dict in which each
                   count is divided by the sum of all counts
    Output:
        Dictionary of token frequencies
    """
    tf = {} if tf is None else tf
    # The documented contract allows tokens=None; treat it as nothing to add
    # instead of failing when iterating over None.
    tokens = [] if tokens is None else tokens

    if relative and tf:
        # Only the first stored value is inspected. A float there is taken to
        # mean tf already holds relative frequencies, so raw counts are not
        # added and tf is returned unchanged after the warning.
        first_value = next(iter(tf.values()))
        if isinstance(first_value, float):
            print('WARNING: Adding raw counts to relative frequency')
            return tf

    for token in tokens:
        tf[token] = tf.get(token, 0) + 1

    if relative:
        total = sum(tf.values())
        tf = {k: v / total for k, v in tf.items()}

    return tf


def calc_percent(headlines, term='coronavirus'):
    '''
    Percentage of headlines that contain a term.

    Input:
        headlines: sized iterable of headlines (anything supporting len()
            and iteration). The check is `term in headline`, so a headline
            that is a list of tokens needs an exactly matching token, while
            a headline that is a plain string matches on substring.
        term: value to look for (default 'coronavirus')
    Output:
        Float percentage rounded to 2 decimals; 0.0 when headlines is empty
    '''
    total = len(headlines)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    count = sum(1 for h in headlines if term in h)
    return round(count/total * 100, 2)

In [3]:
# Load the articles from cnbc_news.csv. publish_date is parsed as dates and
# read as the index, then reset_index() turns it back into a regular column.
df = pd.read_csv(
    'cnbc_news.csv',
    parse_dates=['publish_date'],
    index_col='publish_date',
).reset_index()

# Discard every row that has a missing value in any column.
df = df.dropna()

# Replace each headline with its token list from preprocess
# (defaults: lower-case the tokens and strip punctuation from their ends).
df['headline'] = df['headline'].apply(preprocess)
df.tail()

Unnamed: 0,publish_date,headline
240,2020-03-11,"[coronavirus, fallout, at, least, 150, compani..."
241,2020-03-11,"[fed, boosts, money, it's, providing, to, bank..."
242,2020-03-11,"[10-year, treasury, yield, rises, slightly, in..."
243,2020-03-11,"[putin, takes, a, big, step, closer, to, being..."
244,2020-03-11,"[white, house, task, force, recommends, anti-c..."


In [4]:
# Per publish_date, the percentage of headlines for which calc_percent
# reports a match.
(
    df
    .groupby('publish_date')['headline']
    .apply(calc_percent)
)

publish_date
2020-02-25      0.00
2020-03-02    100.00
2020-03-08     33.33
2020-03-09     43.14
2020-03-10     43.10
2020-03-11     46.92
Name: headline, dtype: float64

In [5]:
# Overall percentage of headlines whose token list contains 'coronavirus'.
# The column values are iterated directly instead of looking rows up with
# df.loc[i, 'headline'] for i in range(total): df went through dropna()
# without its index being reset, so the labels 0..total-1 are only all
# present when no row was dropped, and a missing label raises KeyError.
total = df.shape[0]
count = sum('coronavirus' in headline for headline in df['headline'])
round(count/total * 100, 2)

45.31