In [15]:
import pandas as pd

In [8]:
def tokenize(s):
    '''
    Break a string into whitespace-delimited tokens.

    Input:
        s: String
    Output:
        List of Strings (a run of whitespace counts as one separator,
        so the list never contains empty tokens)
    '''
    tokens = s.split()
    return tokens

def preprocess(s, lower=True, strip_punc=True, punc='.-,?<>:;"\'!%'):
    '''
    Normalize a headline into a list of clean tokens.

    Input:
        s: String (split on whitespace first) or an already-tokenized
           List of Strings
        lower(Bool): lowercase every token
        strip_punc(Bool): remove leading/trailing punctuation from every token
        punc(String): the characters treated as punctuation when stripping
    Output: List of Strings
    '''
    if isinstance(s, str):
        s = tokenize(s)
    if lower:
        s = [t.lower() for t in s]
    if strip_punc:
        # strip() only trims the two ends, so inner punctuation ("it's",
        # "5,000") is kept. A token made up solely of punctuation (e.g. "-")
        # strips down to '' and is dropped so it is never counted as a word.
        s = [stripped for stripped in (t.strip(punc) for t in s) if stripped]

    return s

def token_frequency(tokens, tf=None, relative=False):
    """
    Count how often each token occurs, optionally on top of earlier counts.

    Inputs:
        tokens = List of Strings or None (None is treated as no tokens)
        tf = dict or None; an existing token -> count mapping to extend.
             A dict passed here is updated in place with the raw counts,
             so a caller can accumulate counts over many calls.
        relative = Boolean; if True, return each count divided by the total
    Output:
        Dictionary of token frequencies. With relative=False this is the
        dict that was passed as tf (or a new one if tf was None); with
        relative=True it is a new dict of fractions.
    """
    if tf is None:
        tf = {}
    if tokens is None:
        tokens = []

    # Refuse to add raw counts to a dict that already holds relative
    # frequencies. A float value is taken as the sign of that; only the
    # first value in the dict is inspected.
    if relative and tf and isinstance(next(iter(tf.values())), float):
        print('WARNING: Adding raw counts to relative frequency')
        return tf

    for token in tokens:
        tf[token] = tf.get(token, 0) + 1

    if relative:
        total = sum(tf.values())
        # Builds a new dict: the caller's tf keeps the raw counts.
        tf = {k: v / total for k, v in tf.items()}

    return tf


def calc_percent(headlines, term='coronavirus'):
    """
    Percentage of headlines that contain a given term.

    Inputs:
        headlines = sized iterable of headlines (e.g. a list or a pandas
                    Series); each headline is a List of Strings (tokens)
                    or a String
        term = what to look for. It is tested with `in`, so it must equal a
               whole token for tokenized headlines and is a substring match
               for String headlines.
    Output:
        Float percentage rounded to 2 decimals; 0.0 if there are no headlines
    """
    total = len(headlines)
    if total == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    count = sum(1 for h in headlines if term in h)
    return round(count / total * 100, 2)

def word_counts(headlines):
    """
    Print every distinct token in the headlines with its count, most frequent first.

    Input: a list (or pandas Series) of headlines from news articles; each
           headline is a String or an already-tokenized List of Strings
    Output: None. One (token, count) tuple is printed per line; tokens with
            equal counts keep the order in which they were first seen.
    """
    # shared dict that token_frequency updates in place for each headline
    tf = {}

    # A plain loop (rather than Series.apply) lets the function accept any
    # iterable of headlines, not only a pandas Series.
    for headline in headlines:
        token_frequency(preprocess(headline), tf=tf)

    # sort tokens by frequency, highest first
    sorted_tokens = sorted(tf.items(), key=lambda x: x[1], reverse=True)
    for token in sorted_tokens:
        print(token)

In [9]:
# Load the headlines from cnbc_news.csv. publish_date is parsed as a date,
# used as the index while reading, then reset back into a regular column.
# Rows with a null in any column are dropped.
df = (
    pd.read_csv('cnbc_news.csv', parse_dates=['publish_date'], index_col='publish_date')
    .reset_index()
    .dropna()
)

# Replace each headline with its list of preprocessed tokens.
df['headline'] = df['headline'].apply(preprocess)
df.tail()

Unnamed: 0,publish_date,headline
744,2020-03-17,"[analysts, are, starting, to, say, it's, time,..."
745,2020-03-17,"[facebook, announces, $100, million, program, ..."
746,2020-03-17,"[espn, reveals, how, it, will, move, forward, ..."
747,2020-03-17,"[zoom, is, leading, the, shift, to, a, more, v..."
748,2020-03-17,"[coronavirus, live, updates, mnuchin, says, am..."


In [21]:
# Per publish date: calc_percent over that day's headlines.
# The first four dates of the result are skipped (positional slice).
percent_per_day = (
    df.groupby('publish_date')['headline']
    .apply(calc_percent)
    .iloc[4:]
)
print(percent_per_day)

publish_date
2020-03-08    33.33
2020-03-09    43.14
2020-03-10    43.10
2020-03-11    47.48
2020-03-12    50.83
2020-03-13    58.77
2020-03-14    61.90
2020-03-15    53.33
2020-03-16    52.03
2020-03-17    48.98
Name: headline, dtype: float64


In [11]:
# Print token counts for the headlines in rows with index label 252 and above
# (.loc slices by label, not by position).
# NOTE(review): 252 is a hard-coded label — presumably the first row of the
# period of interest; confirm against the data.
word_counts(df.loc[252:,'headline'])

('coronavirus', 271)
('to', 262)
('the', 147)
('and', 91)
('for', 85)
('in', 80)
('a', 74)
('as', 74)
('says', 72)
('of', 69)
('market', 62)
('trump', 60)
('is', 56)
('on', 46)
('from', 45)
('new', 42)
('stocks', 38)
('us', 38)
('are', 37)
('stock', 36)
('live', 36)
('at', 31)
('will', 30)
('updates', 29)
('amid', 26)
('due', 25)
('after', 24)
('with', 24)
('more', 24)
('be', 24)
('travel', 23)
('could', 23)
('it', 20)
('up', 20)
('outbreak', 20)
('down', 20)
('house', 20)
('fed', 19)
('help', 19)
('can', 19)
('york', 18)
('how', 18)
('over', 18)
('by', 18)
('not', 18)
("here's", 17)
('home', 17)
('cases', 17)
('its', 17)
('all', 16)
('recession', 16)
('dow', 16)
('some', 16)
('pandemic', 15)
('crisis', 15)
('may', 14)
('have', 14)
('buy', 14)
('white', 14)
('work', 13)
('ban', 13)
('markets', 13)
('you', 13)
('these', 13)
('billion', 13)
('what', 13)
('mnuchin', 13)
('this', 13)
('tests', 12)
('employees', 12)
('30', 12)
('amazon', 12)
('but', 12)
('now', 12)
('bill', 11)
('first', 11

('regeneron', 1)
('questions', 1)
('experts', 1)
('answers', 1)
('regulators', 1)
('loosening', 1)
('volume', 1)
('monitor', 1)
('laptop', 1)
('comfortably', 1)
('l', 1)
('brands', 1)
('eagle', 1)
('withdraw', 1)
('outlooks', 1)
('tapping', 1)
('seeks', 1)
('surpass', 1)
('5,000', 1)
('fivefold', 1)
('ago', 1)
('those', 1)
('taxes', 1)
('immediately', 1)
('1,000', 1)
("firm's", 1)
('dips', 1)
('sell', 1)
('pops', 1)
('tom', 1)
('brady', 1)
('patriots', 1)
('blank', 1)
('$39', 1)
('buybacks', 1)
('unions', 1)
('deadline', 1)
('90', 1)
('$850', 1)
('damage', 1)
('ultimate', 1)
('secret', 1)
('detail', 1)
('learn', 1)
('boldest', 1)
('viacomcbs', 1)
('walt', 1)
('television', 1)
('media', 1)
('run', 1)
('psas', 1)
('ex-trump', 1)
('mick', 1)
('mulvaney', 1)
('self-quarantining', 1)
('quarantined', 1)
('cannot', 1)
('thing', 1)
('enlist', 1)
('coordinated', 1)
('forecast', 1)
('crude', 1)
('$22', 1)
('per', 1)
('barrel', 1)
('doordash', 1)
('eliminates', 1)
('swedish', 1)
('pension', 1)
('