In [1]:
import pandas as pd

In [2]:
def tokenize(s):
    '''
    Break a string into tokens on runs of whitespace.

    Input:
        s: String
    Output:
        List of Strings (empty when s contains no non-whitespace characters)
    '''
    tokens = s.split()
    return tokens

def preprocess(s, lower=True, strip_punc=True):
    '''
    Normalize a headline into a list of cleaned tokens.

    Input:
        s: String (split on whitespace via tokenize) or an already
           tokenized List of Strings
        lower(Bool): lowercase every token
        strip_punc(Bool): strip leading/trailing punctuation from every
           token and drop tokens that end up empty
    Output: List of Strings
    '''
    punc = '.-,?<>:;"\'!%'
    if isinstance(s, str):
        s = tokenize(s)
    if lower:
        s = [t.lower() for t in s]
    if strip_punc:
        # strip() only trims the ends, so inner punctuation such as
        # "century-old" or "$30,000" is left intact.
        stripped = (t.strip(punc) for t in s)
        # A token made only of punctuation (e.g. "-" or "...") strips down
        # to "", which would otherwise be counted as a word downstream.
        s = [t for t in stripped if t]

    return s

def token_frequency(tokens, tf= None, relative=False):
    """
    Count token occurrences, optionally accumulating into an existing dict.

    Inputs: 
        tokens = List of Strings or None (None is treated as no tokens)
        tf = dict or None; existing counts to add to (updated in place)
        relative = Boolean; when True, return count / total for every token
    Output: 
        Dictionary of token frequencies. With relative=False this is the
        (possibly newly created) tf dict itself; with relative=True it is a
        new dict of relative frequencies.
    """
    tf = {} if tf is None else tf
    # The documented contract allows tokens=None; iterating None would raise.
    tokens = [] if tokens is None else tokens

    if len(tf) != 0 and relative == True:
        # A float value means tf already holds relative frequencies; adding
        # raw counts on top would be meaningless, so warn and leave tf as is.
        first_value = next(iter(tf.values()))
        if isinstance(first_value, float):
            print('WARNING: Adding raw counts to relative frequency')
            return tf

    for token in tokens:
        tf[token] = tf.get(token, 0) + 1

    if relative:
        total = sum(tf.values())
        tf = {k: v/total for k, v in tf.items()}

    return tf


def calc_percent(headlines, term='coronavirus'):
    """
    Percentage of headlines that contain a given term.

    Input:
        headlines = sized iterable of headlines (e.g. list or pandas Series);
                    each headline is tested with `term in headline`, so a
                    token list is checked for membership and a string for
                    a substring
        term = the token to look for (defaults to 'coronavirus')
    Output:
        Float percentage rounded to 2 decimals; 0.0 when headlines is empty
    """
    total = len(headlines)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    count = sum(1 for h in headlines if term in h)
    return round(count/total * 100, 2)

def word_counts(headlines):
    """
    Print every token found in the headlines with its count, most frequent first.

    Input: an iterable of headlines from news articles (list or pandas
           Series); each headline is a string or an already tokenized
           list of strings
    Output: None; one (token, count) tuple is printed per line
    """
    # single dict that token_frequency updates in place for every headline
    tf = {}

    # Plain iteration rather than Series.apply, so a list of headlines
    # works as well as a pandas Series.
    for headline in headlines:
        token_frequency(preprocess(headline), tf=tf)

    # sort tokens by frequency; the sort is stable, so ties keep first-seen order
    sorted_tokens = sorted(tf.items(), key=lambda item: item[1], reverse=True)
    for token in sorted_tokens:
        print(token)

In [3]:
# read in the news articles from cnbc_news.csv; publish_date is parsed as dates,
# used as the index while loading, then turned back into a regular column
df = pd.read_csv('cnbc_news.csv',parse_dates=['publish_date'], index_col='publish_date').reset_index()

# drop any rows with null
# (the index is not reset afterwards, so the remaining rows keep their original labels)
df = df.dropna()

# replace each headline with its list of tokens, using preprocess's default options
df['headline'] = df['headline'].apply(lambda x: preprocess(x))
# show the last rows as a quick sanity check
df.tail()

Unnamed: 0,publish_date,headline
1879,2020-03-28,"[review, bmw, m850i, is, a, competent, choice,..."
1880,2020-03-28,"[the, century-old, company, that, invented, th..."
1881,2020-03-28,"[op-ed, coronavirus, will, hit, emerging, mark..."
1882,2020-03-28,"[trump, considering, enforceable, quarantine, ..."
1883,2020-03-28,"[coronavirus, live, updates, shellshocked, spa..."


In [4]:
# per publish_date, the percentage of headlines containing 'coronavirus' (see calc_percent)
# [4:] is a positional slice that skips the first four dates in the grouped result
# NOTE(review): the reason for skipping them is not stated here; confirm the intended start date
percent_per_day = df.groupby('publish_date')['headline'].apply(lambda x: calc_percent(x))[4:]
print(percent_per_day)

publish_date
2020-03-08    33.33
2020-03-09    43.14
2020-03-10    43.10
2020-03-11    47.83
2020-03-12    50.83
2020-03-13    59.29
2020-03-14    61.90
2020-03-15    53.33
2020-03-16    53.23
2020-03-17    44.36
2020-03-18    44.32
2020-03-19    52.27
2020-03-20    51.30
2020-03-21    69.77
2020-03-22    62.50
2020-03-23    55.74
2020-03-24    50.34
2020-03-25    48.39
2020-03-26    44.35
2020-03-27    53.92
2020-03-28    50.00
Name: headline, dtype: float64


In [5]:
# print token counts for the headlines from row label 252 onward (.loc slices by label)
# NOTE(review): 252 is a magic number; presumably the first row of the date window analysed above, confirm
word_counts(df.loc[252:,'headline'])

('coronavirus', 850)
('to', 760)
('the', 499)
('in', 324)
('for', 305)
('and', 287)
('of', 256)
('a', 245)
('says', 243)
('as', 229)
('is', 181)
('market', 149)
('on', 149)
('us', 137)
('trump', 131)
('new', 128)
('stocks', 122)
('are', 115)
('live', 113)
('stock', 112)
('from', 103)
('after', 102)
('will', 101)
('updates', 96)
('more', 95)
('stimulus', 93)
('bill', 88)
('dow', 86)
('with', 84)
('at', 75)
('be', 75)
('up', 74)
('cases', 72)
('amid', 71)
('could', 69)
('it', 66)
('pandemic', 66)
('by', 61)
('outbreak', 58)
('how', 57)
('due', 56)
('york', 56)
("here's", 54)
('—', 52)
('this', 52)
('down', 50)
('than', 50)
('what', 50)
('billion', 50)
('now', 50)
('crisis', 49)
('its', 49)
('can', 48)
('not', 48)
('has', 47)
('ceo', 46)
('but', 46)
('during', 45)
('have', 44)
('house', 44)
('help', 42)
('may', 42)
('trillion', 41)
('buy', 41)
('home', 40)
('these', 40)
('relief', 40)
('fed', 38)
('amazon', 37)
('all', 37)
('you', 37)
('workers', 37)
('say', 37)
('get', 35)
('they', 34)
(

('chloroquine', 2)
('un', 2)
('thrive', 2)
('smartphone', 2)
('martin', 2)
('caps', 2)
('propose', 2)
('financially', 2)
('procedures', 2)
('did', 2)
('collapsing', 2)
('fix', 2)
('tide', 2)
('tumbles', 2)
('statewide', 2)
('extended', 2)
('doubling', 2)
('voting', 2)
('choice', 2)
('recent', 2)
('fixed', 2)
('indiana', 2)
('boosts', 2)
('tellers', 2)
('phishing', 2)
('prey', 2)
('systems', 2)
('bankruptcy', 2)
('dan', 2)
('niles', 2)
('coca-cola', 2)
('clearing', 2)
('unable', 2)
('completely', 2)
('her', 2)
('$30,000', 2)
('deploy', 2)
('network', 2)
('vaccines', 2)
('jc', 2)
('penney', 2)
('yanks', 2)
('abbott', 2)
('labs', 2)
('expanding', 2)
('docs', 2)
('rent', 2)
('fighting', 2)
('upgraded', 2)
('slew', 2)
('globe', 2)
('post-coronavirus', 2)
('commit', 2)
('kraft', 2)
('heinz', 2)
('central', 2)
('tariff', 2)
('allies', 2)
('together', 2)
('bmw', 2)
('grants', 2)
('minutes', 2)
('oks', 2)
('safe-haven', 2)
('read', 2)
('sent', 2)
("company's", 2)
('mass', 2)
('adapt', 2)
('staf

('ok', 1)
('forbidding', 1)
('condition', 1)
('ekes', 1)
('190-point', 1)
('clears', 1)
('minions', 1)
('gru', 1)
('compensation', 1)
('stoppage', 1)
('unequivocal', 1)
('equities', 1)
('impossible', 1)
('fragile', 1)
('economies', 1)
('bracing', 1)
('perfect', 1)
('sense', 1)
('environment', 1)
("he'd", 1)
('fat', 1)
('cats', 1)
('expense', 1)
('ambassador', 1)
('nikki', 1)
('haley', 1)
('resigns', 1)
('opposing', 1)
('math', 1)
('discuss', 1)
('regional', 1)
('compass', 1)
('presents', 1)
('insurmountable', 1)
('obstacles', 1)
('legend', 1)
('bono', 1)
('livestreamed', 1)
('adults', 1)
('overtakes', 1)
('$10,000', 1)
('feds', 1)
('middle', 1)
('class', 1)
('bail', 1)
('greedy', 1)
('dentists', 1)
('heads', 1)
('upset', 1)
('refunding', 1)
('covering', 1)
('predict', 1)
('performance', 1)
('pillars', 1)
('reelection', 1)
('flawed', 1)
('temperatures', 1)
('bucs', 1)
('pundits', 1)
('clashes', 1)
('playing', 1)
('style', 1)
('culture', 1)
('propping', 1)
('federation', 1)
('pet', 1)
('