In [1]:
import pandas as pd

In [2]:
def tokenize(s):
    '''
    Split a string into whitespace-delimited tokens.

    Input:
        String
    Output:
        List of Strings. Any run of whitespace counts as a single
        separator, so an empty or all-blank string yields an empty list.
    '''
    tokens = s.split()
    return tokens

def preprocess(s, lower=True, strip_punc=True):
    '''
    Normalise a headline into a list of cleaned tokens.

    Input:
        s: String (split on whitespace with tokenize) or an already
           tokenized List of Strings
        lower (Bool): lowercase every token
        strip_punc (Bool): strip leading/trailing punctuation from every
           token and drop tokens that end up empty. Punctuation inside a
           token (e.g. "here's", "1,700") is kept.
    Output: List of Strings
    '''
    punc = '.-,?<>:;"\'!%'
    tokens = tokenize(s) if isinstance(s, str) else s
    if lower:
        tokens = [t.lower() for t in tokens]
    if strip_punc:
        # A token made only of punctuation (such as a lone "-") strips down
        # to '', which would otherwise be counted as a word downstream.
        stripped = (t.strip(punc) for t in tokens)
        tokens = [t for t in stripped if t]

    return tokens

def token_frequency(tokens, tf= None, relative=False):
    """
    Count token occurrences, optionally adding to an existing count dict.

    Inputs:
        tokens = List of Strings or None (None is treated as no tokens)
        tf = dict or None. An existing token -> count mapping to add to;
             it is updated in place, so one dict can be shared across
             several calls. None starts from an empty dict.
        relative = Boolean. If True, each count is divided by the total
             count and the result is returned as a new dict; the dict
             passed in as tf keeps the raw counts.
    Output:
        Dictionary of token frequencies
    """
    tf = {} if tf is None else tf
    if tokens is None:
        tokens = []

    if tf and relative:
        # Only the first stored value is inspected: a float there is taken
        # to mean tf already holds relative frequencies, so nothing is added.
        if isinstance(next(iter(tf.values())), float):
            print('WARNING: Adding raw counts to relative frequency')
            return tf

    for token in tokens:
        tf[token] = tf.get(token, 0) + 1

    if relative:
        total = sum(tf.values())
        tf = {k: v / total for k, v in tf.items()}

    return tf


def calc_percent(headlines, term='coronavirus'):
    """
    Percentage of headlines that contain `term`.

    Inputs:
        headlines = sized iterable of headlines (e.g. a list or a pandas
            Series). Each one is tested with `term in headline`, so a
            token list needs an exact token match while a raw string
            matches on any substring.
        term = String to look for (default 'coronavirus')
    Output:
        Float percentage rounded to 2 decimals; 0.0 when headlines is empty
    """
    total = len(headlines)
    if total == 0:
        # Nothing to measure; avoids a ZeroDivisionError on empty input.
        return 0.0
    count = sum(1 for h in headlines if term in h)
    return round(count / total * 100, 2)

def word_counts(headlines):
    """
    Print every token found in the headlines with its count, most frequent first.

    Input: headlines from news articles, held in an object that provides
           .apply (e.g. a pandas Series)
    Output: nothing is returned; one (token, count) tuple is printed per
            distinct token
    """
    # running totals shared by every headline
    totals = {}

    # clean and tokenize each headline
    tokenized = headlines.apply(lambda text: preprocess(text))

    # token_frequency adds to `totals` in place; the value apply returns is unused
    tokenized.apply(lambda toks: token_frequency(toks, tf=totals))

    # highest counts first
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    for entry in ranked:
        print(entry)

In [3]:
# Load the news articles from cnbc_news.csv. publish_date is parsed as dates,
# read as the index, then turned back into a regular column by reset_index().
# Rows containing any null value are dropped.
df = (
    pd.read_csv('cnbc_news.csv', parse_dates=['publish_date'], index_col='publish_date')
    .reset_index()
    .dropna()
)

# Replace each headline with its list of cleaned tokens
df['headline'] = df['headline'].apply(preprocess)
df.tail()

Unnamed: 0,publish_date,headline
812,2020-03-17,"[cramer, calls, on, government, to, enlist, co..."
813,2020-03-17,"[trump, seeking, $1, trillion, stimulus, packa..."
814,2020-03-17,"[espn, reveals, how, it, will, move, forward, ..."
815,2020-03-17,"[regulators, consider, loosening, bank, liquid..."
816,2020-03-17,"[travel, industry, pushes, for, $150, billion,..."


In [4]:
# Percentage of headlines per publish date, as computed by calc_percent.
# NOTE(review): [4:] skips the first four dates - presumably days with too
# few headlines to be meaningful; confirm.
percent_per_day = (
    df.groupby('publish_date')['headline']
    .apply(calc_percent)
)[4:]
print(percent_per_day)

publish_date
2020-03-08    33.33
2020-03-09    43.14
2020-03-10    43.10
2020-03-11    47.48
2020-03-12    50.83
2020-03-13    58.77
2020-03-14    61.90
2020-03-15    53.33
2020-03-16    52.80
2020-03-17    45.22
Name: headline, dtype: float64


In [5]:
# Print token counts for the headlines from row label 252 onward
# (.loc slices by index label and includes the starting label).
# NOTE(review): 252 is presumably the first row of the date range of
# interest - confirm against the data.
word_counts(df.loc[252:,'headline'])

('coronavirus', 301)
('to', 296)
('the', 170)
('and', 106)
('for', 103)
('in', 98)
('as', 84)
('says', 83)
('a', 81)
('of', 75)
('market', 68)
('trump', 65)
('is', 64)
('on', 52)
('from', 50)
('new', 46)
('stocks', 44)
('us', 43)
('live', 43)
('stock', 42)
('are', 39)
('will', 35)
('at', 35)
('updates', 34)
('be', 29)
('amid', 29)
('due', 28)
('more', 28)
('after', 27)
('it', 26)
('with', 26)
('could', 26)
('house', 25)
('travel', 24)
('dow', 23)
('by', 23)
('outbreak', 22)
('up', 22)
('fed', 21)
('down', 21)
('help', 21)
('how', 20)
('crisis', 19)
("here's", 19)
('can', 19)
('not', 19)
('york', 19)
('all', 19)
('pandemic', 18)
('billion', 18)
('white', 18)
('over', 18)
('have', 17)
('home', 17)
('cases', 17)
('recession', 17)
('its', 17)
('may', 16)
('buy', 16)
('some', 16)
('now', 15)
('markets', 14)
('what', 14)
('employees', 14)
('work', 14)
('you', 14)
('about', 14)
('these', 14)
('ban', 13)
('bill', 13)
('amazon', 13)
('30', 13)
('stimulus', 13)
('but', 13)
('mnuchin', 13)
('this

('happening', 1)
('insists', 1)
('ioc', 1)
('skepticism', 1)
('creep', 1)
('wisely', 1)
('corporations', 1)
('broke', 1)
('rule', 1)
('personal', 1)
('grip', 1)
('primaries', 1)
('finally', 1)
('turn', 1)
('basic', 1)
('income', 1)
('favored', 1)
('reality', 1)
('cult', 1)
('trends', 1)
('pot', 1)
('alternative', 1)
('meat', 1)
('hardest', 1)
('1,700', 1)
('hospitalizing', 1)
('19', 1)
('reaches', 1)
('respirators', 1)
('sheryl', 1)
('payment', 1)
('tom', 1)
('brady', 1)
('patriots', 1)
('potentially', 1)
('direct', 1)
('hired', 1)
('openings', 1)
('softbank', 1)
('complete', 1)
('$3', 1)
('tender', 1)
('wework', 1)
('fedex', 1)
('mongodb', 1)
('dc', 1)
('faa', 1)
('tower', 1)
("chicago's", 1)
('midway', 1)
('airport', 1)
('several', 1)
('dips', 1)
('sell', 1)
('pops', 1)
('employment', 1)
('factory', 1)
('musk', 1)
('personally', 1)
('adding', 1)
('flowing', 1)
('institutions', 1)
('leading', 1)
('paradigm', 1)
('needham', 1)
('defensive', 1)
('p&g', 1)
('gundlach', 1)
('ludicrous', 1