In [1]:
import pandas as pd

In [5]:
def tokenize(s):
    '''
    Input: 
        String
    Output: 
        List of Strings
    
    '''
    return s.split()

def preprocess(s, lower=True, strip_punc=True):
    '''
    Input: String, lower(Bool), strip_punc(Bool)
    Output: List of Strings
    '''
    punc = '.-,?<>:;"\'!%'
    if isinstance(s, str):
        s = tokenize(s)
    if lower:
        s = [t.lower() for t in s]
    if strip_punc:
        s = [t.strip(punc) for t in s]
        
    return s

def token_frequency(tokens, tf= None, relative=False):
    """
    Inputs: 
        tokens = List of Strings or None
        tf = dict or None
        relative = Boolean
    Output: 
        Dictionary of a token frequencies
    """
    tf = {} if tf==None else tf
    
    if len(tf) != 0 and relative==True:
        if isinstance(list(tf.items())[0][1], float):
            print('WARNING: Adding raw counts to relative frequency')
            return tf
        
    for token in tokens:
        if token in tf:
            tf[token] += 1
        else:
            tf[token] = 1
    
    if relative:
        total = sum([v for k,v in tf.items()])
        tf = {k:v/total for k, v in tf.items()}
          
    return tf

def word_counts(headlines):
    """
    Input: a list of headlines from news articles
    Output: full word counts for all words from the articles
    """
    # define empty dict to collect all words
    tf = {}
    
    # preprocess all headlines
    headlines = headlines.apply(lambda x: preprocess(x))
    
    # calc token frequency of all word tokens in headlines
    headlines.apply(lambda x: token_frequency(x,tf=tf))
    
    # sort tokens by frequency 
    sorted_tokens = sorted(tf.items(), key= lambda x:x[1], reverse=True)
    for token in sorted_tokens:    #vector = {k:counts[v] for v,k in enumerate(labels)}
        print(token)

In [22]:
# read in wsj articles
df = pd.read_csv('wsj_news.csv',parse_dates=['publish_date'])

# drop all articles that do not have a date associated with it 
df = df[df['publish_date'].isnull() == False]

# fill null values in any cell with an empty string
df.fillna('', inplace=True)

# combine both headlines and descriptions into one col
df['all_words'] = df['headline'] + ' ' + df['description']
df.tail()

Unnamed: 0,publish_date,headline,description,all_words
1187,2020-03-17,The Vintage Datsun That Quietly Turned Trendy,These 2000 models from 1968 were an afterthoug...,The Vintage Datsun That Quietly Turned Trendy ...
1188,2020-03-17,New Normal Amid Coronavirus: Working From Home...,"Tapping grandparents, other backup babysitters...",New Normal Amid Coronavirus: Working From Home...
1189,2020-03-17,Stocks Extend Gains In Choppy Trading,"Dow industrials, S&P 500 rise more than 4% in ...",Stocks Extend Gains In Choppy Trading Dow indu...
1190,2020-03-17,"Coronavirus Upends Senior-Housing Business, Se...",Any additional contagion in care facilities co...,"Coronavirus Upends Senior-Housing Business, Se..."
1191,2020-03-17,Coronavirus Pushes Factories to Stagger Shifts...,Manufacturers in the U.S. keep producing for n...,Coronavirus Pushes Factories to Stagger Shifts...


In [23]:
word_counts(df['all_words'])

('to', 988)
('the', 970)
('of', 610)
('in', 590)
('a', 517)
('and', 466)
('for', 325)
('as', 270)
('on', 264)
('coronavirus', 257)
('is', 225)
('u.s', 213)
('new', 197)
('with', 191)
('are', 183)
('from', 163)
('that', 123)
('it', 113)
('at', 106)
('after', 99)
('more', 97)
('by', 96)
('says', 86)
('over', 85)
('up', 80)
('has', 79)
('but', 71)
('have', 70)
('will', 70)
('an', 69)
('its', 68)
('their', 67)
('about', 66)
('than', 65)
('be', 62)
('trump', 57)
('can', 54)
('how', 54)
('companies', 54)
('first', 53)
('two', 52)
('million', 51)
('cases', 50)
('you', 48)
('could', 48)
('china', 47)
('who', 46)
('what', 45)
('this', 45)
('former', 45)
('some', 45)
('stocks', 45)
('home', 44)
('into', 44)
('york', 44)
('people', 43)
('company', 43)
('president', 43)
('his', 43)
('say', 43)
('was', 42)
('market', 41)
('time', 40)
('ceo', 40)
('they', 39)
('virus', 39)
('investors', 39)
('or', 38)
('business', 38)
('court', 38)
('billion', 36)
('out', 36)
('top', 35)
('global', 35)
('dow', 35)
(

('worry', 3)
('born', 3)
('feb', 3)
('comply', 3)
('improving', 3)
('shoppers', 3)
('barclays', 3)
('abuses', 3)
('closing', 3)
('call', 3)
('800', 3)
('drama', 3)
('fired', 3)
('football', 3)
('photos', 3)
('crimes', 3)
('protests', 3)
('disputed', 3)
('uses', 3)
('dominated', 3)
('great', 3)
('individual', 3)
('unprecedented', 3)
('crazy', 3)
('erdogan', 3)
('high-profile', 3)
('mike', 3)
('carry', 3)
('shock', 3)
('slashes', 3)
('turmoil', 3)
('foods', 3)
('quarantined', 3)
('directors', 3)
('immediate', 3)
('mystery', 3)
('france', 3)
('korean', 3)
('humanitarian', 3)
('wears', 3)
('designer', 3)
('rapidly', 3)
('tom', 3)
('person', 3)
('walmart', 3)
('canada', 3)
('ended', 3)
('grocers', 3)
('‘i’m', 3)
('ideas', 3)
('asked', 3)
('starting', 3)
('jet', 3)
('testing', 3)
('stadiums', 3)
('happens', 3)
('brady', 3)
('contract', 3)
('source', 3)
('break', 3)
('3,000', 3)
('rig', 3)
('playing', 3)
('view', 3)
('shareholder', 3)
('bank’s', 3)
('staley', 3)
('central-bank', 3)
('struggli

('ruder', 1)
('weathered', 1)
('black', 1)
('1980s', 1)
('northwestern', 1)
('scrutinizes', 1)
('sherman', 1)
('organizing', 1)
('unionization', 1)
('stanford', 1)
('berkeley', 1)
('clayton', 1)
('survived', 1)
('flopped', 1)
('cowboy', 1)
('blew', 1)
('1990', 1)
('gubernatorial', 1)
('censures', 1)
('baer', 1)
('fifa', 1)
('identify', 1)
('clients', 1)
('switzerland’s', 1)
('container', 1)
('iranian', 1)
('parliamentary', 1)
('prevented', 1)
('simulation', 1)
('platform', 1)
('improperly', 1)
('gathers', 1)
('tracks', 1)
('behavior', 1)
('frank', 1)
('witter', 1)
('2021', 1)
('german', 1)
('merck', 1)
('scholefield', 1)
('nike', 1)
('coca-cola', 1)
('he’s', 1)
('offset', 1)
('export', 1)
('greyhound', 1)
('friday’s', 1)
('memo', 1)
('consent', 1)
('e*trade', 1)
('‘financial', 1)
('supermarket’', 1)
('grab', 1)
('neutra-designed', 1)
('filmmaker', 1)
('forster', 1)
('stucco', 1)
('santa', 1)
('monica', 1)
('commissioned', 1)
('hopefuls', 1)
('1930s', 1)
('fake-accounts', 1)
('$3', 1)
(

('next%s', 1)
('upended', 1)
('shoe', 1)
('forms', 1)
('colleagues', 1)
('$3.4', 1)
('$3.6', 1)
('malaysia', 1)
('ap', 1)
('exotic', 1)
('imbued', 1)
('passion', 1)
('‘contagion', 1)
('prayer’', 1)
('priests', 1)
('stoke', 1)
('devotion', 1)
('faithful', 1)
('isolated', 1)
('clergy', 1)
('ministering', 1)
('towns', 1)
('twitter’s', 1)
('leap', 1)
('cyclical', 1)
('oddities', 1)
('cycles', 1)
('messy—and', 1)
('referendum', 1)
('vaccine-exemption', 1)
('halting', 1)
('nonmedical', 1)
('exemptions', 1)
('circulating', 1)
('auto-emissions', 1)
('roadblock', 1)
('friction', 1)
('drafting', 1)
('frustrating', 1)
('revamp', 1)
('commemorated', 1)
('slain', 1)
('rebuke', 1)
('formula', 1)
('recoveries', 1)
('high-complexity', 1)
('visual', 1)
('withdraw', 1)
('seemed', 1)
('congenitally', 1)
('unable', 1)
('quit', 1)
('neurochemical', 1)
('bypasses', 1)
('censuring', 1)
('homeowners’', 1)
('misfortune', 1)
('borrowers', 1)
('hurricane', 1)
('flood', 1)
('brush', 1)
('fire—but', 1)
('troubles'

('switched', 1)
('grass', 1)
('endorses', 1)
('tommy', 1)
('tuberville', 1)
('sessions', 1)
('coach', 1)
('accusers', 1)
('closure', 1)
('accuses', 1)
('video-technology', 1)
('turnstyle', 1)
('dirty', 1)
('plastics', 1)
('‘science', 1)
('experiments’', 1)
('dick’s', 1)
('firearms', 1)
('scaled', 1)
('incentives', 1)
('electric-car', 1)
('scouting', 1)
('pickup', 1)
('truck', 1)
('sport-utility', 1)
('vehicle', 1)
('jokes', 1)
('coupon', 1)
('sells', 1)
('paypal', 1)
('$60', 1)
('honey', 1)
('ruan', 1)
('unfinished', 1)
('21,000-square-foot', 1)
('bel-air', 1)
('off-market', 1)
('fever-detecting', 1)
('goggles', 1)
('disinfectant', 1)
('drones', 1)
('solutions', 1)
('bugs', 1)
('glitches', 1)
('asia’s', 1)
('fruit', 1)
('dwindle', 1)
('rebuff', 1)
('payroll-tax', 1)
('mnuchin', 1)
('pelosi', 1)
('record—yale’s', 1)
('princeton’s', 1)
('women—will', 1)
('conference’s', 1)
('automatic', 1)
('tournament', 1)
('sequel', 1)
('footing', 1)
('‘containment', 1)
('area,’', 1)
('rallies', 1)
('1

In [24]:
# group by day and accumulate all text for that day from news
df_text= df.groupby('publish_date')['all_words'].transform(lambda x: ' '.join(x)).drop_duplicates().reset_index()

# sort df by day
df_text.sort_values(by='publish_date', inplace=True)

# preprocess all text for a given day
df_text['all_words'] = df_text['all_words'].apply(lambda x: preprocess(x))

# count token frequencys
df_text['counts'] = df_text['all_words'].apply(lambda x: token_frequency(x,tf=None))

# store coronavirus counts in its own col
df_text['coronavirus_count'] = df_text['counts'].apply(lambda x: x.get('coronavirus',0))

df_text.tail()

KeyError: 'publish_date'

In [98]:
dates = []
freqs = []
for i in range(len(df_text)):
    dates.append(df_text.loc[i,'publish_date'])
    freqs.append(df_text.loc[i,'counts'].get('coronavirus',0))

df_final = pd.DataFrame({'Date':dates,'Coronavirus':freqs})