In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [69]:
df=pd.read_csv('news_file_spd.csv')
df

Unnamed: 0,title,media,desc
0,Shooting in 23rd and Jackson parking lot sends...,CHS Capitol Hill Seattle,Seattle Police and Seattle Fire were called to...
1,Five Easy Ways to Increase Crime in Your City,Newsmax,Seattle's Jason Rantz published an analysis of...
2,New chamber of commerce in Seattle's internati...,Q13 FOX,"PL Davis, Commander of Collaborative Police Bu..."
3,5 SPD officers disciplined for potential viola...,Flipboard,Seattle's Office of Police Accountability (OPA...
4,City May Relinquish Control Over Homelessness ...,PubliCola,3. When Seattle's 911 dispatch center left the...
...,...,...,...
255,More Seattle police officers admit they attend...,KUOW,The Office of Police Accountability is investi...
256,Seattle Police Department: Three more officers...,,The Seattle Police Department said three more ...
257,5 Seattle officers now under investigation for...,Crosscut,... two during a meeting with Seattle's Commun...
258,Three more Seattle police officers report bein...,,The Seattle Police Department says three more ...


In [70]:
#Number of unique news sources
len(df.media.unique())

40

In [71]:
#Top 10 news sources
top_10=df.media.value_counts().head(10)
top_10

MyNorthwest.com             33
KOMO                        21
KIRO-TV                     20
South Seattle Emerald       20
CHS Capitol Hill Seattle    19
The Seattle Times           18
KING 5                      11
Q13 FOX                     10
Patch                        9
KUOW                         8
Name: media, dtype: int64

In [72]:
#Tokenize and Normalize (standardize) the text by removing all punctuation 
#from 'title' and 'desc' making all text lower case

df.title = df.title.apply(lambda x: nltk.RegexpTokenizer(r"\w+").tokenize(x.lower()))
df.desc = df.desc.apply(lambda x: nltk.RegexpTokenizer(r"\w+").tokenize(x.lower()))

In [73]:
#Remove Stop works from the text
df.title = df.title.apply(lambda x: [word for word in x if word not in stopwords.words('english')])
df.desc = df.desc.apply(lambda x: [word for word in x if word not in stopwords.words('english')])

In [74]:
df.head()

Unnamed: 0,title,media,desc
0,"[shooting, 23rd, jackson, parking, lot, sends,...",CHS Capitol Hill Seattle,"[seattle, police, seattle, fire, called, parki..."
1,"[five, easy, ways, increase, crime, city]",Newsmax,"[seattle, jason, rantz, published, analysis, s..."
2,"[new, chamber, commerce, seattle, internationa...",Q13 FOX,"[pl, davis, commander, collaborative, police, ..."
3,"[5, spd, officers, disciplined, potential, vio...",Flipboard,"[seattle, office, police, accountability, opa,..."
4,"[city, may, relinquish, control, homelessness,...",PubliCola,"[3, seattle, 911, dispatch, center, left, seat..."


In [122]:
from nltk.probability import FreqDist

#Initialize FreqDist objects
fdist_title = FreqDist()
fdist_desc = FreqDist()
#Measure of the frequency of words in the 'title' column
for lists_ in df['title']:
    for words in lists_:
        fdist_title[words]+=1
        
title_word_freq = pd.Series(fdist_title, name='freq_title').sort_values(ascending=False)

#Measure of the frequency of words in the 'desc' column
for lists_ in df['desc']:
    for words in lists_:
        fdist_desc[words]+=1
        
desc_word_freq = pd.Series(fdist_desc, name='freq_desc').sort_values(ascending=False)



In [134]:
#Calculating the top 50 words 
word_freq_table = pd.DataFrame(desc_word_freq).join(title_word_freq)

word_freq_table['freq_total']= word_freq_table['freq_desc']+word_freq_table['freq_title']
word_freq_table = word_freq_table.fillna(int(0))
word_freq_table.head(50).sort_values(by=['freq_total','freq_desc'], ascending=False)

Unnamed: 0,freq_desc,freq_title,freq_total
seattle,310,162.0,472.0
police,241,108.0,349.0
spd,141,69.0,210.0
department,162,17.0,179.0
officers,62,27.0,89.0
city,53,28.0,81.0
council,24,21.0,45.0
man,21,23.0,44.0
year,32,9.0,41.0
chief,24,17.0,41.0


In [135]:
#Save clean, preprocessed dataframe to csv
df.to_csv('news_clean_tokens.csv', index=False)