In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
import ast

## Preprocessing

In [7]:
def convert_lower_case(data):
    """Lower-case `data` element-wise with ``np.char.lower``.

    `data` may be a single string or an array-like of strings; the result is
    whatever ``np.char.lower`` returns for that input (a NumPy string array).
    """
    lowered = np.char.lower(data)
    return lowered

def remove_stop_words(data):
    """Drop English stop words and single-character tokens from `data`.

    `data` is converted with ``str()`` and tokenized with NLTK's
    ``word_tokenize``. The surviving tokens are returned as one string in
    which every token is preceded by a single space (so a non-empty result
    starts with a space). An empty string is returned when nothing survives.
    """
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would be scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    kept = [w for w in word_tokenize(str(data))
            if w not in stop_words and len(w) > 1]
    # Build the string once instead of re-allocating it for every token.
    return "".join(" " + w for w in kept)

def remove_punctuation(data):
    """Replace punctuation in `data` with spaces and delete commas.

    Each character of `symbols` (including the newline) is replaced by a
    space, and after every replacement one pass collapses double spaces into
    single spaces. Commas are removed last, without inserting a space.

    `data` may be a string or a NumPy string array; the result is the NumPy
    string array produced by ``np.char.replace``.
    """
    # The backslash must be spelled "\\": a bare "\]" is an invalid escape
    # sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    """Delete every ASCII apostrophe (') from `data` via ``np.char.replace``."""
    apostrophe, nothing = "'", ""
    return np.char.replace(data, apostrophe, nothing)

def stemming(data):
    """Stem every token of `data` with NLTK's ``PorterStemmer``.

    `data` is converted with ``str()`` and tokenized with ``word_tokenize``.
    The stems are returned as one string in which each stem is preceded by a
    single space; the result is empty when there are no tokens.
    """
    porter = PorterStemmer()
    stems = (porter.stem(token) for token in word_tokenize(str(data)))
    return "".join(" " + stem for stem in stems)

def convert_numbers(data):
    """Spell out the integer tokens of `data` in words.

    `data` is converted with ``str()`` and tokenized with ``word_tokenize``.
    Every token that ``int()`` can parse is replaced by ``num2words`` of that
    integer; all other tokens are kept unchanged. Tokens are re-joined with a
    leading space each, and hyphens in the result are replaced by spaces.

    Returns the NumPy string array produced by ``np.char.replace``.
    """
    converted = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except Exception:
            # Best effort: a token that is not an integer literal (or that
            # num2words cannot render) is kept as-is. `Exception` rather than
            # a bare `except` so Ctrl-C / SystemExit still propagate.
            pass
        converted.append(w)
    new_text = "".join(" " + w for w in converted)
    return np.char.replace(new_text, "-", " ")

def preprocess(data):
    """Run the full text-normalization pipeline on one article.

    The steps are applied strictly in the order listed below. Some steps are
    repeated on purpose because spelling out numbers re-introduces
    punctuation and stop words. Stemming is currently disabled.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are stripped as the last part of this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        # stemming,           # disabled
        remove_punctuation,
        convert_numbers,
        # stemming,           # disabled (second pass)
        remove_punctuation,   # spelled-out numbers can contain hyphens/commas, e.g. "forty-one"
        remove_stop_words,    # spelled-out numbers can contain stop words, e.g. 101 -> "one hundred and one"
    )
    for step in pipeline:
        data = step(data)
    return data

## Creating documents out of the neighborhood-separated articles

In [2]:
# Placeholder for the per-neighborhood token lists; it is re-created and
# filled from `documents` in a later cell.
processed_text = []

In [3]:
# Load the 2014 articles: one column per neighborhood, each cell holding a
# stringified (article_text, article_id) tuple.
df = pd.read_csv('../Entity_Recognition/Neighborhood_Separated_Articles/2014.csv')

In [4]:
# Drop the index column that was written into the CSV. errors='ignore'
# keeps this cell re-runnable: a second run finds the column already gone
# instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,dorchester,roxbury,mattapan,hyde_park,fenway,beacon_hill,downtown,south_boston,east_boston,back_bay,...,charlestown,brighton,allston,west_end,roslindale,north_end,mission_hill,harbor_islands,longwood_medical_area,west_roxbury
0,('Before entering a South African prison in 19...,('n individuals who worked in finance insuran...,('The public scrap over who will serve as the ...,('The Boston Police Department has issued a co...,('Wahlburgers the burger joint owned by Dorch...,('Martin J Walsh becomes the 48th mayor of Bo...,('Last week Mayor Marty Walsh declared to the...,('The 20 year old who allegedly beat a disable...,('Saying that Massachusetts is facing “the big...,('First Night Boston almost didn’t happen this...,...,('Bright skies belie the early spring chill as...,('Boston’s new mayor Martin J Walsh has rep...,('A small army of city inspectors will soon be...,('Shakespeare’s “A Midsummer Night’s Dream” se...,('Last February Brandon John joined more than...,('When the offered season ticket holders the ...,('They were immigrants from Iraq and India th...,('Rain crashes down on onto the tarp thunder ...,"('no article', 'no_id')",('It’s been almost a full month since Thomas M...
1,('A woman’s toe was bitten off during a fight ...,('Dr Kenneth C Edelin whose historic 1975 m...,('In what police described as a horrific trage...,('nOn the weekend before he was to be sworn in...,('n n n and are expanding their burger bus...,('One bone chilling Saturday night about 11 mo...,('The Federal Reserve must take a “patient app...,('n and Michelle Wu stood side by side at the ...,('MEMPHIS — The thank yous were almost endless...,('Chances are if you walked past the firehous...,...,('As neighbors living a world apart they awok...,('The 26 year old Chinese entrepreneur had jus...,('n n for help The harrowing leaps from smoky...,('nis here the nascent organizing committee...,('Don’t ask how the sausage is made the old s...,('As America’s most Irish American city it co...,('Beth Israel Deaconess Medical Center said We...,"('no article', 'no_id')","('no article', 'no_id')",('Mayor Martin J Walsh’s pick for acting fire...
2,('It’s the ideal time for a tech journalist to...,('When they go up they form the centerpiece o...,('The fatal shooting today of a 9 year old Mat...,('Boston saw its first two homicides of 2014 o...,('Mayor Martin J Walsh plans to set aside spa...,('nhe Boston Public Library is using three dim...,('ncooler buses For decades economists like ...,('Jack Conley who grew up in South Boston and...,('Marty Walsh has turned the casino licensing ...,('Stores selling luxury goods — once modest an...,...,('When Boston firefighters pulled up to a vaca...,('It was just some children playing baseball o...,('n n for help The harrowing leaps from smoky...,('nis here the nascent organizing committee...,('Former Boston councilor Felix D Arroyo the...,('In nine days Wegmans will open a new superm...,('The Rev Joseph Hung Duc Tran had been ordai...,"('no article', 'no_id')","('no article', 'no_id')",('His cousin would come to Children’s Hospital...
3,('The first major snowstorm of 2014 has intens...,('Good Morning Chief Justice Ireland Presiden...,('During a highly publicized Boston Police Dep...,('An elderly woman became Boston’s first fire ...,('By last Christmas Frank Nunes seemed to hav...,('QUINCY — Massachusetts Democrats saw a new B...,('When Ari S Heckman was growing up in Provid...,('Mayor Martin J Walsh said Wednesday that he...,('Marty Walsh has turned the casino licensing ...,('A midday altercation Friday in a bustling co...,...,('Two Charlestown teenagers — a 14 year old gi...,('Boston police are asking for the public’s he...,('n n for help The harrowing leaps from smoky...,('The first thing everyone noticed about Dawnn...,('nn the middle of evening rush hour at South ...,('n n n nThe 150 dancing singing and drummin...,('nLittle about the ordinary looking two famil...,"('no article', 'no_id')","('no article', 'no_id')",('The is offering 15 percent off purchases fr...
4,('One Fund Boston which initially collected $...,('Good Morning Chief Justice Ireland Presiden...,('At Greater Love Tabernacle Church today mor...,('n officials in Boston want to give 4 500 mid...,('SALEM — Martha Coakley had already made the ...,('The state’s highest court has thrown out the...,('Marty Walsh had never met Arianna Huffington...,('The state’s largest gay rights organization ...,('While he may not have been a household name ...,('nn a brisk November morning in 2001 Hubert ...,...,('A Charlestown man was sentenced to up to 12 ...,('noffice Mayor Marty Walsh has properly focu...,('Mayor Martin J Walsh replaced on Monday the...,('Dawnn Jaffier the 26 year old Brighton woma...,('nn the middle of evening rush hour at South ...,('“Maybe I need a psychiatrist ” says Donato F...,('For 24 years Ellen Wade and Maureen Brodoff...,"('no article', 'no_id')","('no article', 'no_id')",('A 33 year old man with a criminal record dat...


In [5]:
# Missing cells get the same placeholder the data already uses for
# "no article", so every cell can be parsed uniformly.
df = df.fillna("('no article', 'no_id')")

# Turn each stringified (article, id) tuple into a real tuple, for every
# column. Cells that are no longer strings are left alone, so re-running
# this cell does not feed tuples back into ast.literal_eval.
for col in df.columns:
    df[col] = df[col].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

In [6]:
# Token list per neighborhood. Keys are inserted in `df.columns` order so
# that anything consuming this dict positionally lines up with the column
# labels (later cells map position i back to a name through df.columns).
documents = {}

for col in df.columns:
    tokens = []
    # Each cell is an (article_text, article_id) tuple; only the text is used.
    for article, _ in df[col]:
        if article != 'no article':
            # extend() appends in place; rebuilding the list with `+` would
            # copy every previously collected token for each article.
            tokens.extend(word_tokenize(preprocess(article)))
    documents[col] = tokens
    print(col + ' DONE')

NameError: name 'preprocess' is not defined

In [87]:
# One token list per neighborhood, in `df.columns` order. Later cells treat
# position i of this list as column i of `df`, so the order must follow
# df.columns rather than the insertion order of `documents`.
processed_text = [documents[col] for col in df.columns]

In [88]:
#processed_text

In [89]:
# Document frequency: for every token, the number of neighborhood documents
# that contain it at least once.
DF = {}

for tokens in processed_text:
    # dict.fromkeys de-duplicates while keeping first-seen order, so each
    # document adds at most 1 to a token's count.
    for w in dict.fromkeys(tokens):
        DF[w] = DF.get(w, 0) + 1

In [90]:
# Number of distinct tokens across all neighborhood documents
# (one DF entry per token).
total_vocab_size = len(DF)
total_vocab_size

33882

In [91]:
# get the number of documents in which this word occurs
def doc_freq(word):
    """Return the document frequency stored in `DF` for `word`.

    Words that are not in `DF` have a document frequency of 0.
    """
    return DF.get(word, 0)

In [92]:
# TF-IDF score for every (document index, token) pair.
tf_idf = {}

# Number of documents (one per neighborhood); constant for the whole loop.
n_docs = len(processed_text)

for doc, tokens in enumerate(processed_text):

    # count the number of times each token occurs in the text for a given neighborhood
    counter = Counter(tokens)

    # get the total number of terms for a document (given neighborhood)
    words_count = len(tokens)

    # Counter keys are already unique, so no separate de-duplication pass is
    # needed. An empty document has no keys and never reaches the division.
    for token, count in counter.items():

        # compute term frequency
        tf = count / words_count

        # compute smoothed inverse document frequency
        idf = np.log((n_docs + 1) / (doc_freq(token) + 1))

        # compute tf-idf score
        tf_idf[doc, token] = tf * idf

In [93]:
#tf_idf

In [108]:
# Neighborhood labels by position; used below to turn the document index
# stored in each tf_idf key back into a neighborhood name.
subs = df.columns

In [95]:
# One (term, score) list per neighborhood. Keys are taken from the DataFrame
# so they always match the column labels used to index this dict.
tf_idf_scores = {col: [] for col in df.columns}

In [96]:
# Group the flat {(doc index, term): score} mapping by neighborhood name.
# NOTE: this appends, so re-running the cell without re-creating
# `tf_idf_scores` first duplicates every entry.
for (sub_ind, term), score in tf_idf.items():
    tf_idf_scores[subs[sub_ind]].append((term, score))

In [97]:
output_dir = 'Yearly_TFIDF_Scores_by_Subneighborhood/2014'
# to_csv does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

for col in subs:
    # sorted() leaves tf_idf_scores untouched, so this cell can be re-run.
    ranked = sorted(tf_idf_scores[col], reverse=True)
    temp = pd.DataFrame(ranked, columns=['term', 'weight'])
    # mergesort is stable: rows with equal weight keep the descending-term
    # order established above, which makes the CSV deterministic.
    temp = temp.sort_values(by=['weight'], ascending=False, kind='mergesort')
    temp.to_csv(output_dir + '/TFIDF_' + col + '.csv')

In [98]:
# Loop variable left over from the export loop above; shows the last
# neighborhood that was written.
col

'west_roxbury'

In [99]:
# most important words for a neighborhood, ranked by TF-IDF weight
# look for similar words to most-important terms for each

# most important terms here are the top 15-20 terms for a given neighborhood
# across all years

# check to see if duplicate articles exist; if those do, remove those

# look at how similar TF-IDF results are for different neighborhoods
# as well as the most-similar words

# check if we get similar results from similar neighborhoods
# are black and white distinctive, or are all unique, or 
# is there no clear pattern?

In [100]:
# eventual goal is to assign topic to each article

# when looking at tf-idf scores, if there is a clear trend
# in the top words for a given neighborhood, that might be indicative
# of the general topic associated with a neighborhood