In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
import ast

In [2]:
# Preprocessing

def convert_lower_case(data):
    """Lower-case ``data`` element-wise with ``np.char.lower``.

    The result is whatever ``np.char.lower`` produces: a NumPy string array
    (zero-dimensional when a single string is passed in).
    """
    lowered = np.char.lower(data)
    return lowered

def remove_stop_words(data):
    """Drop English stop words and single-character tokens from ``data``.

    ``data`` is converted with ``str()`` and split with ``word_tokenize``.

    Returns:
        A plain ``str`` in which every kept token is preceded by one space
        (so a non-empty result starts with a space). If no token survives,
        the empty string is returned.
    """
    # A set gives constant-time membership checks; the list returned by
    # stopwords.words() would otherwise be scanned once per token.
    stop_words = set(stopwords.words('english'))
    kept = [
        w for w in word_tokenize(str(data))
        if w not in stop_words and len(w) > 1
    ]
    # Join once instead of growing a string inside the loop; the leading
    # space before each token is part of the existing output format.
    return "".join(" " + w for w in kept)

def remove_punctuation(data):
    """Replace punctuation and newlines in ``data`` with spaces; delete commas.

    Each symbol in the list below is replaced by a single space, and any
    double space created by that replacement is collapsed straight away.
    Commas are removed outright (no space inserted), so "1,000" becomes
    "1000". Apostrophes are not touched here.

    Returns:
        The NumPy string array produced by ``np.char.replace``
        (zero-dimensional when ``data`` is a single string).
    """
    # The backslash is spelled "\\": a bare backslash in front of "]" is an
    # invalid escape sequence that newer Python versions warn about.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # collapse the double space the replacement may have just produced
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    """Delete every straight apostrophe (') from ``data``.

    Nothing is inserted in its place, so "don't" becomes "dont". The result
    is the NumPy string array returned by ``np.char.replace``.
    """
    without_apostrophes = np.char.replace(data, "'", "")
    return without_apostrophes

def stemming(data):
    """Stem every token of ``data`` with a ``PorterStemmer``.

    ``data`` is converted with ``str()`` and split with ``word_tokenize``.
    Returns a plain ``str`` where each stem is preceded by a single space.
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)

def convert_numbers(data):
    """Spell out integer tokens in ``data`` as English words.

    ``data`` is converted with ``str()`` and split with ``word_tokenize``.
    Tokens that ``int()`` can parse are replaced by ``num2words(int(token))``;
    every other token is kept unchanged. Hyphens in the rebuilt text are then
    replaced by spaces.

    Returns:
        The NumPy string array produced by ``np.char.replace``; each token is
        preceded by a single space.
    """
    converted = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # ValueError: the token is not an integer literal.
            # OverflowError: presumably what num2words raises for values
            # beyond its supported range. Either way the token is kept
            # as-is. Other exceptions (e.g. KeyboardInterrupt) propagate.
            pass
        converted.append(w)
    new_text = "".join(" " + w for w in converted)
    return np.char.replace(new_text, "-", " ")

def preprocess(data):
    """Run the full text-cleaning pipeline over ``data`` and return the result.

    The steps run strictly in the order listed below. Several steps appear
    more than once because a later step can reintroduce what an earlier one
    removed. Stemming is intentionally not part of the pipeline.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,  # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        remove_punctuation,
        convert_numbers,
        remove_punctuation,  # again: num2words yields hyphens and commas (e.g. "forty-one")
        remove_stop_words,   # again: num2words yields stop words (101 -> "one hundred and one")
    )
    for step in pipeline:
        data = step(data)
    return data

In [3]:
# Load the 2014 per-subneighborhood article table and drop the
# 'Unnamed: 0' column (presumably an index saved along with the CSV).
df = pd.read_csv(
    '../Word2Vec/subneighborhood_separated_articles/2014.csv'
).drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,wellington_hill,southern_mattapan,west_codman_hill,west_lower_mills,washington_hill,mattapan,franklin_field_south,dorchester_center,dorchester,bowdoin_north,...,st._elizabeth's,china_town,aberdeen,bellevue_hill,downtown_crossing,chestnut_hill_neighborhood_association,back_bay_west,west_end,telegraph_hill,beachmont
0,"('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')",('nGovernor Deval Patrick and several other po...,"('no article', 'no_id')","('no article', 'no_id')",('Miosoti Santana watched with a grin Sunday a...,"('no article', 'no_id')",...,"('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')"
1,"('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')",('At Morning Star Baptist Church on Sunday Bi...,"('no article', 'no_id')","('no article', 'no_id')",('Supporters of former state senator Dianne Wi...,"('no article', 'no_id')",...,"('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')"
2,"('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')",('The Massachusetts House voted today to expel...,"('no article', 'no_id')","('no article', 'no_id')",('More than three years after she disappeared ...,"('no article', 'no_id')",...,"('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')"
3,"('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')",('Rev Bruce Wall is one of those guys who can...,"('no article', 'no_id')","('no article', 'no_id')",('Market Basket workers don’t have a union Bu...,"('no article', 'no_id')",...,"('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')"
4,"('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')",('Boston police seized five illegal firearms i...,"('no article', 'no_id')","('no article', 'no_id')",('To those who watched Mark Wahlberg transform...,"('no article', 'no_id')",...,"('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')","('no article', 'no_id')"


In [4]:
# Missing cells receive the placeholder string for "no article", so that
# every hyde_park cell can be parsed below from its string form into a
# Python object (an (article, id) pair, as unpacked in later cells).
df = df.fillna(value="('no article', 'no_id')")
df['hyde_park'] = df['hyde_park'].apply(ast.literal_eval)

In [5]:
# Single-column frame holding only the hyde_park entries.
sub_df = df[['hyde_park']]
sub_df.head()

Unnamed: 0,hyde_park
0,(SANDWICH — The mutilated body was found Wedne...
1,(Boston saw its first two homicides of 2014 on...
2,(Boston police arrested eight men and recovere...
3,(Score one for the frail and defenseless In Ja...
4,(n nFrom a pinch of salt to a drop of water M...


In [6]:
hyde_park_docs = []

# Preprocess and tokenize every real article; rows whose text is the
# 'no article' placeholder are skipped.
for row in sub_df.itertuples(index=False):
    article, article_id = row.hyde_park
    if article == 'no article':
        continue
    hyde_park_docs.append((word_tokenize(preprocess(article)), article_id))
    print(f'article {len(hyde_park_docs)} done.')

counter = len(hyde_park_docs)
print()
print(f'{counter} articles total')

article 1 done.
article 2 done.
article 3 done.
article 4 done.
article 5 done.
article 6 done.
article 7 done.
article 8 done.
article 9 done.
article 10 done.
article 11 done.
article 12 done.
article 13 done.
article 14 done.
article 15 done.
article 16 done.
article 17 done.
article 18 done.
article 19 done.
article 20 done.
article 21 done.
article 22 done.
article 23 done.
article 24 done.
article 25 done.
article 26 done.
article 27 done.
article 28 done.
article 29 done.
article 30 done.
article 31 done.

31 articles total


In [7]:
# Shallow copy of the (tokens, article_id) pairs collected above.
processed_text = list(hyde_park_docs)

In [8]:
DF = {}

# keep track of which documents in the subneighborhood mention each token
for i in range(len(hyde_park_docs)):
    for w in processed_text[i][0]:
        # setdefault creates the set the first time a token is seen, so no
        # exception handling is needed and unrelated errors are not hidden.
        DF.setdefault(w, set()).add(i)

# collapse each set of document indices into a document-frequency count
# (only values are reassigned, so iterating over the dict is safe)
for w in DF:
    DF[w] = len(DF[w])

In [9]:
# Number of distinct tokens that received a document-frequency entry in DF.
total_vocab_size = len(DF)
total_vocab_size

4690

In [10]:
# get the number of documents in which this word occurs
def doc_freq(word):
    """Return the document frequency recorded for ``word`` in ``DF``.

    Args:
        word: the token to look up in the module-level ``DF`` mapping.

    Returns:
        The value stored in ``DF`` for ``word``, or 0 when ``word`` has no
        entry.
    """
    # dict.get handles the missing-key case explicitly instead of hiding
    # every possible error behind a bare except.
    return DF.get(word, 0)

In [11]:
doc = 0

# maps (article_id, token) -> tf-idf score
tf_idf = {}

for i in range(len(hyde_park_docs)):

    # every processed entry is a (tokens, article_id) pair
    tokens, a_id = processed_text[i]

    # occurrences of each token within this article
    term_counts = Counter(tokens)

    # total number of tokens in this article
    n_terms = len(tokens)

    # np.unique yields each distinct token once, in sorted order
    for token in np.unique(tokens):

        # term frequency: share of the article's tokens that are this token
        tf = term_counts[token] / n_terms

        # smoothed inverse document frequency (+1 in numerator and denominator)
        idf = np.log((len(hyde_park_docs) + 1) / (doc_freq(token) + 1))

        tf_idf[a_id, token] = tf * idf

    doc += 1

In [12]:
# Display the whole (article_id, token) -> score mapping built in the previous cell.
tf_idf

{('2014_704', 'abdomen'): 0.004641610584106776,
 ('2014_704', 'added'): 0.004143522299923273,
 ('2014_704', 'affi'): 0.005283758067258074,
 ('2014_704', 'afternoon'): 0.006784936400644701,
 ('2014_704', 'ago'): 0.0028314984943381854,
 ('2014_704', 'alarmed'): 0.006188814112142368,
 ('2014_704', 'also'): 0.0007371466224789661,
 ('2014_704', 'anyone'): 0.004143522299923273,
 ('2014_704', 'anything'): 0.005662996988676371,
 ('2014_704', 'anywhere'): 0.010567516134516147,
 ('2014_704', 'area'): 0.0025963187718876803,
 ('2014_704', 'arms'): 0.004641610584106776,
 ('2014_704', 'around'): 0.002010684252987031,
 ('2014_704', 'asked'): 0.0023835728348244553,
 ('2014_704', 'attorney'): 0.004143522299923273,
 ('2014_704', 'authorities'): 0.005283758067258074,
 ('2014_704', 'awake'): 0.006188814112142368,
 ('2014_704', 'away'): 0.0047671456696489105,
 ('2014_704', 'back'): 0.0014118807114810502,
 ('2014_704', 'bags'): 0.005283758067258074,
 ('2014_704', 'barnstable'): 0.006188814112142368,
 ('2014

In [13]:
# Ids of all real articles, in row order; placeholder rows carry 'no_id'.
article_ids = [
    article_id
    for _text, article_id in sub_df['hyde_park']
    if article_id != 'no_id'
]

In [14]:
# Empty term-by-article table: one row per vocabulary token, one column per article id.
tf_idf_hyde_park = pd.DataFrame(columns=article_ids, index=list(DF))

In [15]:
# Copy every (article_id, term) score into its cell of the term-by-article table.
for (a_id, term), score in tf_idf.items():
    # One .loc call with (row, column). Chained indexing such as
    # .loc[term][a_id] = ... assigns into an intermediate object and is not
    # guaranteed to modify the frame itself.
    tf_idf_hyde_park.loc[term, a_id] = score

In [16]:
# The frame was created empty, so it has object dtype. Cast to float
# explicitly before filling rather than relying on fillna to infer a numeric
# dtype. Terms absent from an article get a weight of 0.0.
tf_idf_hyde_park = tf_idf_hyde_park.astype(float).fillna(0.0)
tf_idf_hyde_park

Unnamed: 0,2014_704,2014_243,2014_853,2014_892,2014_670,2014_81,2014_721,2014_969,2014_630,2014_1099,...,2014_1107,2014_83,2014_749,2014_1159,2014_654,2014_818,2014_490,2014_712,2014_798,2014_381
sandwich,0.018566,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.012415,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
mutilated,0.005284,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.007066,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
body,0.037366,0.005978,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.049969,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
found,0.014301,0.000000,0.011998,0.001816,0.0,0.001728,0.0,0.003055,0.0,0.006375,...,0.0,0.0,0.0,0.0,0.002378,0.0,0.0,0.0,0.0,0.000000
wednesday,0.018683,0.000000,0.000000,0.002847,0.0,0.000000,0.0,0.000000,0.0,0.009994,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
redevelopment,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.006054
aggravation,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.006054
alienating,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.006054
friendly,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.006054


In [17]:
# Write one CSV per article: its terms sorted by descending tf-idf weight.
output_dir = 'Yearly_TFIDF_Scores_by_Subneighborhood/2014/hyde_park'

# to_csv does not create missing directories, so make sure the target exists
# before writing (needed on a fresh checkout).
os.makedirs(output_dir, exist_ok=True)

for col in tf_idf_hyde_park.columns:
    temp = pd.DataFrame(tf_idf_hyde_park[col])
    temp.columns = ['weight']
    temp = temp.sort_values('weight', ascending=False)
    temp.to_csv(output_dir + '/TFIDF_' + col + '.csv')