In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
import re
import gensim
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
import ast

In [2]:
# Preprocessing

def convert_lower_case(data):
    """Return ``data`` converted to text with every character lower-cased.

    The argument is passed through ``str()`` first, so any object is accepted.
    The value comes back from ``np.char.lower`` (a NumPy string value), not a
    built-in ``str``; wrap it in ``str()`` if a plain string is needed.
    """
    as_text = str(data)
    lowered = np.char.lower(as_text)
    return lowered

def remove_stop_words(data):
    """Drop English stop words and single-character tokens from ``data``.

    ``data`` is converted with ``str()`` and tokenized with NLTK's
    ``word_tokenize``. A token is kept when it is not in NLTK's English
    stop-word list and is longer than one character.

    Returns a ``str`` in which every kept token is preceded by one space, so a
    non-empty result starts with a space and an input with no surviving tokens
    yields ``""``.
    """
    # A set makes each membership test constant-time instead of scanning the
    # whole stop-word list for every token.
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(str(data))
    kept = [w for w in tokens if w not in stop_words and len(w) > 1]
    # Join once instead of growing the string token by token; the leading
    # space before each token is kept so the output format does not change.
    return "".join(" " + w for w in kept)

def remove_punctuation(data):
    """Replace punctuation in ``data`` with spaces and delete commas.

    Every character in ``symbols`` (ASCII punctuation plus newline, but not
    the apostrophe or comma) is replaced by a single space. After each
    replacement one pass of "two spaces -> one space" is applied. Commas are
    then removed without leaving a space, so ``1,000`` becomes ``1000``.

    ``data`` may be a ``str`` or a NumPy string array; the result is whatever
    ``np.char.replace`` returns (a NumPy string value, not a built-in ``str``).
    """
    # The backslash is written as "\\" on purpose: a bare "\]" is not a valid
    # escape sequence and newer Python versions emit a warning for it. The
    # resulting characters (a backslash followed by "]") are the same.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # Single, non-overlapping collapse of doubled spaces; longer runs of
        # spaces are only shortened, not fully squeezed.
        data = np.char.replace(data, "  ", " ")
    # Commas are dropped rather than turned into spaces.
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    """Delete every ASCII apostrophe (``'``) from ``data``.

    Nothing is inserted in place of the apostrophe. The result is the value
    produced by ``np.char.replace`` (a NumPy string value).
    """
    apostrophe = "'"
    without_apostrophes = np.char.replace(data, apostrophe, "")
    return without_apostrophes

def stemming(data):
    """Stem every token of ``data`` with NLTK's ``PorterStemmer``.

    ``data`` is converted with ``str()`` and split with ``word_tokenize``.
    Returns a ``str`` in which each stemmed token is preceded by one space
    (so a non-empty result starts with a space).
    """
    porter = PorterStemmer()
    pieces = [" " + porter.stem(token) for token in word_tokenize(str(data))]
    return "".join(pieces)

def convert_numbers(data):
    """Spell out integer tokens in ``data`` as words.

    ``data`` is converted with ``str()`` and tokenized with ``word_tokenize``.
    Each token that ``int()`` can parse is replaced by ``num2words`` output;
    every other token is kept unchanged. Tokens are re-joined with a leading
    space before each one, and finally every hyphen in the joined text is
    replaced by a space.

    Returns the value produced by ``np.char.replace`` (a NumPy string value,
    not a built-in ``str``).
    """
    pieces = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except Exception:
            # Best effort: a token that is not an integer (``int`` raises
            # ValueError), or that num2words refuses to spell out, is kept
            # as-is. ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            pass
        pieces.append(" " + w)
    new_text = "".join(pieces)
    # Replaces every hyphen in the text, including those num2words produces.
    return np.char.replace(new_text, "-", " ")

def preprocess(data):
    """Run the full text-cleaning pipeline over ``data`` and return the result.

    The steps run strictly in the order listed in ``pipeline``; several are
    repeated on purpose because an earlier step can re-introduce what a
    previous one removed. Stemming is intentionally not part of the pipeline.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        remove_punctuation,
        convert_numbers,
        remove_punctuation,   # again: num2words emits hyphens and commas (e.g. "forty-one")
        remove_stop_words,    # again: num2words emits stop words (101 -> "one hundred and one")
    )
    for step in pipeline:
        data = step(data)
    return data

In [3]:
# Years whose Boston Globe CSV files make up the training corpus.
years = [2014]

# Read every yearly file first and concatenate once. This avoids re-copying
# the accumulated frame on every iteration and avoids concatenating real data
# onto an empty placeholder DataFrame.
yearly_frames = [
    pd.read_csv('globe_data/bostonglobe' + str(year) + '.csv')
    for year in years
]
# pd.concat rejects an empty list, so fall back to an empty frame when no
# years are configured.
text_corpus = pd.concat(yearly_frames, axis=0) if yearly_frames else pd.DataFrame()

def custom_standardization(data):
    """Lower-case the ``text`` column of ``data`` and blank out special characters.

    Every entry of ``spec_chars`` (punctuation, an en dash, several stray
    byte-like characters and their literal ``xNN`` spellings, and newline) is
    replaced by a single space. Leading and trailing whitespace is stripped
    once all replacements are done. Interior runs of spaces are not collapsed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column named ``text``. The column is overwritten
        in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    spec_chars = ["!",'"',"#","%","&","'","(",")", "*","+",",",
                      "-",".","/",":",";","<", "=",">","?","@","[",
                      "\\","]","^","_", "`","{","|","}","~","–", 
                      "\xc2", "\xa0", "\x80", "\x9c", "\x99", "\x94", 
                      "\xad", "\xe2", "\x9d", "\n", "x9d", "xc2", 
                      "xa0", "x80", "x9c", "x99", "x94", "xad", "xe2"]

    # did not stem/lemmatize since doing so gave weird errors:
    # words found in an article didn't exist in the corpus for that year

    # Lower-casing does not depend on the character being replaced, so it is
    # done once up front instead of once per entry of spec_chars.
    text = data['text'].str.lower()
    for char in spec_chars:
        # regex=False: entries such as ".", "(", "*" and "\\" must be matched
        # literally; without it the meaning depends on the pandas version's
        # default for ``regex``.
        text = text.str.replace(char, ' ', regex=False)

    # Replacements can leave spaces at either end, so strip last.
    data['text'] = text.str.strip()
    return data

text_corpus = custom_standardization(text_corpus)
print('corpus standardized')
print()

# turn DataFrame into a list of lists of tokens
# The 'text' column is read by name, so extra columns in the CSV no longer
# break the loop, and rows whose text is missing (NaN) are skipped instead of
# raising on ``.lower()``.
documents = [
    article.lower().split()
    for article in text_corpus['text'].dropna()
]

# create Word2Vec model
# the skip-grams method is used here (sg=1), with a window of 10;
# min_count=2 drops words that occur only once in the corpus
model = gensim.models.Word2Vec(window=10, min_count=2, sg=1, workers=10)
model.build_vocab(documents)  # prepare the model vocabulary

# train model on available data
# I use 5 epochs since that's standard
model.train(corpus_iterable=documents, total_examples=len(documents), epochs=5)

corpus standardized



(7129399, 9147370)

In [4]:
# Load the 2014 table of articles by subneighborhood, drop the saved index
# column, fill empty cells with a placeholder tuple literal, and parse the
# 'hyde_park' column from its string form into Python tuples.
df = (
    pd.read_csv('../Word2Vec/subneighborhood_separated_articles/2014.csv')
    .drop(['Unnamed: 0'], axis=1)
    .fillna("('no article', 'no_id')")
    .assign(hyde_park=lambda frame: frame['hyde_park'].apply(ast.literal_eval))
)

In [5]:
# Single-column frame holding only the hyde_park entries.
sub_df = pd.DataFrame(df['hyde_park'])

# Each cell is a 2-tuple whose second element is the article id; cells that
# were empty in the CSV carry the placeholder id 'no_id' and are skipped.
article_ids = [
    article_id
    for _first, article_id in sub_df['hyde_park']
    if article_id != 'no_id'
]

In [12]:
# computation done only for hyde park articles from 2014

In [11]:
# Terms that are never used as keywords.
# NOTE(review): presumably these are tokens missing from the Word2Vec
# vocabulary -- confirm before changing this list.
EXCLUDED_TERMS = {'keefe', 'dunkin', 'audi', 'incuding', 'carty', 'toole'}

# Number of highest-weighted distinct terms kept per article.
MAX_KEYWORDS = 15


def compute_similar_words(model, source_word, topn=5):
    """Return ``source_word`` followed by its ``topn`` nearest words in ``model``.

    If ``source_word`` is not known to the model, the ``KeyError`` arguments
    are printed and the returned list contains only ``source_word``.
    """
    similar_words = [source_word]
    try:
        top_words = model.wv.most_similar(source_word, topn=topn)
        similar_words.extend([val[0] for val in top_words])
    except KeyError as err:
        print(err.args)
    return similar_words


def compute_similar_words_for_all_tasks(model, topn=5, terms=None, article_id=None):
    """Write one CSV of similar words for an article and return it as a frame.

    Parameters
    ----------
    model : trained Word2Vec model, queried through ``compute_similar_words``.
    topn : number of similar words per keyword.
    terms : keywords to look up; when omitted, the notebook-level ``keywords``
        list is used.
    article_id : id used in the output file name; when omitted, the
        notebook-level ``a_id`` is used.

    The frame has columns ``word0`` (the keyword) through ``word<topn>``.
    """
    # Falling back to the notebook globals keeps a plain
    # ``compute_similar_words_for_all_tasks(model)`` call working.
    terms = keywords if terms is None else terms
    article_id = a_id if article_id is None else article_id

    columns = ['word' + str(i) for i in range(topn + 1)]
    rows = []
    for source_word in terms:
        similar_words = compute_similar_words(model, source_word, topn)
        # A keyword the model does not know yields a shorter list; pad it so
        # every row has one value per column instead of raising on a
        # row/column length mismatch.
        similar_words += [None] * (len(columns) - len(similar_words))
        rows.append(similar_words)

    similar_df = pd.DataFrame(rows, columns=columns)
    similar_df.to_csv('similar_words_task/subneighborhood_TFIDF/hyde_park/' + article_id + '_similar_words.csv')
    return similar_df


for a_id in article_ids:
    print('starting work with ' + a_id)
    print()

    # Collect the term/weight table of this article for every year, then
    # concatenate once.
    yearly_scores = []
    for year in years:
        data = pd.read_csv('../TF-IDF/Yearly_TFIDF_Scores_by_Subneighborhood/' + str(year) + '/hyde_park/TFIDF_' + a_id + '.csv')
        data.columns = ['term', 'weight']
        yearly_scores.append(data)

    a_id_TFIDF = pd.concat(yearly_scores, axis=0).sort_values('weight', ascending=False)
    print(a_id + ' term weights sorted')

    # Highest-weighted distinct terms, skipping the excluded ones.
    keywords = []
    for term in a_id_TFIDF['term']:
        if len(keywords) >= MAX_KEYWORDS:
            break  # nothing further down the table can be added
        if term not in keywords and term not in EXCLUDED_TERMS:
            keywords.append(term)

    # finding similar words and creating a csv file
    words = compute_similar_words_for_all_tasks(model, terms=keywords, article_id=a_id)
    print(a_id + ' similar words to most important terms generated')
    print()

starting work with 2014_704

2014_704 term weights sorted
2014_704 similar words to most important terms generated

starting work with 2014_243

2014_243 term weights sorted
2014_243 similar words to most important terms generated

starting work with 2014_853

2014_853 term weights sorted
2014_853 similar words to most important terms generated

starting work with 2014_892

2014_892 term weights sorted
2014_892 similar words to most important terms generated

starting work with 2014_670

2014_670 term weights sorted
2014_670 similar words to most important terms generated

starting work with 2014_81

2014_81 term weights sorted
2014_81 similar words to most important terms generated

starting work with 2014_721

2014_721 term weights sorted
2014_721 similar words to most important terms generated

starting work with 2014_969

2014_969 term weights sorted
2014_969 similar words to most important terms generated

starting work with 2014_630

2014_630 term weights sorted
2014_630 similar 