In [1]:
import pandas as pd
import gensim
from collections import defaultdict

In [2]:
#'south_boston_waterfront' not included
subs = ['fenway', 'beacon_hill', 'downtown', 'south_boston', 'east_boston', 'back_bay', 'jamaica_plain',
        'south_end', 'charlestown', 'brighton', 'allston', 'west_end', 'roslindale', 'north_end',
        'mission_hill', 'harbor_islands', 'longwood_medical_area', 'dorchester', 'roxbury', 'mattapan', 'hyde_park']

In [4]:
years = [2014, 2015, 2016, 2017, 2018]
text_corpus = pd.DataFrame()
for year in years:
    temp = pd.read_csv('globe_data/bostonglobe' + str(year) + '.csv')
    text_corpus = pd.concat([text_corpus, temp], axis=0)

def custom_standardization(data):

    spec_chars = ["!",'"',"#","%","&","'","(",")", "*","+",",",
                      "-",".","/",":",";","<", "=",">","?","@","[",
                      "\\","]","^","_", "`","{","|","}","~","–", 
                      "\xc2", "\xa0", "\x80", "\x9c", "\x99", "\x94", 
                      "\xad", "\xe2", "\x9d", "\n", "x9d", "xc2", 
                      "xa0", "x80", "x9c", "x99", "x94", "xad", "xe2"]

    for char in spec_chars:
        data['text'] = data['text'].str.strip()
        #data['text'] = str(data['text']).lower()
        data['text'] = data['text'].str.replace(char, ' ')
        #data['text'] = stemmer.stem(str(data['text']))

    return data

text_corpus = custom_standardization(text_corpus)
print('corpus standardized')
print()
    
# turn DataFrame into a list of lists of tokens
documents = []
for row in text_corpus.values:
    [row] = row
    temp = row.lower().split()
    documents.append(temp)

# create Word2Vec model
# the skip-grams method is used here, with a window of 10
model = gensim.models.Word2Vec(window=10, min_count=2, sg=1, workers=10)
model.build_vocab(documents)  # prepare the model vocabulary

# train model on available data
# I use 5 epochs since that's standard
model.train(corpus_iterable=documents, total_examples=len(documents), epochs=5)

for sub in subs:
    print('starting work with ' + sub)
    print()
    sub_TFIDF = pd.DataFrame()
    for year in years:
        data = pd.read_csv('../TF-IDF/Yearly_TFIDF_Scores_by_Neighborhood/' + str(year) + '/' + 'TFIDF_' + sub + '.csv')
        data = data.drop(['Unnamed: 0'], axis=1)
        sub_TFIDF = pd.concat([sub_TFIDF, data], axis=0)

    sub_TFIDF = sub_TFIDF.sort_values('weight', ascending=False)
    print(sub + ' term weights sorted')
    
    keywords = []
    for row in sub_TFIDF.itertuples(index=False):
        if len(keywords) < 15 and row.term not in keywords:
            if row.term != 'hokule':
                keywords.append(row.term)

    # finding similar words and creating a csv file

    def compute_similar_words(model,source_word, topn=5):
        similar_words = [source_word]
        try:
            top_words = model.wv.most_similar(source_word, topn=topn)
            similar_words.extend([val[0] for val in top_words])
        except KeyError as err:
            print(err.args)
        return similar_words    

    def compute_similar_words_for_all_tasks(model, topn=5):
        columns = ['word' + str(i - 1) for i in range(1, topn + 2)]
        df = pd.DataFrame(data=None, columns=columns)
        for source_word in keywords:
            similar_words = compute_similar_words(model, source_word, topn)
            df.loc[len(df)] = similar_words
        df.to_csv('similar_words_task/neighborhood_TFIDF/' + sub + '_similar_words.csv')
    
    words = compute_similar_words_for_all_tasks(model)
    print(sub + ' similar words to most important terms generated')
    print()

corpus standardized

starting work with fenway

fenway term weights sorted
fenway similar words to most important terms generated

starting work with beacon_hill

beacon_hill term weights sorted
beacon_hill similar words to most important terms generated

starting work with downtown

downtown term weights sorted
downtown similar words to most important terms generated

starting work with south_boston

south_boston term weights sorted
south_boston similar words to most important terms generated

starting work with east_boston

east_boston term weights sorted
east_boston similar words to most important terms generated

starting work with back_bay

back_bay term weights sorted
back_bay similar words to most important terms generated

starting work with jamaica_plain

jamaica_plain term weights sorted
jamaica_plain similar words to most important terms generated

starting work with south_end

south_end term weights sorted
south_end similar words to most important terms generated

starting 

ValueError: cannot set a row with mismatched columns

In [13]:
# allston: lots of names; a few different terms, but no clear pattern
# back bay: a couple names; lots of words to do with biology/biotech
# beacon hill: lots of names; one legal/law enforcement term
# brighton: lots of names; one legal/law enforcement term
# charlestown: no names; several terms to do with shapes; one legal/law enforcement term
# downtown: lots of names; a couple legal/law enforcement terms
# east boston: a couple names; a couple technical terms; no clear pattern
# fenway: a few names; one political term; one legal/law enforcement term (debatable)
# harbor islands: lots of names; several different terms, but no clear pattern
# jamaica plain: no names; a lot of camping/sailing related terms
# longwood medical center: almost all names
# mission hill: almost all names
# north end: several names; a couple legal/law enforcement terms
# roslindale: several names; one legal/law enforcement related term; a couple sports related terms
# south boston: a lot of names; a couple legal/law enforcement terms
# south end: a lot of names; no clear pattern among the rest
# west end: almost all names; one political term

# dorchester: a couple names; one political term
# hyde park: almost all names
# mattapan: a couple names; a couple political terms; several sport (football) related terms
# roxbury: several names; a couple political terms; a couple legal/law enforcement related terms

In [14]:
# trend across white neighborhoods: occurrence of terms to do with legal/law enforcement
# trend across black neighborhoods: occurrence of terms to do with politics

In [15]:
# potential next step: remove names from the corpus and repeat the entire process
# concern: doing the exact same thing for Derry; currently struggling with removing names

In [1]:
# potential further step: compare the same word w.r.t. different neighborhoods 
# for example, generate word embeddings for the articles for each neighborhood and compare the word embeddings for a given term

# lemmatize articles, remove stopwords, remove names from corpus, repeat entire process

In [2]:
# can narrow down focus to work with individual articles
# build word embedding on an article
# look for certain keywords using some method and see how close those keywords are to their vectors from the entire corpus for that neighborhood (or perhaps, group of neighborhoods)

# let's say you have an article talking about an election in dorchester
# build a word embedding model on article
# take keyword election
# extract closest word to that keyword
# take vector embedding for election and compare its vector with the
    # rest of the articles for the neighborhood
# this is essentially an extra step in case top TF-IDF words don't give enough information

# also separate articles by sub-neighborhood

In [29]:
#1 cleaning the data (remove duplicates) DONE
#2 separate black and white neighborhoods and then by sub-neighborhood DONE
#3 work with a subset of the data
#4 get TF-IDF weights for each article in that subset
    #4.5 article:sub-neighborhood articles = neighborhood articles:text corpus
#5 get word embeddings for the top words
#6 get sub-neighborhood names from here: https://drive.google.com/file/d/1le8X9VQwO-cM4VVAAb4quAlgerr94T0x/view