In [1]:
import pandas as pd
import gensim

In [2]:
# Subneighborhood identifiers processed by this notebook.
# 'south_boston_waterfront' is not included.
subs = [
    'fenway',
    'beacon_hill',
    'downtown',
    'south_boston',
    'east_boston',
    'back_bay',
    'jamaica_plain',
    'south_end',
    'charlestown',
    'brighton',
    'allston',
    'west_end',
    'roslindale',
    'north_end',
    'mission_hill',
    'harbor_islands',
    'longwood_medical_area',
    'dorchester',
    'roxbury',
    'mattapan',
    'hyde_park',
]

In [3]:
# Years to load; one CSV per year named bostonglobe<year>.csv.
years = [2014, 2015, 2016, 2017, 2018]

# Read every yearly file first and concatenate once. Calling pd.concat inside
# the loop re-copies all previously loaded rows on each iteration.
# Row order and the (non-reset) per-file index are the same as before.
yearly_frames = [
    pd.read_csv('../../Word2Vec/globe_data/bostonglobe' + str(year) + '.csv')
    for year in years
]
text_corpus = pd.concat(yearly_frames, axis=0)

def custom_standardization(data):
    """Replace punctuation and escape-sequence residue in ``data['text']`` with spaces.

    For every entry of ``spec_chars`` (in order) the text is stripped of
    leading/trailing whitespace and each occurrence of the entry is replaced
    by a single space. Case is left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named ``'text'``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its ``'text'`` column is
        overwritten with the cleaned text (the frame is modified in place).
    """
    spec_chars = ["!",'"',"#","%","&","'","(",")", "*","+",",",
                      "-",".","/",":",";","<", "=",">","?","@","[",
                      "\\","]","^","_", "`","{","|","}","~","–",
                      "\xc2", "\xa0", "\x80", "\x9c", "\x99", "\x94",
                      "\xad", "\xe2", "\x9d", "\n", "x9d", "xc2",
                      "xa0", "x80", "x9c", "x99", "x94", "xad", "xe2"]

    text = data['text']
    for char in spec_chars:
        # regex=False: every entry is meant literally. Many of them (".", "(",
        # "*", "+", "?", "[", "\\", "^", "|") are regex metacharacters, and the
        # default of `regex` differs between pandas versions, so be explicit.
        # The strip stays inside the loop so that spaces introduced at the
        # edges by the previous replacement are removed, as before.
        text = text.str.strip().str.replace(char, ' ', regex=False)

    # Assign once instead of rewriting the column twice per iteration.
    data['text'] = text
    return data

# Clean the 'text' column of the combined corpus. custom_standardization
# writes into the frame it receives and returns that frame, so this rebinds
# the same name to the cleaned data.
text_corpus = custom_standardization(text_corpus)
# Progress marker followed by one blank line.
print('corpus standardized')
print()

corpus standardized



In [4]:
# Turn the standardized corpus into a list of token lists, one per row:
# lower-case the text and split on whitespace.
# The 'text' column is read explicitly instead of unpacking each row of
# `.values` (which only worked when the frame had exactly one column).
# Non-string entries (e.g. NaN from a missing text) are skipped because they
# have no `.lower()` and would abort the whole cell.
documents = [
    text.lower().split()
    for text in text_corpus['text']
    if isinstance(text, str)
]

In [5]:
def tagged_document(list_of_list_of_words):
    """Lazily wrap each token list in a gensim ``TaggedDocument``.

    Parameters
    ----------
    list_of_list_of_words : iterable of list of str
        One token list per document.

    Yields
    ------
    gensim.models.doc2vec.TaggedDocument
        The tokens of each document, tagged with a one-element list holding
        the document's zero-based position in the input.
    """
    make_tagged = gensim.models.doc2vec.TaggedDocument
    position = 0
    for tokens in list_of_list_of_words:
        yield make_tagged(tokens, [position])
        position += 1
# Materialize the tagged documents: the corpus is passed to both
# build_vocab and train below, so a one-shot generator would not do.
data_for_training = [*tagged_document(documents)]

# Doc2Vec with 40-dimensional vectors, ignoring words seen fewer than
# 2 times, trained for 5 epochs.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=40,
    min_count=2,
    epochs=5,
)
model.build_vocab(data_for_training)
model.train(
    data_for_training,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [8]:
# Number of distinct top-weighted terms kept per subneighborhood.
MAX_KEYWORDS = 15
# NOTE(review): 'hokule' is skipped explicitly; the reason is not visible in
# this notebook -- confirm before changing.
EXCLUDED_TERMS = ('hokule',)


def compute_similar_words(model, source_word, topn=5):
    """Return ``source_word`` followed by its most similar vocabulary words.

    Parameters
    ----------
    model : trained gensim model exposing ``model.wv.most_similar``
    source_word : str
        Word to look up.
    topn : int, default 5
        Number of neighbours requested.

    Returns
    -------
    list
        ``[source_word, neighbour_1, ...]``. If the lookup raises ``KeyError``
        (the word is not in the vocabulary) the error arguments are printed
        and only ``[source_word]`` is returned.
    """
    similar_words = [source_word]
    try:
        top_words = model.wv.most_similar(source_word, topn=topn)
        similar_words.extend(word for word, _score in top_words)
    except KeyError as err:
        print(err.args)
    return similar_words


def compute_similar_words_for_all_tasks(model, topn=5, source_words=None, output_path=None):
    """Write a CSV with one row per source word and its similar words.

    Parameters
    ----------
    model : trained gensim model, forwarded to ``compute_similar_words``
    topn : int, default 5
        Neighbours per source word; the table has ``topn + 1`` columns named
        ``word0`` .. ``word<topn>``.
    source_words : iterable of str, optional
        Words to look up. When omitted, the notebook-level ``keywords``
        variable is used (this is what the two-argument call relied on).
    output_path : str, optional
        Destination CSV. When omitted, it is built from the notebook-level
        ``sub`` variable.

    Returns
    -------
    pandas.DataFrame
        The table that was written to ``output_path``.
    """
    if source_words is None:
        source_words = keywords
    if output_path is None:
        output_path = 'similar_words_task/subneighborhood_TFIDF/' + sub + '_similar_words.csv'

    columns = ['word' + str(i) for i in range(topn + 1)]
    rows = []
    for source_word in source_words:
        similar_words = compute_similar_words(model, source_word, topn)
        # An out-of-vocabulary word yields fewer than topn + 1 entries.
        # Pad with None so every row matches the column count; assigning a
        # short row used to raise "cannot set a row with mismatched columns".
        padding = [None] * (len(columns) - len(similar_words))
        rows.append(similar_words + padding)

    # Build the frame once instead of enlarging it row by row.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_path)
    return df


for sub in subs:
    print('starting work with ' + sub)
    print()

    # Load this subneighborhood's yearly TF-IDF tables and combine them in a
    # single concat (no repeated copying inside the loop).
    yearly_scores = [
        pd.read_csv(
            '../../TF-IDF/Yearly_TFIDF_Scores_by_Subneighborhood/' + str(year) + '/' + 'TFIDF_' + sub + '.csv'
        ).drop(['Unnamed: 0'], axis=1)
        for year in years
    ]
    sub_TFIDF = pd.concat(yearly_scores, axis=0)

    sub_TFIDF = sub_TFIDF.sort_values('weight', ascending=False)
    print(sub + ' term weights sorted')

    # Highest-weighted distinct terms, in descending weight order.
    keywords = []
    for term in sub_TFIDF['term']:
        if term in EXCLUDED_TERMS or term in keywords:
            continue
        keywords.append(term)
        if len(keywords) == MAX_KEYWORDS:
            break  # nothing further can be added; stop scanning

    # Find similar words for each keyword and write them to a CSV file.
    words = compute_similar_words_for_all_tasks(
        model,
        source_words=keywords,
        output_path='similar_words_task/subneighborhood_TFIDF/' + sub + '_similar_words.csv',
    )
    print(sub + ' similar words to most important terms generated')
    print()

starting work with fenway

fenway term weights sorted
fenway similar words to most important terms generated

starting work with beacon_hill

beacon_hill term weights sorted
beacon_hill similar words to most important terms generated

starting work with downtown

downtown term weights sorted
downtown similar words to most important terms generated

starting work with south_boston

south_boston term weights sorted
south_boston similar words to most important terms generated

starting work with east_boston

east_boston term weights sorted
east_boston similar words to most important terms generated

starting work with back_bay

back_bay term weights sorted
back_bay similar words to most important terms generated

starting work with jamaica_plain

jamaica_plain term weights sorted
jamaica_plain similar words to most important terms generated

starting work with south_end

south_end term weights sorted
south_end similar words to most important terms generated

starting work with charlestown