In [1]:
import pandas as pd
import numpy as np

In [2]:
from pymongo import MongoClient
import pprint

In [3]:
# Connect to the hosted MongoDB instance
client = MongoClient()

In [4]:
db = client.Street_Advisor

In [5]:
reviews = db.neighborhood

In [6]:
allreviews = reviews.find({},{'_id': 0})
reviewlist=[]
for review in allreviews:
    reviewlist.append(review)

In [7]:
df=pd.DataFrame(reviewlist)

In [8]:
df = df[['reviews','name']]
df[0:10]

Unnamed: 0,reviews,name
0,[\nOk I’m gonna let you in on one of my runnin...,Alki
1,[\nWant a neighborhood close to the city with ...,Arbor Heights
2,"[\nLocated just north of Interstate 90, betwee...",Atlantic
3,[\nOk I had to write a review for Ballard just...,Ballard
4,"[\nThe area of Beacon Hill, known more specifi...",Beacon Hill
5,[\nSome of my colleagues and I occasionally li...,Belltown
6,[\nWe looked at many Shoreline and North Seatt...,Briarcrest
7,[\nCarkeek park in Broadview is one of our fav...,Broadview Bitter Lake
8,[\nThe Epicenter of Seattle Culture\n\nA place...,Broadway (Capitol Hill)
9,"[\nLake City has some junky parts, where you d...",Cedar Park / Meadowbrook


In [None]:
#Remove the neighborhoods dont have reviews

In [9]:
empty_review_list=[]
for i in range(df.shape[0]):
    if len(df['reviews'][i])==0:
        empty_review_list.append(df['name'][i])
empty_review_list

[u'Parkwood', u'Ridgecrest', u'Westminster Triangle']

In [None]:
# reset index

In [10]:
for i in empty_review_list:
    df = df[df.name != i]

In [11]:
df.reset_index(drop=True,inplace=True)

In [13]:
df[40:50]

Unnamed: 0,reviews,name
40,[\nThe Mt. Baker community is as different eco...,Mount Baker
41,[\nJust moved here recently because of the qua...,North Admiral
42,"[\nThere’s nowhere in Seattle that is as nice,...",North Beach Blue Ridge
43,[\nNorth Beacon Hill is another section of the...,North Beacon Hill
44,"[\nEveryone wants to live in the forest, well ...",North Delridge
45,[\nNorth Queen Anne has some lovely houseboats...,North Queen Anne
46,"[\nNorthgate is one of the biggest, most north...",Northgate
47,[\nThe greatest thing about the Olympia Hills ...,Olympic Hills / Victory Heights
48,[\nWe went to Phinney Ridge neighborhood to go...,Phinney Ridge / Greenwood
49,"[\nAfter the Space Needle, Pike Place Market i...",Pike Market


In [None]:
# TFID Vectorizer

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
import string
punctuations_=set(string.punctuation)
textreview=[]
for i in range(df.shape[0]):
    textreview.append(''.join(word for word in df['reviews'][i] if word not in punctuations_))

In [56]:
# remove all words which contains number
#import re
reviewtext=[]
for review in textreview:
    reviewtext.append(' '.join(s for s in review.split() if not any(c.isdigit() for c in s)))
#    reviewtext.append(re.sub(r'\w*\d\w*', '', review).strip())

In [57]:
vectorizer = TfidfVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(reviewtext)
review_vectors=vectors.toarray()

In [66]:
feature_array = np.array(vectorizer.get_feature_names())
tfidf_sorting = np.argsort(vectors.toarray()).flatten()[::-1]

n = 100
ordered_features = feature_array[tfidf_sorting][:n]

In [67]:
ordered_features

array([u'yesler', u'terrace', u'housing', u'area', u'subsidized',
       u'seattle', u'ghetto', u'community', u'public', u'neighborhood',
       u'american', u'projects', u'income', u'asian', u'teens', u'council',
       u'university', u'hill', u'african', u'redevelopment', u'calls',
       u'rate', u'developments', u'harborview', u'center', u'acres',
       u'economic', u'residents', u'plans', u'jackson', u'consists',
       u'development', u'build', u'people', u'low', u'asians', u'mixed',
       u'plan', u'rich', u'population', u'diversity', u'live',
       u'mcmansions', u'squashed', u'profanity', u'rowhouses',
       u'impoverished', u'redeveloped', u'harsh', u'proclaimed',
       u'betterment', u'deuce', u'redo', u'staging', u'brag', u'homework',
       u'spirits', u'drenched', u'investors', u'entrepreneurial',
       u'heritages', u'poorest', u'slum', u'succeeds', u'butts',
       u'suburbanites', u'proposing', u'prize', u'currently',
       u'interstate', u'considered', u'turn',

In [69]:
add_stopwords=[u'neighborhood',u'housing',u'area',u'seattle']

In [63]:
len(reviewtext)

71

In [70]:
from nltk.corpus import stopwords
stopwords_ = set(stopwords.words('english'))

In [71]:
for word in add_stopwords:
    stopwords_.add(word)

In [None]:
## Add Stemmed TFIDF Vecorizer

### SnowballStemmer

In [72]:
#Snowvall Stemmer
from nltk.stem.snowball import SnowballStemmer
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        stemmer_porter = SnowballStemmer('english')
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: [stemmer_porter.stem(word) for word in analyzer(doc)]    

In [73]:
stemvectorizer=StemmedTfidfVectorizer(stop_words=stopwords_)
stemmedvectors=stemvectorizer.fit_transform(textreview)
stemmed_review_vectors=stemmedvectors.toarray()

In [74]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(stemmed_review_vectors[0:1], stemmed_review_vectors) 

In [75]:
n=3
neighborhoods_list = df['name']
sb_stemmed_similatiry_dict={}
for i in range(len(neighborhoods_list)):
    cos_sim = cosine_similarity(stemmed_review_vectors[i:(i+1)], stemmed_review_vectors)
    order = list(cos_sim.argsort()[0][::-1][1:n])
    top_three = neighborhoods_list[order]
    sb_stemmed_similatiry_dict[neighborhoods_list[i]]=top_three.values.tolist()

### PorterStemmer

In [55]:
from nltk.stem.porter import PorterStemmer
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        stemmer_porter = PorterStemmer()
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: [stemmer_porter.stem(word) for word in analyzer(doc)]    

In [56]:
stemvectorizer=StemmedTfidfVectorizer(stop_words='english')
stemmedvectors=stemvectorizer.fit_transform(textreview)
stemmed_review_vectors=stemmedvectors.toarray()

In [58]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(stemmed_review_vectors[0:1], stemmed_review_vectors) 

In [59]:
n=3
neighborhoods_list = df['name']
ps_stemmed_similatiry_dict={}
for i in range(len(neighborhoods_list)):
    cos_sim = cosine_similarity(stemmed_review_vectors[i:(i+1)], stemmed_review_vectors)
    order = list(cos_sim.argsort()[0][::-1][1:n])
    top_three = neighborhoods_list[order]
    ps_stemmed_similatiry_dict[neighborhoods_list[i]]=top_three.values.tolist()

In [70]:
{k: sb_stemmed_similatiry_dict[k] for k in sb_stemmed_similatiry_dict.keys()[:2]}

{u'Cedar Park / Meadowbrook': [u'Green Lake',
  u'Downtown (Central Business District)'],
 u'Stevens (Capitol Hill)': [u'Lower Queen Anne',
  u'Downtown (Central Business District)']}

In [71]:
{k: ps_stemmed_similatiry_dict[k] for k in ps_stemmed_similatiry_dict.keys()[:2]}

{u'Cedar Park / Meadowbrook': [u'Green Lake',
  u'Downtown (Central Business District)'],
 u'Stevens (Capitol Hill)': [u'Lower Queen Anne',
  u'Downtown (Central Business District)']}

In [76]:
{k: sb_stemmed_similatiry_dict[k] for k in sb_stemmed_similatiry_dict.keys()[:2]}

{u'Cedar Park / Meadowbrook': [u'Green Lake', u'Haller Lake'],
 u'Stevens (Capitol Hill)': [u'Madison Park', u'Broadway (Capitol Hill)']}

In [31]:
n=3
neighborhoods_list = df['name']
similatiry_dict={}
for i in range(len(neighborhoods_list)):
    cos_sim = cosine_similarity(review_vectors[i:(i+1)], review_vectors)
    order = list(cos_sim.argsort()[0][::-1][1:n])
    top_three = neighborhoods_list[order]
    similatiry_dict[neighborhoods_list[i]]=top_three.values.tolist()

In [36]:
similatiry_dict['University District']

[u'Ravenna / Bryant', u'Montlake']

In [44]:
similatiry_dict['Alki']

[u'North Admiral', u'North Beach Blue Ridge']

In [48]:
stemmed_similatiry_dict['Alki']

[u'North Admiral', u'Downtown (Central Business District)', u'Green Lake']

In [49]:
stemmed_similatiry_dict['Downtown (Central Business District)']

[u'Pike Market', u'Broadway (Capitol Hill)', u'Green Lake']