In [None]:
import pandas as pd
import numpy as np
import string
#NLP
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import pos_tag

In [None]:
# Read the street-review table and the Seattle listing table from disk.
# Both CSVs carry an 'Unnamed: 0' column, which is dropped right after loading.
df_reviews = pd.read_csv('data/street_reviews.csv').drop('Unnamed: 0', axis=1)
df_sample = pd.read_csv('data/data_seattle.csv').drop('Unnamed: 0', axis=1)

In [None]:
# Show the column names of the table read from data/street_reviews.csv.
df_reviews.columns

In [None]:
# Show the column names of the table read from data/data_seattle.csv.
df_sample.columns

In [None]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer output is run through an English Snowball stemmer."""

    def build_analyzer(self):
        """Return a callable that maps a document to a list of stemmed terms.

        The callable first applies the analyzer built by the parent class and
        then calls ``SnowballStemmer('english').stem`` on every term it yields.
        """
        stemmer = SnowballStemmer('english')
        # Name this class (not the parent) in super() so the MRO lookup starts
        # right after StemmedTfidfVectorizer and does not skip TfidfVectorizer.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        # NOTE(review): stem() receives each term exactly as the parent analyzer
        # emits it; with ngram_range above 1 those terms are whole n-gram
        # strings, not single words -- confirm this is the intended behaviour.
        return lambda doc: [stemmer.stem(term) for term in analyzer(doc)]

In [None]:
# Let's keep these tokens (i.e. do not add them to the stop words below):
#u'bath', u'bathroom',u'bed', u'bedroom', u'bedrooms', u'beds',u'br', u'ba', u'appliances', u'appls',
#       u'apps',

In [None]:
# Extra stop words added on top of NLTK's English list.
add_stopwords=[u'1st', u'2nd', u'3rd', u'alki', u'anne',
       u'arbor', u'area', u'atlantic', u'baker',
       u'ballard', u'bay', u'beach', u'beacon',
       u'belltown', u'bitter',
       u'blaine', u'blue', u'broadview', u'broadway', u'bryant',
       u'business', u'capitol', u'cedar', u'central', u'city', u'columbia',
       u'delridge', u'denny', u'district', u'downtown',
       u'east', u'eastlake', u'fairmount', u'fauntleroy', u'floor',
       u'floors', u'fremont', u'gatewood', u'genesee', u'georgetown',
       u'green', u'greenwood', u'haller', u'harrison', u'heights', u'high',
       u'highland', u'hill', u'hills', u'house', u'housing',
       u'interbay', u'international', u'lake', u'laurelhurst', u'leschi',
       u'licton', u'lower', u'madison', u'madrona', u'magnolia', u'mann',
       u'market', u'meadowbrook', u'minor', u'montlake', u'mount',
       u'neighborhood', u'north', u'northgate', u'olympic', u'park',
       u'phinney', u'pike', u'pioneer', u'point', u'portage', u'queen',
       u'rainier', u'ravenna', u'ridge', u'riverview', u'roxhill', u'sand',
       u'seattleadmiral', u'seaview', u'seward', u'south', u'springs',
       u'square', u'stevens', u'sunset', u'terrace', u'union',
       u'university', u'victory', u'wallingford', u'wedgeview', u'west',
       u'westlake', u'whittier', u'yesler']
# Final stop-word set: NLTK English stop words plus the extra words above.
stopwords_ = set(stopwords.words('english')) | set(add_stopwords)

In [None]:
def fix_abrv(text_):
    """Expand listing abbreviations and strip literal ``\\n`` sequences.

    Every occurrence of the two-character sequence backslash + ``n`` is
    removed first. The text is then split on single spaces and each token
    that exactly equals a known abbreviation (case-sensitive) is replaced by
    its long form; all other tokens and the original spacing are preserved.

    Unlike a chain of ``replace(' ba ', ...)`` calls, this also expands
    abbreviations at the very start or end of the text and abbreviations
    that repeat back to back (e.g. ``'ba ba'``).

    Parameters
    ----------
    text_ : str
        Raw text.

    Returns
    -------
    str
        Text with the abbreviations expanded.
    """
    abbreviations = {
        'ba': 'bathroom',
        'bdrm': 'bedroom',
        'br': 'bedroom',
        'bdr': 'bedroom',
        'appls': 'appliances',
        'flrs': 'floors',
        'flr': 'floor',
    }
    without_newline_marks = text_.replace('\\n', '')
    # split(' ') (not split()) keeps runs of spaces intact when re-joined.
    tokens = without_newline_marks.split(' ')
    return ' '.join(abbreviations.get(token, token) for token in tokens)

In [None]:
def post_tag_nouns_adj(text_):
    """Keep only tokens whose POS tag starts with 'JJ' or 'NN'.

    The text is split on whitespace, tagged with ``pos_tag``, and the
    surviving tokens are joined back together with single spaces.
    """
    wanted_prefixes = ('JJ', 'NN')
    kept_tokens = []
    for tagged in pos_tag(text_.split()):
        token, tag = tagged[0], tagged[1]
        if tag.startswith(wanted_prefixes):
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
def remove_punctuations(text_):
    """Return ``text_`` with every character found in ``string.punctuation`` removed."""
    to_strip = set(string.punctuation)
    kept_chars = []
    for char in text_:
        if char in to_strip:
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [None]:
def remove_digits(text_):
    """Drop every whitespace-separated token that contains a digit.

    The remaining tokens are joined with single spaces.
    """
    digit_free = []
    for token in text_.split():
        contains_digit = False
        for char in token:
            if char.isdigit():
                contains_digit = True
                break
        if not contains_digit:
            digit_free.append(token)
    return ' '.join(digit_free)

In [None]:
def clean_text(text_list,post_tagged=False):
    """Clean every text in ``text_list`` and return the results as a list.

    Each text goes through ``fix_abrv`` and ``remove_punctuations``. Then:

    * if ``post_tagged`` is truthy, ``post_tag_nouns_adj`` is applied
      (digit-bearing tokens are NOT removed on this path);
    * otherwise ``remove_digits`` is applied.

    Parameters
    ----------
    text_list : iterable of str
        Texts to clean (a list or a pandas Series).
    post_tagged : bool, default False
        Selects the POS-filter path instead of the digit-removal path.

    Returns
    -------
    list of str
        Cleaned texts, in input order.
    """
    cleaned_text_list = []
    # Iterate over the values themselves: ``text_list[i]`` is a label lookup on
    # a pandas Series and fails (or picks the wrong row) when the index is not
    # the default 0..n-1 range, e.g. after filtering without reset_index.
    for raw_text in text_list:
        no_punc = remove_punctuations(fix_abrv(raw_text))
        if post_tagged:
            cleaned_text_list.append(post_tag_nouns_adj(no_punc))
        else:
            cleaned_text_list.append(remove_digits(no_punc))
    return cleaned_text_list

In [None]:
def top_three_dictionary(text_list,label_list,posttagged = False, ngrammin=2,ngrammax=2):
    """Map every label to the labels of its most similar texts.

    Texts are cleaned with ``clean_text``, vectorised with
    ``StemmedTfidfVectorizer`` (stop words from ``stopwords_``) and compared
    with cosine similarity.

    Parameters
    ----------
    text_list : sequence of str
        One text per item.
    label_list : sequence
        One label per item, aligned with ``text_list`` by position.
    posttagged : bool, default False
        Forwarded to ``clean_text`` as ``post_tagged``.
    ngrammin, ngrammax : int, default 2
        Lower and upper bound of the n-gram range used by the vectorizer.

    Returns
    -------
    dict
        ``{label: [similar_label, ...]}``. Each list holds the ``n - 1 = 2``
        most similar *other* items (fewer if fewer items exist), ordered from
        most to least similar. The size is kept at two because
        ``home_reccomender_part_ii`` consumes a pair of alternatives.
    """
    ## get cleaned textreview
    textreview = clean_text(text_list, post_tagged=posttagged)

    ## stemmed vectorizer
    # stop_words is handed over as a list: recent scikit-learn releases reject
    # a set. ngram_range now honours ngrammax (it was previously ignored).
    vectorizer = StemmedTfidfVectorizer(stop_words=list(stopwords_),
                                        ngram_range=(ngrammin, ngrammax))
    vectors = vectorizer.fit_transform(textreview)
    # One call on the sparse matrix yields the full item-by-item similarity
    # matrix; no dense copy of the TF-IDF vectors is needed.
    similarity = cosine_similarity(vectors)

    ## find the most similar items
    n = 3
    # Positional access regardless of whether label_list is a list or a
    # pandas Series with an arbitrary index.
    labels = np.asarray(label_list)
    similarity_dict = {}
    for i in range(len(labels)):
        ranked = similarity[i].argsort()[::-1]
        # Exclude the item itself explicitly instead of assuming it is ranked
        # first: ties or an all-zero vector can put it anywhere in the order.
        neighbours = [j for j in ranked if j != i][:n - 1]
        similarity_dict[labels[i]] = labels[np.asarray(neighbours, dtype=int)].tolist()
    return similarity_dict

In [None]:
# User Input

In [None]:
# Similar-neighborhood lookup built from the review texts, keyed by the 'name' column
# (POS-filter path of clean_text). Read as a global by home_reccomender_part_ii.
neighborhood_dict=top_three_dictionary(df_reviews['reviews'],df_reviews['name'],posttagged = True)

In [None]:
# Search criteria. These names are also read as globals inside home_reccomender_part_ii.
minbed=1    # lower bound on the 'bed' column
maxbed=2    # upper bound on the 'bed' column
minbath=1   # lower bound for the bathroom filter
maxbath=2   # upper bound on the 'bath' column
prop_type='Residential'   # compared against the 'prop_type' column
neighborhood='Ballard'    # compared against the 'street_neighborhood' column

In [None]:
def home_reccomender_dict(minbed,maxbed,minbath,maxbath,prop_type,neighborhood,stage=1):
    """Build the similar-listing dictionary for homes matching the criteria.

    Rows of the global ``df_sample`` are kept when ``bed`` lies in
    ``[minbed, maxbed]``, ``bath`` lies in ``[minbath, maxbath]`` (both
    inclusive), ``prop_type`` equals ``prop_type`` and ``street_neighborhood``
    equals ``neighborhood``. The ``remarks`` of the kept rows are passed to
    ``top_three_dictionary`` with ``id`` as the label.

    Parameters
    ----------
    minbed, maxbed : number
        Inclusive bedroom range.
    minbath, maxbath : number
        Inclusive bathroom range.
    prop_type : str
        Required value of the ``prop_type`` column.
    neighborhood : str
        Required value of the ``street_neighborhood`` column.
    stage : int, default 1
        Unused; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{listing_id: [similar_listing_id, ...]}``; an empty dict when no
        listing matches.
    """
    matches = (
        df_sample['bed'].between(minbed, maxbed)
        # The lower bathroom bound used to be tested against the 'bed' column.
        & df_sample['bath'].between(minbath, maxbath)
        & (df_sample['prop_type'] == prop_type)
        & (df_sample['street_neighborhood'] == neighborhood)
    )
    df_select = df_sample.loc[matches, ['id', 'remarks']].reset_index(drop=True)
    if df_select.empty:
        # Nothing to vectorise; fitting TF-IDF on zero documents would raise.
        return {}
    return top_three_dictionary(df_select['remarks'], df_select['id'])

In [None]:
def home_reccomender_part_i(reccomender_dictionary):
    """Return the keys of ``reccomender_dictionary``."""
    candidate_homes = reccomender_dictionary.keys()
    return candidate_homes

In [None]:
def home_reccomender_part_ii(reccomender_dictionary,selected_home,neigborhood):
    """Recommend homes in the same and in similar neighborhoods.

    Parameters
    ----------
    reccomender_dictionary : dict
        Mapping from a home to its list of similar homes.
    selected_home : hashable
        Key of the chosen home in ``reccomender_dictionary``.
    neigborhood : str
        Neighborhood of the chosen home; looked up in the global
        ``neighborhood_dict`` to find the alternative neighborhoods.

    Returns
    -------
    tuple
        ``(same_neighborhood_recommendation, alt_neig_dict)`` where the first
        element is ``reccomender_dictionary[selected_home]`` and the second
        maps each alternative neighborhood to the result of
        ``home_reccomender_dict`` for it.

    Notes
    -----
    The search criteria (``minbed``, ``maxbed``, ``minbath``, ``maxbath``,
    ``prop_type``) are read from module-level globals.
    """
    same_neighborhood_recommendation = reccomender_dictionary[selected_home]
    alt_neig_dict = {}
    # Use the argument, not the global ``neighborhood`` (the parameter name is
    # spelled differently, so the argument used to be silently ignored).
    # Looping also copes with an entry that holds fewer than two alternatives.
    for alt_neig in neighborhood_dict[neigborhood]:
        alt_neig_dict[alt_neig] = home_reccomender_dict(
            minbed, maxbed, minbath, maxbath, prop_type, alt_neig)
    return (same_neighborhood_recommendation, alt_neig_dict)

In [None]:
# Similar-listing dictionary for the search criteria defined above.
mydict = home_reccomender_dict(minbed,maxbed,minbath,maxbath,prop_type,neighborhood)

In [None]:
# Step 1: show the keys of mydict (the candidate homes).
home_reccomender_part_i(mydict)

In [None]:
# Step 2: recommendations for home 240833 plus homes in the neighborhoods similar to 'Ballard'.
home_reccomender_part_ii(mydict,240833,'Ballard')

In [None]:
# Stop here. Tasks still to do:
# - add a comparison of the selected home vs. the homes from the recommended neighborhoods
# - carry this to Atom

In [None]:
def top_three_dictionary_old(text_list,label_list):
    '''
    Earlier single-function version of top_three_dictionary: cleaning,
    vectorising and the similarity search are all done inline.

    input:
    text_list: texts, indexed as text_list[0] .. text_list[len-1]
    label_list: labels aligned with text_list; indexed with a list of
                positions and read through `.values`
                (presumably a pandas Series with a default index -- verify against callers)
    output:
    dictionary of key = text label, value = list of the labels ranked 2nd and
    3rd by cosine similarity (n - 1 = 2 labels; the top-ranked entry, expected
    to be the item itself, is skipped)
    '''
    # Remove punctuation and digit-bearing tokens. Keeping only nouns and
    # adjectives is disabled: the pos_tag line below is commented out.
    punctuations_=set(string.punctuation)
    textreview=[]
    for i in range(len(text_list)):
        #review_=' '.join([t[0] for t in pos_tag(text_list[i].split())if t[1].startswith(('JJ','NN'))])\
        review_=text_list[i].replace('\\n','').replace(' ba ',' bathroom ').replace(' bdrm ', ' bedroom ')\
        .replace(' br ', ' bedroom ').replace( ' appls ', ' appliances ').replace(' appls ', ' appliances ')\
        .replace(' bdr ', ' bedroom ').replace(' flrs ', ' floors ').replace(' flr ', ' floor ')
        
        # Drop punctuation characters, then drop every token containing a digit.
        review = ''.join(word for word in review_ if word not in punctuations_)
        textreview.append(' '.join(s for s in review.split() if not any(c.isdigit() for c in s)))
    
    ## stemmed vectorizer (fixed n-gram range: nmin..nmax)
    nmin=2
    nmax=2
    Stemmed_Vectorizer=StemmedTfidfVectorizer(stop_words=stopwords_,ngram_range=(nmin, nmax))
    Stemmed_Vectors=Stemmed_Vectorizer.fit_transform(textreview)
    Stemmed_Review_Vectors=Stemmed_Vectors.toarray()
    
    ## most similar items: positions 1 .. n-1 of the descending ranking
    n=3 
    similatiry_dict={}
    for i in range(len(label_list)):
        # Similarity of item i against every item (including itself).
        cos_sim = cosine_similarity(Stemmed_Review_Vectors[i:(i+1)], Stemmed_Review_Vectors)
        # Descending order; [1:n] skips the first entry and keeps the next n-1.
        order = list(cos_sim.argsort()[0][::-1][1:n])
        top_three = label_list[order]
        similatiry_dict[label_list[i]]=top_three.values.tolist()
    return similatiry_dict

In [None]:
## check neighborhoods based on reviews

In [None]:
# Neighborhood-to-similar-neighborhoods mapping built from the review texts (POS-filter path).
similarity_dict=top_three_dictionary(df_reviews['reviews'],df_reviews['name'],posttagged = True)

In [None]:
#similarity_dict

In [None]:
# Find homes 1-2 bedroom 1-2 bathroom in Ballard

In [None]:
# Search criteria used by the df_select filter in the next cell.
minbed=1    # lower bound on the 'bed' column
maxbed=2    # upper bound on the 'bed' column
minbath=1   # lower bound for the bathroom filter
maxbath=2   # upper bound on the 'bath' column
prop_type='Residential'   # compared against the 'prop_type' column
neighborhood='Ballard'    # compared against the 'street_neighborhood' column

In [None]:
# Listings that satisfy the bed/bath ranges, property type and neighborhood above;
# only the 'id' and 'remarks' columns are kept and the index is reset to 0..n-1.
# The lower bathroom bound is checked on 'bath' (it used to be checked on 'bed').
df_select = df_sample.loc[
    df_sample['bed'].between(minbed, maxbed)
    & df_sample['bath'].between(minbath, maxbath)
    & (df_sample['prop_type'] == prop_type)
    & (df_sample['street_neighborhood'] == neighborhood),
    ['id', 'remarks'],
].reset_index(drop=True)

In [None]:
# Display the selected listings ('id' and 'remarks').
df_select

In [None]:
# Listing id -> ids of the most similar listings, based on the 'remarks' text.
listing_similarity_dict=top_three_dictionary(df_select['remarks'],df_select['id'])

In [None]:
# Display the listing similarity mapping.
listing_similarity_dict

In [None]:
# Show selected attributes of three listings, picked by id.
df_sample.loc[
    df_sample['id'].isin([5558, 240833, 264174]),
    ['id', 'bed', 'bath', 'sqft', 'address', 'latitude', 'longitude',
     'price', 'selling_price', 'mls_prop_type'],
]

In [None]:
# check neighborhoods based on listing descriptions

In [None]:
# Distinct values of 'street_neighborhood', converted to plain strings.
# Built as a real list: on Python 3 ``map`` returns a one-shot iterator that
# the loop over label_list would exhaust before the DataFrame is built from it.
label_list = [str(label) for label in np.unique(df_sample.street_neighborhood.values)]

In [None]:
# One document per neighborhood label: all 'remarks' of its listings combined.
# Remarks are joined with a space so the last word of one remark does not fuse
# with the first word of the next (which would create bogus tokens).
remarks_list = []
for label in label_list:
    remarks_list.append(' '.join(
            map(str, df_sample[df_sample['street_neighborhood']==label]['remarks'].values)
    ))

In [None]:
# Two-column table pairing each neighborhood label with its combined remarks text.
df_remarks = pd.DataFrame(
    {'label': label_list,
     'remarks': remarks_list
    })

In [None]:
# Cast the 'label' column to str.
df_remarks['label']=df_remarks['label'].astype('str')

In [None]:
# Cast the 'remarks' column to str.
df_remarks['remarks']=df_remarks['remarks'].astype('str')

In [None]:
# Neighborhood label -> most similar neighborhood labels, based on the combined listing remarks.
similarity_dict_list=top_three_dictionary(df_remarks['remarks'],df_remarks['label'])

In [None]:
# Look up the entry stored under the 'Atlantic' label.
similarity_dict_list['Atlantic']

In [None]:
## did some search below this