# Homework 3 - ADM

## Import the Libraries

In [1]:
import csv
import nltk
from nltk.corpus import stopwords # import stopwords 
from string import punctuation # import punctuations
from nltk.corpus import wordnet # to check if a word is english
from nltk.stem import PorterStemmer #Porter stemming algorithm to remove and replace well-known suffixes of English words
from collections import defaultdict
import pandas as pd
import math #to do the log() inside the tf-idf function
import numpy as np # to do cosine similarity
import heapq # to do the max heap for the queue
from heapq import heappush
import datetime # for the fourth parameter in the Step 4
import time # for the fourth parameter in the Step 4
import folium # for the map
from geopy import distance # for the map

## Step 2: Read the csv file and create a new tsv file for each record

This part contains the preprocess of the documents by:

- Removing stopwords
- Removing punctuation
- Stemming
- Anything else it's needed

And when each correct document is cleaned we save into separate .tsv file

In [2]:
nltk.download('wordnet') # download the english words
nltk.download("stopwords") # download the stopwords

# create a set of english stopwords and punctuations 
stop_words = set(stopwords.words('english')+list(punctuation)) 

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Clean part 1 - Cleaning function
This function takes in input a string and return the string without:

punctuation, non english words, stopwords and do the stemming process

In [3]:
def cleaning_function(unclnd_string):
    stemmer = PorterStemmer() # instantiate the PorterStemmer class
    clnd_string = ""  # create an empty string 
    
    for single_word in str(unclnd_string).split(): # read every feature word by word
        
        ##### PUNCTUATION CHECK 
        # the variable "wrong_char" is a set of the punctuation in "single_word"
        wrong_char = set(single_word).intersection(punctuation)
        # for loop for every wrong charachter contained in "single_word"
        for wrg in wrong_char:
            single_word = single_word.replace( wrg, "" ) # remove the punctuation 
        ##### ENGLISH OR NUMBER CHECK    
        if (not wordnet.synsets(single_word)) and not (single_word.isdigit()):
            # Not an English Word
            continue # Skip the word
        ##### STEMMING       
        single_word = stemmer.stem(single_word) # remove affixes from a word
        ##### STOPWORD 
        if (single_word not in stop_words) and (single_word != ''): # remove the word if it is a stopword 
            clnd_string = (clnd_string + " " + single_word.lower())  # append words in the string created earlier 
            
    clnd_string = clnd_string[1:] # remove the final space
        
    return(clnd_string)

#### Example


In [4]:
cleaning_function("$100")

'100'

In [5]:
cleaning_function("near by: 6/8 miles away. \\Airport: DFWLOVE.")

'near 68 mile away airport'

### Clean part 2 - Check the position of the features
As the data description shows, the order of the data for each row is:

row number, average rate per night, bedrooms count, city, date of listing, description, latitude, longitude, title, url

In this section we want erase all the reviews that don't respect this order

### Save new documents
After the clean is done we can save each file in a different .tsv file

In [6]:
N = 0 # the number of documents we store after the cleaning. We need this value after to compute the tf-idf's

with open('Airbnb_Texas_Rentals.csv', 'r', encoding="utf8") as csv_data:
    data_reader = csv.reader(csv_data, delimiter=',')
    for row in data_reader:
        try:
            # here we clean: average rate per night - title - description
            clnd_row_rate = cleaning_function( row[1] )
            clnd_row_title = cleaning_function( row[8] )
            clnd_row_descr = cleaning_function( row[5] )

            # check if the second element (average rate per night) cleaned is an integer
            clnd_row_rate == int(clnd_row_rate)
            # check if the third element (number of bedroom) is an integer
            row[2] == int(row[2])
            # check if the seventh end eigtht element (coordinates) are floating number
            row[6] == float(row[6])
            row[7] == float(row[7])

            # If everything go right i can store new checked and cleanead document
            new_line = ("\t").join( [ str(row[0]) , clnd_row_rate , str(row[2]) , row[3] , (row[4]), 
                                     clnd_row_descr, str(row[6]), str(row[7]), clnd_row_title, row[9] ] )
            open("tsv_documents/doc_" + str(N+1) + ".tsv", "w", encoding="utf8").writelines(new_line)
            N += 1
        except:
            # if one of our check goes wrong we skip to the next row
            continue

In [7]:
# Final number of documents stored is:
N 
# We know that there are a lot of duplicates but we remove them only in the results visualization

17198

# Step 3: Search Engine
## 3.1) Conjunctive query
### Create the vocabulary¶
Create the vocabulary where the keys are the words in the Title and in the Description and the values are a list of which documents contains the key word

In [8]:
# initialize a default dictionary
vocabulary =  defaultdict(list)

for j in range(1,N):
    # open each tsv file
    with open("tsv_documents/doc_" + str(j) + ".tsv" , encoding="utf8") as csvfile:
        # is the csv reader where we specify the separator/delimeter
        new_file = csv.reader(csvfile, delimiter='\t')
        # we read only the first line because each file has one line with all the information
        row1 = next(new_file)
        # i create a list with all the word separated of the Title and the Description
        entire_line = row1[8].split(" ") + row1[5].split(" ")
        # for each word in this list i append the number of the document that contain this word
        for word in entire_line:
            if j not in vocabulary[word]:
                vocabulary[word].append(j)

### Start with the first query
Given a query the Search Engine return a list of documents (in a dataframe) that contains all the words queried

In [9]:
# create empty dataframe to visualize the results of the query
df_results_1 = pd.DataFrame(columns=["Title","Description","City","Url"])
# take in input all the word in the query
print("\nWhat is your query?")
query = input()
# we clean the query
query = cleaning_function(query).split()
while query == []:
    print("\nWhat is your query?")
    query = input()
    query = cleaning_function(query).split()

# i want as output all the indices of the documents that contain ALL the words in the query
# i want to use the function intersection of the Set

# initialize the set with the indices of the documents for the first word
set_query = set(vocabulary[query[0]])
for word_query in query:
    # then i do the intersection with all new indices of the new word
    set_query = set_query.intersection(set(vocabulary[word_query]))
# at the end "set_query" contains the indices of the documents that contain all queried words

# now i want to read "Title","Description","City","Url" for each documents that i find
for j in set_query:
    with open("tsv_documents/doc_" + str(j) + ".tsv" , encoding="utf8") as csvfile:
        new_file = csv.reader(csvfile, delimiter='\t')
        new_row = next(new_file)
        # i go back to the original document to show in the dataframe the right description
        with open('Airbnb_Texas_Rentals.csv', 'r', encoding="utf8") as csv_data:
            data_reader = csv.reader(csv_data, delimiter=',')
            row1 = [row for idx, row in enumerate(data_reader) if idx == int(new_row[0])][0]
        # create a temporary dataframe that i want to concatenate with the final
        list_to_append = [[row1[8], row1[5], row1[3], row1[9]]]
        df_temp = pd.DataFrame(list_to_append, columns=["Title","Description","City","Url"] )       
        df_results_1 = pd.concat([df_results_1, df_temp], ignore_index=True)
        
# before the visualization of the results we drop the duplicate = the reviews that have the same title and description
df_results_1 = df_results_1.drop_duplicates(subset = ["Title","Description"])

# set the option of the dataframe's columns width to display more information
pd.options.display.max_colwidth = 260
df_results_1.head()



What is your query?
home near the Airport


Unnamed: 0,Title,Description,City,Url
0,Cozy&Comfort Private Rm. Near IAH,"This home is essential for the person who needs to be minutes from the airport (8min) and downtown (15min).\nLess than (1mi) from the nearest movie cinema, bowling, and eateries.\n\nComplimentary ride from the airport upon request.",Humble,https://www.airbnb.com/rooms/10888712?location=Channelview%2C%20TX
1,5* Quality Private Room available near IAH,My place is close to the airport. You’ll love my place because of the neighborhood and space in the room.\nFirst time Airbnb guests are always welcomed.\nPLEASE NOTE: All guests MUST BE verified Airbnb members with a visible profile picture before booking....,Humble,https://www.airbnb.com/rooms/13847391?location=Atascocita%2C%20TX
2,Family Home near Austin Airport,"Spacious home with wood floors, and fire place. Kitchen has full available appliances and living area has recliner, LED TV, local channels, &amp; Wifi all over the house. Master bedroom has King size bed, double sink, separate shower and a garden tub.",Austin,https://www.airbnb.com/rooms/8191636?location=Cedar%20Creek%2C%20TX
3,Cozy Home like Studio Apartment Near DFW Airport,"A very cozy, small studio apartment. More images of the space will be made available tonight. Just a Heads up There is no shower! only a men and women's restroom.",Irving,https://www.airbnb.com/rooms/16275996?location=Colleyville%2C%20TX
4,Valley Ranch / Irving home,"My place is close to family-friendly activities, such as the park and library, an international airport, entertainment venues and sports centers, . You’ll love my place because of the ambiance, the convenience. It is a perfect location near almost\n everyt...",Irving,https://www.airbnb.com/rooms/14692959?location=Coppell%2C%20TX


## 3.2) Conjunctive query & Ranking score
### Function for tf_idf of a term in a document

In [10]:
def tf_idf ( term, document , D ):
    # the document is passed array like: a list of all the words
    # D is the number of document that contain the term
    
    # find the frequency
    freq = document.count(term) / len(document)
    
    # tf_idf computation
    tf_idf = math.log(N/D) * freq
    
    return tf_idf

### Cosine similarty function

In [11]:
def cosine_similarity(x, y):
    
    dot_product = np.dot(x, y)
    normX = np.linalg.norm(x)
    normY = np.linalg.norm(y)
    
    return dot_product / (normX * normY)

### Create a new vocabulary
{

term_1:[(document1, tfIdf_{term,document1}), (document2, tfIdf_{term,document2}), (document4, tfIdf_{term,document4}), ...],

term_2:[(document1, tfIdf_{term,document1}), (document3, tfIdf_{term,document3}), (document5, tfIdf_{term,document5}), ...]

...}

In [12]:
# i do the same thing of the previous vocabulary but now the values are list of tuples like this (document_id, tf_idf)
new_vocabulary = defaultdict(list) # initialize new vocabulary as dictionary

for term in vocabulary.keys(): # searching in the previous vocabulary
    for docs in vocabulary[term]: # for each document that contain "term"
        with open("tsv_documents/doc_" + str(docs) + ".tsv" , encoding="utf8") as csvfile:
                new_file = csv.reader(csvfile, delimiter='\t')
                row1 = next(new_file)
                entire_line = row1[8].split(" ") + row1[5].split(" ")
                # compute the tf idf with the function for the term in the title and description of the document
                tf_idf_temp = tf_idf( term , entire_line, len(vocabulary[term]) )
                
        # append to list of the key "term" the tuple with doc_id and the corresponding tf idf value
        new_vocabulary[term].append( (docs, tf_idf_temp) ) 

### Do the search using cosine similarity & tf_idf values
Here the user put his input: words and k, to recive the top k results similar to the words requested

In [25]:
# take in input all the word in the query
print("\nWhat is your query?")
query = input()
# we clean the query
query = cleaning_function(query).split()
# check the query cleaned is 
while query == []:
    print("\nUncorrect input. What is your query?")
    query = input()
    query = cleaning_function(query).split()
    
# how much is k?
print("\nHow many results do you want display?")
k = input()
# check k is an integer
while not k.isdigit():
    print("\nUncorrect input. How many results do you want display?")
    k = input()
    
k = int(k)

# i search the documents that have all the words querid with the old vocabulary
set_query = set(vocabulary[query[0]])
for word_query in query:
    # then i do the intersection with all new indices of the new word
    set_query = set_query.intersection(set(vocabulary[word_query]))
# at the end "set_query" contains the indices of the documents that contain all queried words

#### KIND OF OUTPUT I WANT: [ [ (doc_id_1 , tf_idf_word_1), (doc_id_1 , tf_idf_word_2) ],
#                             [ (doc_id_2 , tf_idf_word_1), (doc_id_2 , tf_idf_word_2) ],
#                             .... ]

# for each document that has all the queried word i store the tf_idfs for these words
list_tf_idf_all_words = []

# The explanation of this is done after this chunk of code

for doc in set_query:
    list_tf_idf_word_query = [doc]
    for word_query in query: 
        for all_tuples in new_vocabulary[word_query]:
            if all_tuples[0] == doc:
                list_tf_idf_word_query.append(all_tuples[1])
    list_tf_idf_all_words.append(list_tf_idf_word_query)

# at the end "list_tf_idf_all_words" is a list of lists each element correspond to a document that have all the words in the query
# To explain in detail, the i-th sublist is the tf_idfs for all the word in the i-th document
#
# Example:
#    INPUT/QUERY: IAH airport
#    SET_QUERY = {1, 54} it means that only the first and the 54-th documents have the 2 word of the query
#
#    Look inside the first document in the "set_query" (doument_id = 1)
#    now i look for the tf_idf for the first word ( IAH ):
#        new_vocabulary["IAH"] = [(1, 0.1313826867293143), (54, 0.033184286854311344), (56, 0.03878163644419519), (86, 0.03657813437350228)]
#        i append in "list_tf_idf_all_words" this list: 0.1313826867293143
#    now look for the tf_idf for the second word ( airport ):
#        new_vocabulary["airport"] = [(1, 0.05741654524000074), (41, 0.2009579083400026), (54, 0.029004234193402437), (76, 0.06698596944666753), (84, 0.10820810449077063), (95, 0.06698596944666753)]
#        i append in "list_tf_idf_all_words" this list: 0.05741654524000074
#
#    OUTPUT: list_tf_idf_all_words = [[1, 0.05741654524000074, 0.1313826867293143], [54, 0.029004234193402437, 0.033184286854311344]]
#    In the first position of each sublist i have the document_id then all the tf_idf for all the words in the query


# Now i want to create the tf_idf for the words in the query. To do this i use the query as a document
list_tf_idf_query = []
for word_query in query:
    list_tf_idf_query.append(tf_idf(word_query, query, D=1))

# cosine similarity of all the document
cosine_sim_list = []
for element in list_tf_idf_all_words:
    cosine_sim_list.append( (cosine_similarity( list_tf_idf_query, element[1:]), element[0]) )


What is your query?
home near the AIRPORT!

How many results do you want display?
20


In [26]:
# This procedure basically is the same done before

# create empty dataframe to visualize the results of the query
df_results_2 = pd.DataFrame(columns=["Doc ID", "Title", "Description", "City", "Url", "Similarity"])
for elem in cosine_sim_list:    
    with open("tsv_documents/doc_" + str(elem[1]) + ".tsv" , encoding="utf8") as csvfile:
        new_file = csv.reader(csvfile, delimiter='\t')
        new_row = next(new_file)
        
        with open('Airbnb_Texas_Rentals.csv', 'r', encoding="utf8") as csv_data:
            data_reader = csv.reader(csv_data, delimiter=',')
            row1 = [row for idx, row in enumerate(data_reader) if idx == int(new_row[0])][0]
        
        # create a temporary dataframe that i want to concatenate with the final
        list_to_append = [[elem[1],row1[8], row1[5], row1[3], row1[9], elem[0]]]
        df_temp = pd.DataFrame(list_to_append, columns=["Doc ID", "Title", "Description", "City", "Url", "Similarity"] )       
        df_results_2 = pd.concat([df_results_2, df_temp], ignore_index=True)
        
df_results_2 = df_results_2.drop_duplicates(subset = ["Title", "Description"])  

# Store all the tuple in a max heap structure for the top k similar results
# ATTENTION! the max is evaluated on the first element of the tuples
maxheap = []
for i in df_results_2[["Doc ID", "Similarity"]].iterrows():
    heappush(maxheap, (i[1][1], i[1][0]))
heapq._heapify_max(maxheap)

list_of_top_k = []
for i in range(k):
    # Now i take the first element (the root) and pop it for k times
    list_of_top_k.append(heapq._heappop_max(maxheap)[1])
    if maxheap == []:
        break

df_results_2.set_index("Doc ID", inplace=True)
df_results_2.loc[list_of_top_k]

Unnamed: 0_level_0,Title,Description,City,Url,Similarity
Doc ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6020,Spacious two story home near IAH,"Our red brick, cool, quiet home is minutes away from Bush Intercontinental Airport. And a hop, skip away to the major freeway, 59. With a great selection of stores and restaurants around the corner. Only 15-20 minutes from Downtown Houston.",Humble,https://www.airbnb.com/rooms/4227275?location=Cleveland%2C%20TX,0.998928
16306,Home near Galveston Beach and other attractions,Nice 4 Bedroom 2 Bath home in a great location close to freeways. \n5 min to Tanger Outlet Mall\n15 min from Galveston Beach/Schlitterbahn/Moody Gardens / Texas City Dike\n18 minutes to NASA\n23 min Kema Boardwalk\n35 min Houston Hobby Airport\n50 min to H...,Texas City,https://www.airbnb.com/rooms/18076465?location=Bayou%20Vista%2C%20TX,0.998928
7732,Centrally Located Family home,"New home in quiet, safe neighborhood near shopping, restaurants, Field of Dreams baseball fields, Hawaiian Falls, Six Flags, TX Rangers, Dallas Cowboys, &amp; DFW airport. Fort Worth and Dallas are each about 20 minutes away. Has community pool.",Arlington,https://www.airbnb.com/rooms/7332566?location=Cedar%20Hill%2C%20TX,0.998928
6187,Nice Vacation Home with Pool Near Bush Airport,This is a very nice family oriented home. Great for vacation getaways. It has an enormous pool in the backyard starting from 4ft to 9ft. It also consist of a cabana area whereas you can sit and lounge a party of 8 people or more. The entire house can be us...,Spring,https://www.airbnb.com/rooms/16269086?location=Cleveland%2C%20TX,0.998928
5859,Copper Cabin,Copper Cabin is a relaxing get away destination located 10 miles from the Abilene Airport and the Taylor County Expo Center off highway 36. Large front and back porches welcome you to all the comforts of home away from home. Wedding venues located near the...,Abilene,https://www.airbnb.com/rooms/17773872?location=Cisco%2C%20TX,0.998928
5372,Amazing 4 Bedroom Home near Addison,"Large and open space, nice and clean one story home in quiet neighborhood. close to shopping centers, moving theater, restaurants. 2.5 miles from Addition. 20 minutes from DFW airport, easy to get on highway I-35 and The President George Bush HWY.",Carrollton,https://www.airbnb.com/rooms/9295130?location=Carrollton%2C%20TX,0.998928
48,Beautiful 4 Br/4 Ba Home Near Major Highways,My home is in a very family oriented neighborhood with access to major highways (2 miles to 75 and 4 miles to 121); 45 minutes to DFW and Dallas Love airports and 45 minutes to downtown Dallas. Enjoy an amazing range of shopping within 10-20 minutes - Prem...,Allen,https://www.airbnb.com/rooms/9793383?location=Celina%2C%20TX,0.998928
17100,Home away from Home.,"My best friend is my co-host &amp; she runs it. Reasonably priced lower than a hotel night. This will be a 5 STAR experience for you. Sleep in late, checkout at noon. Clean &amp; comfy private room and restroom for a night in Houston. 20 minutes from IAH a...",Humble,https://www.airbnb.com/rooms/18997638?location=Atascocita%2C%20TX,0.998928
15130,"Vintage Smart Home, near Downtown-Solar Powered","Vintage Solar Powered Smart Home central to EVERYTHING! 3 Bedrooms, 2.5 baths. Walking distance to restaurants, local cafes, hip antique shops, 7-10 min drive to Downtown, Riverwalk, 13 min to the Pearl &amp; 14 min to the airport. Chris Madrid's, restaura...",San Antonio,https://www.airbnb.com/rooms/14870808?location=Alamo%20Heights%2C%20TX,0.998928
7693,Family Home near Austin Airport,"Spacious home with wood floors, and fire place. Kitchen has full available appliances and living area has recliner, LED TV, local channels, &amp; Wifi all over the house. Master bedroom has King size bed, double sink, separate shower and a garden tub.",Austin,https://www.airbnb.com/rooms/8191636?location=Cedar%20Creek%2C%20TX,0.998928


### Consideration
We can see that generally the top results have a equal Similarity score. This could happens because the growth of logarithm it isn't really quick with great number ( and we use N ~ 17000 ). 

In this we can't distingue what's the best of the bests. However in the next step we try to define a new score to have a better division!

# Step 4: Define a new score!

- **First Parameter**
In this part we can search to the results that don't contain mandatorily ALL the words in the query. So we look for the more correlated

- **Second Parameter**
We search the results with the same number of bedroom conuts

- **Third parameter**
We give a better score to the house that cost lesser than the prefered amount of the user (we could combine the average per night for each person dividing average amount by number of bedroom

- **Fourth parameter**
We give more relevance to the latest reviews

- **Fifth parameter**
Use the coordinates to see near points of interest

_N.B._
All the scores we compute is among 0 and 1. Because in this way we have more uniform weight as possible, for the different parameters.

### Results' dictionary
We store the results for each request in a dictionary like this:

**KEY** = Document id : **VALUE** = List of all the scores for each parameter

redult_dict = {

doc_id_1: [ score_of_the_similarity_with_the_query,

            score_of_bedroom_count, 

            score_of_amount, 

            score_of_review_date, 

            score_for_the point_of_interest ], 

doc_id_2 : ... }

### First Parameter
The first parameter is the cosine similarity done previously

In [28]:
####### This is the same code done before. For any explanation looks the previous cells

# take in input all the word in the query
print("\nWhat is your query?")
query = input()
# we clean the query
query = cleaning_function(query).split()
# check the query cleaned is 
while query == []:
    print("\nUncorrect input. What is your query?")
    query = input()
    query = cleaning_function(query).split()

set_query = set(vocabulary[query[0]])
for word_query in query:
    set_query = set_query.intersection(set(vocabulary[word_query]))
list_tf_idf_all_words = []
for doc in set_query:
    list_tf_idf_word_query = [doc]
    for word_query in query: 
        for all_tuples in new_vocabulary[word_query]:
            if all_tuples[0] == doc:
                list_tf_idf_word_query.append(all_tuples[1])
    list_tf_idf_all_words.append(list_tf_idf_word_query)
    
list_tf_idf_query = []
for word_query in query:
    list_tf_idf_query.append(tf_idf(word_query, query, len(vocabulary[term]) + 1))
    
cosine_sim_list = []
for element in list_tf_idf_all_words:
    cosine_sim_list.append( (cosine_similarity( list_tf_idf_query, element[1:]), element[0]) )

df_results = pd.DataFrame(columns=["Doc ID", "Similarity"])
for elem in cosine_sim_list:    
    with open("tsv_documents/doc_" + str(elem[1]) + ".tsv" , encoding="utf8") as csvfile:
        new_file = csv.reader(csvfile, delimiter='\t')
        row1 = next(new_file)       
        list_to_append = [[elem[1], elem[0]]]
        df_temp = pd.DataFrame(list_to_append, columns=["Doc ID", "Similarity"] )       
        df_results = pd.concat([df_results, df_temp], ignore_index=True)

# Inizialize the final dataframe with all the scores        
df_results_score = pd.DataFrame(columns=["doc ID", "Similarity Score", "Bedroom count Score", 
                                         "Average per night Score", "Date review Score", 
                                         "Point of interest Score","Final Score"])
df_results_score["doc ID"] = df_results["Doc ID"]
df_results_score["Similarity Score"] = df_results["Similarity"]
df_results_score.set_index("doc ID", inplace=True)
df_results_score.head()



What is your query?
home near the airport


Unnamed: 0_level_0,Similarity Score,Bedroom count Score,Average per night Score,Date review Score,Point of interest Score,Final Score
doc ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3078,0.871576,,,,,
13834,0.953668,,,,,
7693,0.998928,,,,,
4112,0.953668,,,,,
5650,0.953668,,,,,


### Second Parameter
For now we consider the documents that contain all the words in the query. When the first parameter is done we use those documents.

__SCORE__:

- if the number is the same of the query ---> score_bedroom = 1
- if the number is equal +- 1 ---> score_bedroom = 1/2
- if the number is equal +- 2 ---> score_bedroom = 1/4
- ...

In general if i = abs( number of bedroom queried - number of bedroom in the document ) then score_bedroom = 2**(-i)

In [29]:
print("How many bedroom do you want?\n")
bedroom_queried = int(input())

for j in set_query:
    with open("tsv_documents/doc_" + str(j) + ".tsv" , encoding="utf8") as csvfile:
        new_file = csv.reader(csvfile, delimiter='\t')
        bedroom_count = next(new_file)[2] 
        difference = abs( bedroom_queried - int(bedroom_count) )
        df_results_score["Bedroom count Score"][j] = 2**(-difference)
        
df_results_score.head()

How many bedroom do you want?

2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,Similarity Score,Bedroom count Score,Average per night Score,Date review Score,Point of interest Score,Final Score
doc ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3078,0.871576,0.5,,,,
13834,0.953668,0.5,,,,
7693,0.998928,0.5,,,,
4112,0.953668,0.5,,,,
5650,0.953668,0.125,,,,


### Third Parameter
__SCORE:__

- if number of the amount in the document is the same of the query ---> score_amount = 1/2
- if the difference is among 1 and 4 ---> score_amount = 1/4
- if the difference is among -1 and -4 ---> score_amount = 1 - 1/4 = 3/4
- if the difference is among 5 and 9 ---> score_amount = 1/8
- if the difference is among -5 and -9 ---> score_amount = 1 - 1/8 = 7/8
- ...

In general if difference = ( amount in the document - amount queried ), then score_amount is defined by a negative exponential with base equal to 2 and argument the difference in modulo 5. 

There are other coefficient but these are present for programming reasons.

In [30]:
def amount_score( difference ):
    if difference > 0:
        return 2**(- ( difference // 5 + 2 ) )
    elif difference < 0:
        return 1 - 2**(((difference-1) // 5) - 1)
    else:
        return 2**(- ( difference // 5 + 1) )

In [31]:
print("What is the amount per night you want to spend more or less?\n")
amount_queried = int(input())

for j in set_query:
    with open("tsv_documents/doc_" + str(j) + ".tsv" , encoding="utf8") as csvfile:
        new_file = csv.reader(csvfile, delimiter='\t')
        average_doc = next(new_file)[1]
        difference = ( int(average_doc) - amount_queried )
        df_results_score["Average per night Score"][j] = amount_score(difference)
        
df_results_score.head()

What is the amount per night you want to spend more or less?

100


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,Similarity Score,Bedroom count Score,Average per night Score,Date review Score,Point of interest Score,Final Score
doc ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3078,0.871576,0.5,0.996094,,,
13834,0.953668,0.5,0.998047,,,
7693,0.998928,0.5,0.0078125,,,
4112,0.953668,0.5,0.996094,,,
5650,0.953668,0.125,1.19209e-07,,,


### Fourth parameter
We want to give more relevance to the recent reviews, because in general these are more similar to the actual status of the home.

__SCORE:__

We give an exponential weight where the independent variable is the difference in semester

i.e. to explain the score function we assume as current month november 2018.

- if the month of the review is june, july, august, september, october, november 2018 ---> score_date = 1
- if the month of the review is december 2017 or january, february, march, april, may 2018 ---> score_date = 1/2
- if the month of the review is june, july, august, september, october, november 2017 ---> score_date = 1/4
- ...

In general if difference = how many semesters the two dates are distant, then the parameter score is a negative exponential with base equal to 2 and argument the difference.

In [32]:
# Here we don't need any input
date_now = datetime.datetime.strptime( time.strftime("%B %Y"), "%B %Y") 

for j in set_query:
    with open("tsv_documents/doc_" + str(j) + ".tsv" , encoding="utf8") as csvfile:
        new_file = csv.reader(csvfile, delimiter='\t')
        date_string = next(new_file)[4]
        try:
            date_doc = datetime.datetime.strptime(date_string, "%B %Y")
        except:
            date_doc = datetime.datetime.strptime("january 1900", "%B %Y")
            
        difference = ((date_now.year - date_doc.year) * 12 + date_now.month - date_doc.month)//6
        df_results_score["Date review Score"][j] = 2**(-difference)
        
df_results_score.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,Similarity Score,Bedroom count Score,Average per night Score,Date review Score,Point of interest Score,Final Score
doc ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3078,0.871576,0.5,0.996094,0.03125,,
13834,0.953668,0.5,0.998047,0.00195312,,
7693,0.998928,0.5,0.0078125,0.015625,,
4112,0.953668,0.5,0.996094,0.125,,
5650,0.953668,0.125,1.19209e-07,0.0625,,


### Fifth Parameter
According to this Wikipedia page __[List of National Historic Landmarks in Texas](https://en.wikipedia.org/wiki/List_of_National_Historic_Landmarks_in_Texas)__, we consider the points of interest in Texas and using the coordinates we can measure the distance between each point of interest and a house in a review.

In this way we give a "point of interest score" for each house and this calculation is independent with the query so we compute and store previously.

SCORE:

Let be
- D = total number of points of interest 
- N = number of points of interest, in a radius of 100 miles, from the house location

Then Point of interest score = N/D * 3 ( we multiply by 3 because the maximum score is 0.33333 )

### The Dictionary of the Points of Interest

In [33]:
# Create a dict for each point of interest
POI_dict = { 
    "Alamo" : (29.426058, -98.486084),
    "Apollo Mission Control Center" : (29.556471, -95.08846),
    "Bastrop State Park" : (30.110833, -97.273611),
    "USS Cabot" : (25.930278, -97.484444),
    "Dealey Plaza Historic District" : (32.778611, -96.808333),
    "East End Historic District" : (29.304444, -94.782778),
    "ELISSA (Bark)" : (29.333255, -94.777452),
    "Espada Aqueduct" : (29.332523, -98.461469),
    "Fair Park Texas Centennial Buildings" : (32.781944, -96.765556),
    "Fort Belknap" : (33.150775, -98.741211),
    "Fort Brown" : (25.898333, -97.492222),
    "Fort Concho" : (31.452778, -100.429167),
    "Fort Davis" : (30.595833, -103.925833),
    "Fort Richardson" : (33.208056, -98.164722),
    "Fort Sam Houston" : (29.476255, -98.43083),
    "John Nance Garner House" : (29.212152, -99.791837),
    "Governor's Mansion" : (30.272318, -97.742708),
    "HA. 19 (Midget Submarine)" : (30.272222, -98.868333),
    "Hangar 9, Brooks Air Force Base" : (29.342129, -98.443645),
    "Harrell Site" : (33.004444, -98.668333),
    "Highland Park Shopping Village" : (32.835833, -96.805556),
    "J A Ranch" : (34.816667, -101.188056),
    "Lyndon Baines Johnson Boyhood Home" : (30.240833, -98.624167),
    "King Ranch" : (27.518611, -97.916944),
    "Landergin Mesa" : (35.245556, -102.425),
    "USS Lexington" : (27.815, -97.388611),
    "Lubbock Lake Site" : (33.621944, -101.889722),
    "Lucas Gusher, Spindletop Oil Field" : (30.019167, -94.073889),
    "Majestic Theatre" : (29.42646, -98.490713),
    "Mission Concepcion" : (29.390888, -98.49276),
    "Jose Antonio Navarro House Complex" : (29.422778, -98.496944),
    "Palmito Ranch Battlefield" : (25.946667, -97.285278),
    "Palo Alto Battlefield" : (26.021389, -97.480556),
    "Plainview Site" : (34.191111, -101.718889),
    "Walter C. Porter Farm" : (32.777778, -96.274444),
    "Presidio Nuestra Senora De Loreto De La Bahia" : (28.646667, -97.381667),
    "Randolph Field Historic District" : (29.532222, -98.28),
    "Samuel T. Rayburn House" : (33.567967, -96.207174),
    "Resaca De La Palma Battlefield" : (25.9375, -97.486111),
    "Roma Historic District" : (26.406111, -99.018056),
    "San Jacinto Battlefield" : (29.748889, -95.080278),
    "Space Environment Simulation Laboratory, Chambers A and B" : (29.559003, -95.0881),
    "Spanish Governor's Palace" : (29.425082, -98.49457),
    "Strand Historic District" : (29.306389, -94.793611),
    "USS TEXAS" : (29.754217, -95.089499),
    "Texas State Capitol" : (30.272734, -97.741078),
    "Trevino-Uribe Rancho" : (27.045, -99.443333),
    "Woodland" : (30.714722, -95.552778)
}

### We make give a value to each house checking how many points of interest are in the radius of 100 miles
This is really really expensive in terms of time but it is independent of the query. So it's already done when the user inputs the query. Moreover this dataframe is also used in the bonus part.

In [34]:
# create empty dataframe with all the latitudes and longitudes
lat_long_df = pd.DataFrame(columns=["doc_id","Latitude", "Longitude", "Near Point of Interest", "POI_score"])
D = len(POI_dict)

for i in range(1,N):
    with open("tsv_documents/doc_" + str(i) + ".tsv" , encoding="utf8") as csvfile:
        new_file = csv.reader(csvfile, delimiter='\t')
        row1 = next(new_file)
        # create a temporary dataframe that i want to concatenate with the final
               
        list_POI = []
        for v in POI_dict:
            if row1[6] != 'NA' and row1[7] !='NA':
                diss = distance.distance( (row1[6], row1[7]) , POI_dict[v] ).miles
                if diss <= 100:
                    list_POI.append(v)
        
        list_to_append = [[i, row1[6], row1[7], list_POI, len(list_POI)/D * 3]]
        df_temp = pd.DataFrame(list_to_append, columns=["doc_id","Latitude", "Longitude", "Near Point of Interest", "POI_score"] )
        lat_long_df = pd.concat([lat_long_df, df_temp], ignore_index=True)
lat_long_df.set_index("doc_id", inplace=True)

#### Example

In [35]:
lat_long_df.head()

Unnamed: 0_level_0,Latitude,Longitude,Near Point of Interest,POI_score
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,30.0201379199512,-95.2939960042513,"[Apollo Mission Control Center, East End Historic District, ELISSA (Bark), Lucas Gusher, Spindletop Oil Field, San Jacinto Battlefield, Space Environment Simulation Laboratory, Chambers A and B, Strand Historic District, USS TEXAS, Woodland]",0.5625
2,29.5030676756061,-98.4476879378504,"[Alamo, Bastrop State Park, Espada Aqueduct, Fort Sam Houston, John Nance Garner House, Governor's Mansion, HA. 19 (Midget Submarine), Hangar 9, Brooks Air Force Base, Lyndon Baines Johnson Boyhood Home, Majestic Theatre, Mission Concepcion, Jose Antonio N...",1.0
3,29.8293522272149,-95.0815494887563,"[Apollo Mission Control Center, East End Historic District, ELISSA (Bark), Lucas Gusher, Spindletop Oil Field, San Jacinto Battlefield, Space Environment Simulation Laboratory, Chambers A and B, Strand Historic District, USS TEXAS, Woodland]",0.5625
4,30.6373042787676,-96.3378459729631,"[Bastrop State Park, Governor's Mansion, San Jacinto Battlefield, USS TEXAS, Texas State Capitol, Woodland]",0.375
5,32.7470973543511,-97.2864343970125,"[Dealey Plaza Historic District, Fair Park Texas Centennial Buildings, Fort Belknap, Fort Richardson, Harrell Site, Highland Park Shopping Village, Walter C. Porter Farm, Samuel T. Rayburn House]",0.5


### Add the fifth Parameter to the Score dataframe

In [36]:
for j in set_query:
    df_results_score["Point of interest Score"][j] = lat_long_df["POI_score"][j]
df_results_score.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,Similarity Score,Bedroom count Score,Average per night Score,Date review Score,Point of interest Score,Final Score
doc ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3078,0.871576,0.5,0.996094,0.03125,0.5625,
13834,0.953668,0.5,0.998047,0.00195312,0.5625,
7693,0.998928,0.5,0.0078125,0.015625,0.875,
4112,0.953668,0.5,0.996094,0.125,0.4375,
5650,0.953668,0.125,1.19209e-07,0.0625,0.4375,


### Compute the Final Score and Visualize the results

In [45]:
df_results_score["Final Score"] = ( df_results_score["Similarity Score"] + df_results_score["Bedroom count Score"] + 
                                    df_results_score["Average per night Score"] + df_results_score["Date review Score"] + 
                                    df_results_score["Point of interest Score"]  ) / 5
df_results_score.head()

Unnamed: 0_level_0,Similarity Score,Bedroom count Score,Average per night Score,Date review Score,Point of interest Score,Final Score
doc ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3078,0.871576,0.5,0.996094,0.03125,0.5625,0.592284
13834,0.953668,0.5,0.998047,0.00195312,0.5625,0.603234
7693,0.998928,0.5,0.0078125,0.015625,0.875,0.479473
4112,0.953668,0.5,0.996094,0.125,0.4375,0.602452
5650,0.953668,0.125,1.19209e-07,0.0625,0.4375,0.315734


In [51]:
# how much is k?
print("\nHow many results do you want display?")
k = input()
# check k is an integer
while not k.isdigit():
    print("\nUncorrect input. How many results do you want display?")
    k = input()
    
k = int(k)

# create empty dataframe to visualize the results of the query
columns = ["Doc ID", "Title", "Number of Bedrooms", "Average Rate per Night", "City", "Description", "Near Point of Interest", "Url"]
df_results_3 = pd.DataFrame(columns=columns)
for elem in list(df_results_score.index):    
    with open("tsv_documents/doc_" + str(elem) + ".tsv" , encoding="utf8") as csvfile:
        new_file = csv.reader(csvfile, delimiter='\t')
        new_row = next(new_file)
   
        with open('Airbnb_Texas_Rentals.csv', 'r', encoding="utf8") as csv_data:
            data_reader = csv.reader(csv_data, delimiter=',')
            row1 = [row for idx, row in enumerate(data_reader) if idx == int(new_row[0])][0]
        
        # create a temporary dataframe that i want to concatenate with the final
        list_to_append = [[elem, row1[8], row1[2], row1[1], row1[3], row1[5], lat_long_df["Near Point of Interest"][elem], row1[9]]]
        df_temp = pd.DataFrame(list_to_append, columns=columns )       
        df_results_3 = pd.concat([df_results_3, df_temp], ignore_index=True)

maxheap = []
for i in df_results_score["Similarity Score"].iteritems():
    heappush(maxheap, (i[1], i[0]))
heapq._heapify_max(maxheap)

list_of_top_k = []
for i in range(k):
    # Now i take the first element (the root) and pop it for k times
    list_of_top_k.append(heapq._heappop_max(maxheap)[1])
    if maxheap == []:
        break

df_results_3.set_index("Doc ID", inplace=True)
df_results_3.loc[list_of_top_k]


How many results do you want display?
10


Unnamed: 0_level_0,Title,Number of Bedrooms,Average Rate per Night,City,Description,Near Point of Interest,Url
Doc ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
17010,Home in Denton 27 min. N of airport,1,$60,Corinth,"Want to get outside of the city after working hard or shopping hard? This is located in a Oakmont Estates, a Golf Course Community. Tennis courts, pool and Clubhouse about 20 minutes from Dallas Galleria Mall. Beautiful quiet neighborhood! Close to Lake Le...","[Dealey Plaza Historic District, Fair Park Texas Centennial Buildings, Fort Belknap, Fort Richardson, Harrell Site, Highland Park Shopping Village, Walter C. Porter Farm, Samuel T. Rayburn House]",https://www.airbnb.com/rooms/635415?location=Argyle%2C%20TX
16306,Home near Galveston Beach and other attractions,4,$97,Texas City,Nice 4 Bedroom 2 Bath home in a great location close to freeways. \n5 min to Tanger Outlet Mall\n15 min from Galveston Beach/Schlitterbahn/Moody Gardens / Texas City Dike\n18 minutes to NASA\n23 min Kema Boardwalk\n35 min Houston Hobby Airport\n50 min to H...,"[Apollo Mission Control Center, East End Historic District, ELISSA (Bark), Lucas Gusher, Spindletop Oil Field, San Jacinto Battlefield, Space Environment Simulation Laboratory, Chambers A and B, Strand Historic District, USS TEXAS, Woodland]",https://www.airbnb.com/rooms/18076465?location=Bayou%20Vista%2C%20TX
16202,Centrally Located Family home,3,$200,Arlington,"New home in quiet, safe neighborhood near shopping, restaurants, Field of Dreams baseball fields, Hawaiian Falls, Six Flags, TX Rangers, Dallas Cowboys, &amp; DFW airport. Fort Worth and Dallas are each about 20 minutes away. Has community pool.","[Dealey Plaza Historic District, Fair Park Texas Centennial Buildings, Fort Richardson, Harrell Site, Highland Park Shopping Village, Walter C. Porter Farm, Samuel T. Rayburn House]",https://www.airbnb.com/rooms/7332566?location=Arlington%2C%20TX
13691,Beautiful 4 Br/4 Ba Home Near Major Highways,4,$300,Allen,My home is in a very family oriented neighborhood with access to major highways (2 miles to 75 and 4 miles to 121); 45 minutes to DFW and Dallas Love airports and 45 minutes to downtown Dallas. Enjoy an amazing range of shopping within 10-20 minutes - Prem...,"[Dealey Plaza Historic District, Fair Park Texas Centennial Buildings, Fort Richardson, Highland Park Shopping Village, Walter C. Porter Farm, Samuel T. Rayburn House]",https://www.airbnb.com/rooms/9793383?location=Allen%2C%20TX
12228,Copper Cabin,2,$200,Abilene,Copper Cabin is a relaxing get away destination located 10 miles from the Abilene Airport and the Taylor County Expo Center off highway 36. Large front and back porches welcome you to all the comforts of home away from home. Wedding venues located near the...,"[Fort Belknap, Fort Concho, Harrell Site]",https://www.airbnb.com/rooms/17773872?location=Abilene%2C%20TX
7732,Centrally Located Family home,3,$200,Arlington,"New home in quiet, safe neighborhood near shopping, restaurants, Field of Dreams baseball fields, Hawaiian Falls, Six Flags, TX Rangers, Dallas Cowboys, &amp; DFW airport. Fort Worth and Dallas are each about 20 minutes away. Has community pool.","[Dealey Plaza Historic District, Fair Park Texas Centennial Buildings, Fort Richardson, Harrell Site, Highland Park Shopping Village, Walter C. Porter Farm, Samuel T. Rayburn House]",https://www.airbnb.com/rooms/7332566?location=Cedar%20Hill%2C%20TX
6373,Superbowl LI rental w/private pool!,5,$10000,Houston,"My place is close to Coveniently located near William P. Hobby airport, 610 loop and IH-45 South. . You’ll love my place because its a gorgeous 3 bedroom 2 bath home including a 2 bedroom 1 bath apartment, perfect for your stay in the Houston area. This ho...","[Apollo Mission Control Center, East End Historic District, ELISSA (Bark), Lucas Gusher, Spindletop Oil Field, San Jacinto Battlefield, Space Environment Simulation Laboratory, Chambers A and B, Strand Historic District, USS TEXAS, Woodland]",https://www.airbnb.com/rooms/15647499?location=Channelview%2C%20TX
6020,Spacious two story home near IAH,4,$167,Humble,"Our red brick, cool, quiet home is minutes away from Bush Intercontinental Airport. And a hop, skip away to the major freeway, 59. With a great selection of stores and restaurants around the corner. Only 15-20 minutes from Downtown Houston.","[Apollo Mission Control Center, East End Historic District, ELISSA (Bark), Lucas Gusher, Spindletop Oil Field, San Jacinto Battlefield, Space Environment Simulation Laboratory, Chambers A and B, Strand Historic District, USS TEXAS, Woodland]",https://www.airbnb.com/rooms/4227275?location=Cleveland%2C%20TX
5859,Copper Cabin,2,$200,Abilene,Copper Cabin is a relaxing get away destination located 10 miles from the Abilene Airport and the Taylor County Expo Center off highway 36. Large front and back porches welcome you to all the comforts of home away from home. Wedding venues located near the...,"[Fort Belknap, Fort Concho, Harrell Site]",https://www.airbnb.com/rooms/17773872?location=Cisco%2C%20TX
4794,Home in Denton 27 min. N of airport,1,$60,Corinth,"Want to get outside of the city after working hard or shopping hard? This is located in a Oakmont Estates, a Golf Course Community. Tennis courts, pool and Clubhouse about 20 minutes from Dallas Galleria Mall. Beautiful quiet neighborhood! Close to Lake Le...","[Dealey Plaza Historic District, Fair Park Texas Centennial Buildings, Fort Belknap, Fort Richardson, Harrell Site, Highland Park Shopping Village, Walter C. Porter Farm, Samuel T. Rayburn House]",https://www.airbnb.com/rooms/635415?location=Corinth%2C%20TX


### Consideration

The weight for the new parameters are really really "heavy" using exponential form. This is justified by the fact that we want only the top results, so if a reviews has a bad parameter it is very far from the top. In this way the top results are really similar to the query. 

Moreover we solve the problem of the equality of the top (similarity) scores of the previous step.

#### Possible develop
One could think of an optimization of the weight coefficients based on the k required

# Bonus Step: Make a nice visualization!

In [52]:
# 2 INPUT
def mapp( user_coordinates=[32.785301, -96.813712], radius=10000 ):
    #user_coordinates = [30.0201379199512, -95.2939960042513] # First b&b
    #user_coordinates = [32.785301, -96.813712] # Dallas coordinates
    #radius = 50000 = 31 miles = 50 km

    # how to decide the zoom_start of the map according to the radius
    radius_in_miles = radius/1609
    zoom_start = (radius_in_miles * 100) // 2
    #print(radius_in_miles,zoom_start)
    in_the_radius = []
    #for lat, long in lat_long_df.iterrows():

    for ind, el in lat_long_df.iterrows():
        coordinates2 = [float(el[0]),float(el[1])]
        temp_dist = distance.distance(user_coordinates, coordinates2).miles
        if temp_dist < radius_in_miles:
            in_the_radius.append(ind)

    m = folium.Map(location=user_coordinates, zoom_start=11) # add zoom start: , zoom_start=zoom_start

    folium.Marker(user_coordinates, 
                  tooltip="<b>You Are HERE</b>", 
                  icon=folium.Icon(color='green', icon='home')).add_to(m)
    
    folium.Circle(
        radius=radius,
        location=user_coordinates,
        color='#3186cc',
        fill=True,
        fill_color='#3186cc'
    ).add_to(m)


    for doc in in_the_radius:
        with open("tsv_documents/doc_" + str(doc) + ".tsv" , encoding="utf8") as csvfile:
                new_file = csv.reader(csvfile, delimiter='\t')
                row1 = next(new_file)
                coord = [float(row1[6]), float(row1[7])]
                tooltip = '<b>'+row1[8]+'</b>'
                popped = folium.Popup('<a href="'+row1[9]+'"target="_blank"> Go to the WEBSITE </a>')
        if coord == user_coordinates:
            continue

        folium.Marker(coord, popup=popped, 
                      tooltip=tooltip, 
                      icon=folium.Icon(color='red', icon='info-sign')).add_to(m)

    m.save('map.html')
    return m

In [53]:
mapp([29.85, -95], 20000)

### Consideration about the map
The Green Marker is the user location and the red marker are the nearest houses.

When you pass we the cursor on a red marker you can see the title of the review. Moreover if you tap a marker a text box opens with "Go to the WEBSITE" if you click the text it opens the AirBnB internet page