# Homework 3 - ADM

First of all, we import all the libraries required to make our code work.


In [1]:
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer 
import os
import math
from collections import Counter
import heapq
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import seaborn as sns
import folium
from folium.plugins import MarkerCluster


## Reading CSV, splitting all the rows to tsv

As required in the homework, we created .tsv documents for every row of the Airbnb DataFrame. We used the standard function writer but we had to include the encoding parameter as well.

In [2]:
counter = 0

with open('Airbnb_Texas_Rentals.csv', 'r',encoding="utf8") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row: # if row is not empty, write a file with this row
            filename = "doc_%s.tsv" % str(counter)
            with open(filename, 'w') as csvfile_out:
                writer = csv.writer(csvfile_out,delimiter="\t")
                writer.writerow(row[1:])
                counter += 1

### Text cleaning

The following task of our homework wanted us to clean the data contained in the tsv files. FIrst of all, to get the total number of documents in the folder we used listdir, a function of the os package. Then we defined functions to go through all the docs in the folder to scan and clean them. We removed punctuation, stopwords, stemmed words and we lowered case every character. 

In [3]:
DIR = '/Users/canta/Desktop/HMW3/doc'
ndocs = (len([name for name in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, name))]))

words=[]
def cleandata(doc):
    txt = doc.read().split('\t')    
    description = txt[4]
    title = txt[7]
    description = description + " " + title    
    #we replace the new lines "tags"
    description = description.replace('\\r', ' ')
    description = description.replace('\\n', ' ')    
    #tokenize the document, strips out the punctuation
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens_d = tokenizer.tokenize(description)
    #lower case of every word
    tokens_d = [w.lower() for w in tokens_d]   
    #stemming
    ps = PorterStemmer()
    tokens_d = [ps.stem(w) for w in tokens_d]   
    #remove stopwords
    stop = set(stopwords.words('english'))
    tokens_d = [w for w in tokens_d if not w in stop]   
    #remove duplicates
    tokens_d = set(tokens_d)
    #append all words we found in words list
    for word in tokens_d:
        words.append(word)
    



In [4]:
for i in range (1,ndocs): #LATER WILL BECOME NDOCS 

    myfile = open('/Users/canta/Desktop/HMW3/doc/doc_'+str(i)+'.tsv', 'r',encoding="utf8")
    cleandata(myfile)


## Search Engine

Here we are creating a dictionary with all the words encountered in every document, so at the end we obtain a dictionary with all __different words__, paired with a numeric __ID__

In [5]:
#creating vocabulary - a dictionary with words as keys and integers as IDs

vocabulary = {}
count = 1
for word in words:
    vocabulary[word] = count
    count += 1

## Inverted Index dictionary

The Inverted Index Dictionary allows us to know in which document(s) a word appears, given its ID. The docs dictionary is computed here and it will be used in the next steps.

In [6]:
#INVERTED INDEX and docs dictionary: a nested dictionary to retrieve the frequencies for every word in each doc

invIndex = {}
docs = {}

for i in range (1,ndocs): 

    myfile = open('/Users/canta/Desktop/HMW3/doc/doc_'+str(i)+'.tsv', 'r',encoding="utf8")
    filename = os.path.basename((myfile.name).replace('.tsv', ''))
    txt = myfile.read().split('\t')    
    description = txt[4]
    title = txt[7]
    
    #merge description and title
    description = description + " " + title    
    description = description.replace('\\r', ' ')
    description = description.replace('\\n', ' ')
    
    #strip punctuation out
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens_inv = tokenizer.tokenize(description)
    
    #lower case 
    tokens_inv = [w.lower() for w in tokens_inv]
   
    #stemming
    ps = PorterStemmer()
    tokens_inv = [ps.stem(w) for w in tokens_inv]
    
    #stopwords
    stop = set(stopwords.words('english'))
    tokens_inv = [w for w in tokens_inv if not w in stop]
    
    #computed here, the length doesn't take into account the stopwords and the punctuation, it includes repetitions
    nparole = len(tokens_inv) 
    
    #the counter allows us to create a dict with every word as a key and the number of times it appears in the doc
    # as value
    c = Counter(tokens_inv)
    
    c = dict(c)
    #to compute easily the tf for each doc we put into the dictionary a key containing the lenght
    c['LUNGHEZZA2018'] =  nparole
    docs[filename] = c
    
    tokens_inv = set(tokens_inv)
  
    #mechanism to build the inverted index dict
    for word in tokens_inv:
        
        
        #for every word: if it is the first time it appears, create a new key in the dict and append the name of
        # the document in which it is present. if it's already in the dict, append the name of the doc
        if(word in vocabulary.keys()):
            
            if (vocabulary[word]) in invIndex:
                invIndex[vocabulary[word]].append(filename)
            else:
                invIndex[vocabulary[word]] = [filename]
            
    
    

## Query

In [18]:
#reading the query as input
query = str(input())


house with garden dallas


In [19]:
#retrieve the word IDs of the query

#manipulate the text of the query: stemming, lower case, strip punctuation and stopwords, remove duplicates

#text manipulation
query = query.split()
ps = PorterStemmer()
query = [ps.stem(w) for w in query]
query = [w.lower() for w in query]
stop = set(stopwords.words('english'))
query = [w for w in query if not w in stop]
query = set(query)

#look at vocabulary dict: from the key of the dict (word) we get the IDs
ids = []
for word in query:
    if word in vocabulary.keys():
        ids.append(vocabulary[word])



In [20]:
#we have the IDs, now let's get the list of documents in which they're in
querydict = {}
for id in ids:
    querydict[id] = invIndex[id]
    

In [21]:
#we need only the docs containing all the words, so the point here is to make the intersection between all the lists
#of docs

querylist = []

#querydict is a kind of little invIndex, it has the same informations, but just for the docs resulted from the query
for key in querydict.keys():
    querylist.append(querydict[key])

#intersection
result = set(querylist[0]).intersection(*querylist)

In [22]:
print(len(result)) #19 total results

19


## Cleaning our data
Since we noticed we have duplicates in our data we decided to get rid of them in the output. Unfortunately, same documents with same description,title and city appears different because of the different link(that actually links to the same web page). We just compare every document description and if the same description already appears we ignore the document. Here is our funciton defined:

In [23]:
def cleanres(result):
    goodres=[]
    urllist=[]
    for each in result:
                myfiles=open('/Users/canta/Desktop/HMW3/doc/'+str(each)+'.tsv','r', encoding="utf8")
                txt = myfiles.read().split('\t')                
                if txt[7] not in urllist: 
                    goodres.append(each)
                    urllist.append(txt[7])
                
    return goodres

In [24]:
result=cleanres(result)

In [25]:
print(len(result)) #some  documents had the same descriptions so we cut them down


11


## Showing the results of the query

In [26]:
#SHOWING THE RESULTS OF THE QUERY
def showres(result):
    title=[]
    descr=[]
    city=[]
    url=[]
    #for each document resulted : open it - append the informations on different lists - show them on a table
    for each in result:
        myfiles=open('/Users/canta/Desktop/HMW3/doc/'+str(each)+'.tsv','r', encoding="utf8")
        txt = myfiles.read().split('\t')
        title.append(txt[7])
        descr.append(txt[4])
        city.append(txt[2])
        url.append(txt[8])

    data_tuples = list(zip(title,descr,city,url))
    table=pd.DataFrame(data_tuples,columns = ["Title", "Description", "City", "Link"])
    pd.set_option('display.max_colwidth', -1)
    return table

In [27]:
table=showres(result)
table

Unnamed: 0,Title,Description,City,Link
0,The Boat House,"Gated Estate in the heart of the DFW metroplex. We are 8-10 minutes away from DFW airport. 30 minutes away from Love Field Airport. Located within minutes from Dallas &amp; Fort Worth, American Airlines Center, AT&amp;T Stadium, Globe Life Park, Gaylord, Great Wolf Lodge, Grapevine Mills, Historic Grapevine, Texas Motor Speedway, DFW museums, Deep Elum, FW Stock Yards, Dallas Arboretum, FW Bass Hall, Dallas Aquarium, FW Japanese Gardens, Dallas Zoo, FW Zoo. Billy Bob's. DFW Lakes. A lot!! To Do",Euless,https://www.airbnb.com/rooms/18932564?location=Colleyville%2C%20TX\n\n
1,Zen & the Art of the Ranch House,"A much-loved family home nestled in a serene garden and located within a short drive to DFW Intl' Airport, Las Colinas, downtown Dallas, Arlington's AT&amp;T Stadium and Lone Star Park. It's ideal for family gatherings, corporate stays or concerts/sports games.",Irving,https://www.airbnb.com/rooms/11502964?location=Coppell%2C%20TX\n\n
2,STUNNING California inspired modern beach townhome,"This one of a kind 1100 sqft townhouse 2 bedroom that is located in a hidden gem center of Dallas. Indulge yourself with entertainment inside and out. Amazingly bright modern barn and beach house feel. The design was inspired by a beach house off Malibu. Not to mention the bamboo garden retreat, patio with fire pit, tranquil hot tub. Many surprises await! \n\nNOTE: We only are allowing single families as our HOA rules are restricted. However traveling friends with a family is accepted.",Dallas,https://www.airbnb.com/rooms/15510582?location=Addison%2C%20TX\n\n
3,14 minutes to Downtown/Fair Park!,"My place is close to The Dallas Arboretum and Botanical Gardens, and White Rock Lake. It only takes 14 minutes to get to Downtown Dallas/Fair Park. You’ll love my place because of the coziness, access to fully loaded kitchen, comfy queen size bed, and quick highway access. Dallas is a city that you need a car. There is plenty of free parking in front of my house. I have another room in this house that I have been Airbnbing since May 2016",Dallas,https://www.airbnb.com/rooms/15708430?location=Balch%20Springs%2C%20TX\n\n
4,Secluded Treasure in Heart of City,"Escape the bustle of Dallas to your own private Guest House located in a garden setting! Unwind poolside or relax in the hot tub. Guest House includes queen bed, full bath, kitchenette, continental breakfast, WiFi. Country in the heart of the City!\n\nSee our other posting \",Dallas,https://www.airbnb.com/rooms/7047741?location=Brazos%20River%2C%20TX\n\n
5,Lakewood Private BD/BA/Study,"You will enjoy a large, clean, charming upstairs bedroom (conveniently separate from my room which is on the opposite side of the house) with attached full bathroom, spacious walk-in closet and your own private study/office in vibrant Lakewood Neighborhood walking distance from White Rock Lake and The Dallas Arboretum and Botanical Gardens. Bedroom includes queen size bed, Egyptian cotton sheets, flatscreen TV with cable, WiFi, fan, adjustable heat and AC, and mini refrigerator.",Dallas,https://www.airbnb.com/rooms/17251254?location=Arlington%2C%20TX\n\n
6,"Little Bear Bungalow, private cottage near lake","Little Bear Bungalow in the heart of East Dallas' Lake &amp; Garden District. 10 minutes to downtown, Baylor Hospital, &amp; SMU. Private entrance, chemical &amp; pet-free stand-alone building w/ fridge, TV, microwave, wifi, 2 bikes. Couples, solo adventure,business traveler. NO CHILDREN. Within a mile find shopping, White Rock Lake (Katy Trail access) Dallas Arboretum, Farmers Market, local restaurants, bars &amp; bistros. Several theater companies &amp; art gallery at the Bath House on the lake. DART handy.",Dallas,https://www.airbnb.com/rooms/13189308?location=Balch%20Springs%2C%20TX\n\n
7,Alla's Garden House in S.W. Dallas.,"Welcome to Alla's Garden House! \nBeautiful private Garden House 3bedrooms, 2bathrooms (sleep 8 -10 people located only 15min away from downtown Dallas, American Airline Center, 7min from Horse Race Arena Grand Prairie, 20min Dallas Cowboys Stadium.",Duncanville,https://www.airbnb.com/rooms/286328?location=Cedar%20Hill%2C%20TX\n\n
8,Waterpark in Dallas,"Backyard paradise close to Plano. I have created a small backyard oasis with a waterpark, koi pond and a botanical garden feel. The house has a three story elevation with an all white and beautiful glass feel. I love my home and would like to share it with you.",Dallas,https://www.airbnb.com/rooms/19096810?location=Addison%2C%20TX\n\n
9,Alla's Rose Room in S.W. Dallas,"Beautifully furnished room with an antique King size canopy bed, private bath, and other fine furnishings TV w/ DVD player in the room. \n Private bathroom. The Garden House, Rose Room, located only 3/10's of a mile from the main bed &amp; breakfast",Duncanville,https://www.airbnb.com/rooms/714222?location=Cedar%20Hill%2C%20TX\n\n


# tfIdf Inverted Index

In [28]:
#NEW Inverted Index, includes tfIdf scores

#we'll have a dictionary as the first invIndex, but with informations about the score of that word in the doc_i
tfidfIdx = {}

for key in invIndex: #here key is the wordID - so: for every word
    
    idf_den = len(invIndex[key]) #n° of different docs containing the word
    
    realWord = (list(vocabulary.keys())[list(vocabulary.values()).index(key)]) #getting word from its ID
    
    for each in invIndex[key]: #each is the doc_i, for the word, loop through all documents in which it appears
        
        #accessing docs dict by two keys: name of doc and word. get the count and divide it by doc length
        tf = docs[each][realWord]/(docs[each] ['LUNGHEZZA2018']) 
        tfidf = tf * math.log10(ndocs/idf_den)
        
        if(key in tfidfIdx.keys()):
            tfidfIdx[key].append((each, tfidf))
        else:
            tfidfIdx[key] = [(each, tfidf)]
            
        

# Cosine similarity scoring

We had to compute the cosine similarity between al the documents. In order to do that we compary every document in result(documents in which **all** the words of the query are contained) with the query. We used the following formula:

$$cosinesimilarity=\frac{dot product of the document and the query}{product of the norms of the document and the query}$$

It is important to say that we took into account the following quantities to calculate the cosine similarity. Term frequencies:

$$tf=\frac{total times of that word in the doc}{number of total words}$$

And inverse document frequency:

$$idf=log\frac{total number of docs}{number of docs with theword in it}$$



In [29]:
#shortcuts to compute the value inside the function below

def eucl_norm(a):
    return math.sqrt(np.dot(a, a))

def cosine_similarity(a, b, c):
    return np.dot(a,b) / (eucl_norm(c) * eucl_norm(b))


In [30]:
def cosine(doc):
    DOCUMENTO = str(doc)
    myfile = open('/Users/canta/Desktop/HMW3/doc/'+str(doc)+'.tsv', 'r',encoding="utf8")
    txt = myfile.read().split('\t')
    #text manipulation process for the doc we're opening
    description = txt[4]
    title = txt[7]
    description = description + " " + title
    description = description.replace('\\r', ' ')
    description = description.replace('\\n', ' ')
    
    tokenizer=nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens_d =tokenizer.tokenize(description)
    tokens_d = [w.lower() for w in tokens_d]
    ps = PorterStemmer()
    tokens_d = [ps.stem(w) for w in tokens_d]
    stop = set(stopwords.words('english'))
    tokens_d = [w for w in tokens_d if not w in stop]
    tokens_d = set(tokens_d)
    
    #entire document
    docwordId = [] #all word IDs of document

    for word in tokens_d: #word in tokens_d is the actual word in the doc
        docwordId.append(vocabulary[word]) #we look into vocabulary dict to get the IDs   
    
    #we are getting tfidf scores from tfidfIdx for every word id in the doc    
    docsQuery = []
    scores = [] #vector of tfidf scores for all words in the doc:   ||d||
    for idx in docwordId: #for every ID - always all words in doc
        for each in tfidfIdx[idx]: #give me its score, given that we're analyzing one document per time
            if(each[0] == DOCUMENTO):
                scores.append(each[1]) #each[1] is the score of that word in the doc
                if(idx in ids): #
                    docsQuery.append(each[1])
                
    #query tfidf  |q| and ||q||
    querytfidf = []
    for ID in ids:
        idf_den = len(invIndex[ID])
        tf = 1/len(ids)
        idf = math.log10(ndocs/idf_den)
        querytfidf.append(tf*idf)
    
    
    a = cosine_similarity(docsQuery,querytfidf, scores)
    return (a,doc)
    
    

## Heap Data Structure

According to the homework requests, we used an [heap data structure](https://www.geeksforgeeks.org/heap-data-structure/) to handle the documents resulting. 

In [31]:
H = []
K = 10  #LENGTH WE HAVE DECIDED (we'll show 10 documents max)
heapq.heapify(H)
for each in result:
    if len(H) < K : #append until we reach k results
        heapq.heappush(H,cosine(each))
    elif H[0] < cosine(each): 
        #once we reach 10 results, compare the smallest element of the heap with the cosine score of doc we're 
        #analysing, if it's bigger, then put it in the heap (heapreplace always replace the smallest element)
        heapq.heapreplace(H,cosine(each))
# Use heapify to rearrange the elements
docli=[]
punt=[]
heapq.heapify(H) 
for each in H:
    x,y=each
    docli.append(y)
    punt.append(x)
    
   


table = showres(docli)
table['Scores'] = punt
table.sort_values("Scores",ascending=False)

Unnamed: 0,Title,Description,City,Link,Scores
7,Alla's Garden House in S.W. Dallas.,"Welcome to Alla's Garden House! \nBeautiful private Garden House 3bedrooms, 2bathrooms (sleep 8 -10 people located only 15min away from downtown Dallas, American Airline Center, 7min from Horse Race Arena Grand Prairie, 20min Dallas Cowboys Stadium.",Duncanville,https://www.airbnb.com/rooms/286328?location=Cedar%20Hill%2C%20TX\n\n,0.411954
8,14 minutes to Downtown/Fair Park!,"My place is close to The Dallas Arboretum and Botanical Gardens, and White Rock Lake. It only takes 14 minutes to get to Downtown Dallas/Fair Park. You’ll love my place because of the coziness, access to fully loaded kitchen, comfy queen size bed, and quick highway access. Dallas is a city that you need a car. There is plenty of free parking in front of my house. I have another room in this house that I have been Airbnbing since May 2016",Dallas,https://www.airbnb.com/rooms/15708430?location=Balch%20Springs%2C%20TX\n\n,0.285221
9,Secluded Treasure in Heart of City,"Escape the bustle of Dallas to your own private Guest House located in a garden setting! Unwind poolside or relax in the hot tub. Guest House includes queen bed, full bath, kitchenette, continental breakfast, WiFi. Country in the heart of the City!\n\nSee our other posting \",Dallas,https://www.airbnb.com/rooms/7047741?location=Brazos%20River%2C%20TX\n\n,0.238099
3,Zen & the Art of the Ranch House,"A much-loved family home nestled in a serene garden and located within a short drive to DFW Intl' Airport, Las Colinas, downtown Dallas, Arlington's AT&amp;T Stadium and Lone Star Park. It's ideal for family gatherings, corporate stays or concerts/sports games.",Irving,https://www.airbnb.com/rooms/11502964?location=Coppell%2C%20TX\n\n,0.203616
5,The Boat House,"Gated Estate in the heart of the DFW metroplex. We are 8-10 minutes away from DFW airport. 30 minutes away from Love Field Airport. Located within minutes from Dallas &amp; Fort Worth, American Airlines Center, AT&amp;T Stadium, Globe Life Park, Gaylord, Great Wolf Lodge, Grapevine Mills, Historic Grapevine, Texas Motor Speedway, DFW museums, Deep Elum, FW Stock Yards, Dallas Arboretum, FW Bass Hall, Dallas Aquarium, FW Japanese Gardens, Dallas Zoo, FW Zoo. Billy Bob's. DFW Lakes. A lot!! To Do",Euless,https://www.airbnb.com/rooms/18932564?location=Colleyville%2C%20TX\n\n,0.201015
4,Alla's Rose Room in S.W. Dallas,"Beautifully furnished room with an antique King size canopy bed, private bath, and other fine furnishings TV w/ DVD player in the room. \n Private bathroom. The Garden House, Rose Room, located only 3/10's of a mile from the main bed &amp; breakfast",Duncanville,https://www.airbnb.com/rooms/714222?location=Cedar%20Hill%2C%20TX\n\n,0.196801
1,Waterpark in Dallas,"Backyard paradise close to Plano. I have created a small backyard oasis with a waterpark, koi pond and a botanical garden feel. The house has a three story elevation with an all white and beautiful glass feel. I love my home and would like to share it with you.",Dallas,https://www.airbnb.com/rooms/19096810?location=Addison%2C%20TX\n\n,0.190903
6,"Little Bear Bungalow, private cottage near lake","Little Bear Bungalow in the heart of East Dallas' Lake &amp; Garden District. 10 minutes to downtown, Baylor Hospital, &amp; SMU. Private entrance, chemical &amp; pet-free stand-alone building w/ fridge, TV, microwave, wifi, 2 bikes. Couples, solo adventure,business traveler. NO CHILDREN. Within a mile find shopping, White Rock Lake (Katy Trail access) Dallas Arboretum, Farmers Market, local restaurants, bars &amp; bistros. Several theater companies &amp; art gallery at the Bath House on the lake. DART handy.",Dallas,https://www.airbnb.com/rooms/13189308?location=Balch%20Springs%2C%20TX\n\n,0.162328
2,Dallas Downton Abbey,"Our Dallas Downton Abbey is close to downtown, Southern Methodist University, George W. Bush Library, the beautiful shops at Northpark and Highland Park Village, The Perot Museum, and Love field Airport. You’ll love this gorgeous retreat in the city because it is elegant, safe, clean, and bright, with lush gardens and tree views and exquisite craftsmanship, it includes a full home spa with a Vichy shower, three steam rooms, with a large salt water pool and hot tub, and a full time house manager.",Dallas,https://www.airbnb.com/rooms/14254297?location=Addison%2C%20TX\n\n,0.148958
0,Lakewood Private BD/BA/Study,"You will enjoy a large, clean, charming upstairs bedroom (conveniently separate from my room which is on the opposite side of the house) with attached full bathroom, spacious walk-in closet and your own private study/office in vibrant Lakewood Neighborhood walking distance from White Rock Lake and The Dallas Arboretum and Botanical Gardens. Bedroom includes queen size bed, Egyptian cotton sheets, flatscreen TV with cable, WiFi, fan, adjustable heat and AC, and mini refrigerator.",Dallas,https://www.airbnb.com/rooms/17251254?location=Arlington%2C%20TX\n\n,0.142758


## DEFINE A NEW SCORING FUNCTION


We have defined three new functions: one for the distance index, one for the price index and one for final new scoring function(plus one nice graph to show the distribution for the prices per bedrooms). 

## definedistance

So, the starting point is our set **result**. This set contains all the documents in which the query is contained. We decide to compute the distance of every documents from the city centre of its own city centre. We have done that usign the Geopymodule. We have also normalized our values by dividing by the maxium distance found in one of the documents of result. Here is the formula:

$$\frac{distance of the single document}{maximum distance of one document in result}$$

The outuput of this formula is a list with all the distances normalized.

## defprice

Using the same starting point(result=set containing all the documents in which the query is contained), we decided to use the variables 'average_rate_per_night' and 'bed_room_count' to calculate our final scoring function. First of all, we computed the 'price per bedroom' variable with:

$$listpricesperbed=\frac{averageratepernight}{bedroomcount}$$

We assigned 'bed_room_count'==0 if its values was 'Studio'. At this point we decided to use the quantile distribution of this new variable we have created and we opted to boost all the documents appearing in the second quartile(Practically speaking, the ones that were below the median(low price per bedroom). We used this kind of weighting process:

 -listpricesperbed[doc]*0.25 if it was contained in the **first quartile**
 
 -listpricesperbed[doc]*0.40 if it was contained in the **second quartile**

 -listpricesperbed[doc]*0.25 if it was contained in the **third quartile**

 -listpricesperbed[doc]*0.10 if it was contained in the **fourth quartile**
 
Obviously the sum of the weights equals to 1. And finally, we divided each element by the maximum contained in that list.
 
 This fucntion returns a list of prices per bedrooms with all the weighting process applied to it. 
 
 ## def scoringfunction
 
 As our last function we created the new final scoring function. This takes in every document in result with its distance and its price(given by the previous functions). We used this formula to obtain our final scoring function:
 
 $$newscore==(Weightforprice)*(1-pricesperbed)+(weightfordistance)(1-distance)$$
 
 THe weights for both the distance and the price can be decided arbitrarily and we decided to weight more the distance than the prices. The correspondant weights are: 
 $$Weightforprice==0.40$$
 
 $$Weightforprice==0.60$$
 
 This function the returns a value to rank our documents with and the name of the documents which refers to.
 
 
 
 
 **P.S.**: We know we could have written and called everything in a single function but we decided to split the funcitons in order to make everything more understandable (Hopefully we did succeed).

In [32]:
def definedistance(result):
    distances=[]
    for each in result:
        #We import the files and we save city and coordinates in order to compute the distances
        myfiles=open('/Users/canta/Desktop/HMW3/doc/' +str(each)+'.tsv','r', encoding="utf8")
        doc = myfiles.read().split("\t")
        city =doc[2].lower()
        long=doc[6]
        lat=doc[5]
        geolocator = Nominatim(user_agent="mb") 
        #we use geolocator to compute distances
        locationQ = ((geolocator.geocode(city)).latitude, (geolocator.geocode(city)).longitude) 
        loc = (lat,long) #coordinates retrieved by the doc
        distance = geodesic(locationQ, loc).miles
        distances.append(distance)
    #we normalize this quantity by dividing by the max distace found
    mass=max(distances)
    finaldist = [x / mass for x in distances]
    return (list(finaldist))



In [33]:
def defprice(result):
    listpricesperbed =[]
    for each in result:
        #we import the files needed and we save price and bedroom in order to compute our new index.
        myfiles=open('/Users/canta/Desktop/HMW3/doc/'+str(each)+'.tsv','r', encoding="utf8")
        doc = myfiles.read().split("\t")
        price=doc[0]
        bedrooms=(doc[1])
        price=price.replace("$","") #we replace dollar sign in price
        if(bedrooms=='Studio'):
            listpricesperbed.append(0) #automatically assign 0 if an apartment has no bedroom
            continue          
        listpricesperbed.append(int(price)/int(bedrooms))
    global finallist
    #we transform the list in an array in order touse percentile function and assign new scores
    finallist=np.array(listpricesperbed)   
    firstp=np.percentile(finallist,25)  #First quantile
    secondp=np.percentile(finallist,50) #second quantile 
    thirdp=np.percentile(finallist,75) #third quantile
    fourthp=np.percentile(finallist,100) #fourth quantile
    #here we assign different scores depending on which quantiles we are into. 
#   -listpricesperbed[doc]*0.25 if it was contained in the first quartile
#   -listpricesperbed[doc]*0.40 if it was contained in the second quartile
#   -listpricesperbed[doc]*0.25 if it was contained in the third quartile
#   -listpricesperbed[doc]*0.10 if it was contained in the fourth quartile
    for i in range(len(finallist)):
        if finallist[i]<firstp:
            finallist[i]=finallist[i]*0.25
        elif finallist[i]<=secondp and finallist[i]>firstp :
            finallist[i]=finallist[i]*0.40
        elif finallist[i]<=thirdp and finallist[i]>secondp :
            finallist[i]=finallist[i]*0.25
        elif finallist[i]<=fourthp and finallist[i]>thirdp:
            finallist[i]=finallist[i]*0.10 
    #we normalize this quantity by dividing by the max price found
    lista=finallist/(max(finallist))
    return(list(lista))


In [34]:
def scoringfunction(doc,distanza,price):
    #new scoring function based on what we defined before.
    newscore=(0.40*(1-price))+(0.60*(1-distanza))
    return (newscore,doc)

In [35]:
distan = definedistance(result)  
pricesperbeds = defprice(result)
H = []
K=10  #LENGTH WE HAVE DECIDED 
heapq.heapify(H)
i=0
for each in result:
    if len(H)<K:
        heapq.heappush(H,scoringfunction(each,distan[i],pricesperbeds[i]))
    elif H[0] < scoringfunction(each,distan[i],pricesperbeds[i]): 
        #once we reach 10 results, compare the smallest element of the heap with the cosine score of doc we're 
        #analysing, if it's bigger, then put it in the heap (heapreplace always replace the smallest element)
        heapq.heapreplace(H,scoringfunction(each,distan[i],pricesperbeds[i]))
    i+=1
# Use heapify to rearrange the elements
docli=[]
punt=[]
heapq.heapify(H) 
for each in H:
    x,y=each
    docli.append(y)
    punt.append(x)
    
   

table=showres(docli)
table['New Scores']=punt
table.sort_values("New Scores",ascending=False)

Unnamed: 0,Title,Description,City,Link,New Scores
7,Alla's Garden House in S.W. Dallas.,"Welcome to Alla's Garden House! \nBeautiful private Garden House 3bedrooms, 2bathrooms (sleep 8 -10 people located only 15min away from downtown Dallas, American Airline Center, 7min from Horse Race Arena Grand Prairie, 20min Dallas Cowboys Stadium.",Duncanville,https://www.airbnb.com/rooms/286328?location=Cedar%20Hill%2C%20TX\n\n,0.914135
8,Zen & the Art of the Ranch House,"A much-loved family home nestled in a serene garden and located within a short drive to DFW Intl' Airport, Las Colinas, downtown Dallas, Arlington's AT&amp;T Stadium and Lone Star Park. It's ideal for family gatherings, corporate stays or concerts/sports games.",Irving,https://www.airbnb.com/rooms/11502964?location=Coppell%2C%20TX\n\n,0.852574
5,The Boat House,"Gated Estate in the heart of the DFW metroplex. We are 8-10 minutes away from DFW airport. 30 minutes away from Love Field Airport. Located within minutes from Dallas &amp; Fort Worth, American Airlines Center, AT&amp;T Stadium, Globe Life Park, Gaylord, Great Wolf Lodge, Grapevine Mills, Historic Grapevine, Texas Motor Speedway, DFW museums, Deep Elum, FW Stock Yards, Dallas Arboretum, FW Bass Hall, Dallas Aquarium, FW Japanese Gardens, Dallas Zoo, FW Zoo. Billy Bob's. DFW Lakes. A lot!! To Do",Euless,https://www.airbnb.com/rooms/18932564?location=Colleyville%2C%20TX\n\n,0.838822
9,Secluded Treasure in Heart of City,"Escape the bustle of Dallas to your own private Guest House located in a garden setting! Unwind poolside or relax in the hot tub. Guest House includes queen bed, full bath, kitchenette, continental breakfast, WiFi. Country in the heart of the City!\n\nSee our other posting \",Dallas,https://www.airbnb.com/rooms/7047741?location=Brazos%20River%2C%20TX\n\n,0.795215
4,Alla's Rose Room in S.W. Dallas,"Beautifully furnished room with an antique King size canopy bed, private bath, and other fine furnishings TV w/ DVD player in the room. \n Private bathroom. The Garden House, Rose Room, located only 3/10's of a mile from the main bed &amp; breakfast",Duncanville,https://www.airbnb.com/rooms/714222?location=Cedar%20Hill%2C%20TX\n\n,0.790375
6,Lakewood Private BD/BA/Study,"You will enjoy a large, clean, charming upstairs bedroom (conveniently separate from my room which is on the opposite side of the house) with attached full bathroom, spacious walk-in closet and your own private study/office in vibrant Lakewood Neighborhood walking distance from White Rock Lake and The Dallas Arboretum and Botanical Gardens. Bedroom includes queen size bed, Egyptian cotton sheets, flatscreen TV with cable, WiFi, fan, adjustable heat and AC, and mini refrigerator.",Dallas,https://www.airbnb.com/rooms/17251254?location=Arlington%2C%20TX\n\n,0.670665
3,14 minutes to Downtown/Fair Park!,"My place is close to The Dallas Arboretum and Botanical Gardens, and White Rock Lake. It only takes 14 minutes to get to Downtown Dallas/Fair Park. You’ll love my place because of the coziness, access to fully loaded kitchen, comfy queen size bed, and quick highway access. Dallas is a city that you need a car. There is plenty of free parking in front of my house. I have another room in this house that I have been Airbnbing since May 2016",Dallas,https://www.airbnb.com/rooms/15708430?location=Balch%20Springs%2C%20TX\n\n,0.667929
2,"Little Bear Bungalow, private cottage near lake","Little Bear Bungalow in the heart of East Dallas' Lake &amp; Garden District. 10 minutes to downtown, Baylor Hospital, &amp; SMU. Private entrance, chemical &amp; pet-free stand-alone building w/ fridge, TV, microwave, wifi, 2 bikes. Couples, solo adventure,business traveler. NO CHILDREN. Within a mile find shopping, White Rock Lake (Katy Trail access) Dallas Arboretum, Farmers Market, local restaurants, bars &amp; bistros. Several theater companies &amp; art gallery at the Bath House on the lake. DART handy.",Dallas,https://www.airbnb.com/rooms/13189308?location=Balch%20Springs%2C%20TX\n\n,0.584124
1,STUNNING California inspired modern beach townhome,"This one of a kind 1100 sqft townhouse 2 bedroom that is located in a hidden gem center of Dallas. Indulge yourself with entertainment inside and out. Amazingly bright modern barn and beach house feel. The design was inspired by a beach house off Malibu. Not to mention the bamboo garden retreat, patio with fire pit, tranquil hot tub. Many surprises await! \n\nNOTE: We only are allowing single families as our HOA rules are restricted. However traveling friends with a family is accepted.",Dallas,https://www.airbnb.com/rooms/15510582?location=Addison%2C%20TX\n\n,0.440475
0,Dallas Downton Abbey,"Our Dallas Downton Abbey is close to downtown, Southern Methodist University, George W. Bush Library, the beautiful shops at Northpark and Highland Park Village, The Perot Museum, and Love field Airport. You’ll love this gorgeous retreat in the city because it is elegant, safe, clean, and bright, with lush gardens and tree views and exquisite craftsmanship, it includes a full home spa with a Vichy shower, three steam rooms, with a large salt water pool and hot tub, and a full time house manager.",Dallas,https://www.airbnb.com/rooms/14254297?location=Addison%2C%20TX\n\n,0.31615


Just a rapid comment, Alla's Garden House in S.W. Dallas is the best house because with our weighting system it's the **best** option for low price and small distance from the city centre.

On the other hand Dallas Downtown Abbeyappears to be the **worst** one

## MAP


In [None]:
os.chdir(r'/Users/canta/Desktop/HMW3/')
df = pd.read_csv("Airbnb_Texas_Rentals.csv")
info = pd.DataFrame()
info["latitude"] = df["latitude"]
info["longitude"] = df["longitude"]
info["url"] = df["url"]
info.head()

In [None]:
def map_func():
    
    geolocator = Nominatim(user_agent="Texas")
    
    text = input("Please enter either the 'place' or the 'coordinates' of your choice [place/coordinates]: ")
    
    
    
    if text == "place":
        place = input("Type the name of the place you are searching for: ")
        rad = int(input("Enter the distance in km: "))
        location = geolocator.geocode(place) 
        #This gives the coordinates of the place
        loc_input = (location.latitude, location.longitude)
    elif text == "coordinates":
        lat = float(input("Enter the latitude: "))
        long = float(input("Enter the longitute: "))
        loc_input = (lat, long)
        #This gives the address of the coordinates
        location = geolocator.reverse(loc_input) 
        print("The coordinates you entered belongs to this address: " + location.address) 

        rad = int(input("Enter the distance in km: "))
        
    else:
        return print("Incorrect input! Enter only 'place' or 'coordinates'") 
    
    tooltip = "Click here"
    #Creating main map
    main_map = folium.Map( 
    location = loc_input,
    zoom_start = 12
    )
    rad = rad
    #creating s circle within the prescribed distance
    folium.Circle( 
    location = loc_input, 
    radius = rad,
    color = '#3186cc',
    fill = True,
    fill_color = '#3186cc'
    #fill_opacity: 0.6
    ).add_to(main_map)

    #Addning markers
    folium.Marker(location = loc_input, icon = folium.Icon(color='red')).add_to(main_map) 
    
    #Create clusters of the points
    mark_cluster = MarkerCluster().add_to(main_map)
    
    #Creating markers for each location in dataframe
    for i, x in info.iterrows():
        point = (x["latitude"], x["longitude"])
        geography = geodesic(point, loc_input).meters
        
        
        if geography <= rad:
            folium.Marker(location = (x["latitude"], x["longitude"]),
                          popup = folium.Popup('<a href="'+x["url"]+'"'+'target="_blank"> [Click the url for more information] </a>'),
                          tooltip = tooltip
                         ).add_to(mark_cluster)
    
    return main_map

In [None]:
#Map avalaible in the readme file on GitHub
map_func()

