# Homework 3 - Find the perfect place to stay in Texas!

Here we download all the necessary packages

In [2]:
import json
import csv
import nltk
import string
import re
import math
import nltk.stem as stemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
from heapq import nlargest
from datetime import datetime
from IPython.display import display, HTML
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/francois/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/francois/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/francois/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Here we convert csv file to tsv

In [3]:
fcsv = open('Airbnb_Texas_Rentals.csv','r', errors='ignore')
ftsv = open('Airbnb_Texas_Rentals.tsv','w', errors='ignore')
with fcsv as csvin, ftsv as tsvout:
    csvin = csv.reader(csvin)
    tsvout = csv.writer(tsvout, delimiter='\t')

    for row in csvin:
        tsvout.writerow(row)

This function below preprocess data by deleting all the stopwords and punctuations

In [4]:
def tokme(query):
    tokens = nltk.word_tokenize(query)
    tokens = [i.lower() for i in tokens if ( i not in string.punctuation and i not in stop_words)]
    tokens = [i.replace(" ", "") for i in tokens]
    return tokens

# Search Engine 1

***Conjunctive query***

In [5]:
#This function creates dictionary where store the whole set of preprocessed words used 
#in listings as keys and docs where they are used as values
#It is our inverted index
def createWordDict(row, doc_name, wordDict):
    row = row[5]+row[8]
    replaced = re.sub(r'[\/\.]', ' ', row.replace("\\n"," "))
    row = list(set(tokme(replaced)))
    for word in row:
        if word in wordDict :
            wordDict[word].append(doc_name)
        else:
            wordDict[word] = [doc_name]
    return wordDict

In [6]:
#This function create TSV file for each row
def createTSV(row, doc_name):
    tsvout = open('Docs/'+doc_name,'w', errors='ignore')
    tsvout = csv.writer(tsvout, delimiter='\t')
    tsvout.writerow(row)

In [7]:
#Using this function we save inverted index
#Also we create a vocabulary for all the words used in listings
def saveVoc(wordDict):
    with open('wordDict.json', 'w') as fp:
        json.dump(wordDict, fp)
    with open('vocabulary.csv', 'w') as fv:
        wr = csv.writer(fv, quoting=csv.QUOTE_ALL)
        vocabulary = wordDict.keys()
        wr.writerow(vocabulary)

In [8]:
#This function preprocesses the query
def getQuery(query):
    listQuery = tokme(query)
    return(listQuery)

In [9]:
#Using this function we create a final list of the documents.
#This will be an output of the first engine
def searchQuery(query, wordDict):
    result = []
    wordsNotFound = []
    # case word not in dict
    for word in query:
        if word not in wordDict:
            wordsNotFound.append(word)
            continue
        if result != []:
            result = list(set(wordDict[word]) & set(result))
        else:
            result = wordDict[word]
    return(result, wordsNotFound)

In [10]:
#Using this function we print the output of our query
def printResult(results, wordsNotFound):
    for r in results:
        df=pd.read_csv('Docs/'+r, sep='\t', usecols=[3, 5, 8, 9], header=None)
        print(df)
    if wordsNotFound != []:
        print ("Words that the documents don't contain : " + ", ".join(wordsNotFound))

In [11]:
# This function is a main function where we preprocess the data, create an inverted index and save it.
def preprocessingData():
    fcsv = open('Airbnb_Texas_Rentals.csv','r',errors='ignore')
    wordDict = {}
    with fcsv as csvin:
        csvin = csv.reader(csvin)
        for i,row in enumerate(csvin):
            doc_name = "doc_"+str(i)+".tsv"
            createTSV(row, doc_name)
            wordDict = createWordDict(row, doc_name, wordDict)      
    saveVoc(wordDict)

In [12]:
# READS INVERTED INDEX FROM JSON FILE
def loadJSONDict():
    json1_file = open('wordDict.json')
    json1_str = json1_file.read()
    json1_data = json.loads(json1_str)
    return json1_data

In [13]:
# THIS IS THE FIRST SEARCH ENGINE.
# HERE WE READ the inverted index from JSON file
# INSERT QUERY
# AND PRINT THE RESULT OF THE SEARCH
def SearchEngine(query):
    wordDict = loadJSONDict()
    #print(wordDict)
    query = getQuery(query)
    results, wordsNotFound = searchQuery(query, wordDict)
    printResult(results, wordsNotFound)

In [14]:
preprocessingData()

In [None]:
query = input()
SearchEngine(query)

# Search Engine 2

***Conjunctive query & Ranking score***

In [15]:
def createTSV2(row, doc_name):
    """Save our row (apartment) in a tsv file """
    tsvout = open('Docs/'+doc_name,'w', errors='ignore')
    tsvout = csv.writer(tsvout, delimiter='\t')
    tsvout.writerow(row)

In [16]:
def createWordDict2(row, doc_name, wordDict, distDocDict):
    """Fill the inverted index and the dictionary of distance of every documents
    Input : row
            doc_name : document name
            wordDict : empty dictionary, inverted index empty
            distDocDict : empty dictionary , will contain the distance of every documents
    
    Output : wordDict : dictionary, the inverted index
             distDocDict : dictionary contain the distance of every documents"""
    # we select only the description and the title
    row = row[5]+row[8]
    # we replace '\','.','\\n' by a space
    replaced = re.sub(r'[\/\.]', ' ', row.replace("\\n"," "))
    row = tokme(replaced)
    for word in row:
        # fill the dictionary of distances
        if doc_name in distDocDict:
            distDocDict[doc_name].append(row.count(word))
        else:
            distDocDict[doc_name] = [row.count(word)]
        # fill the inverted index
        if word in wordDict :
        # create a tuple : (document name, number of time the word appear in the doc, 0 but will be replace by the TFIDF)
            if wordDict[word][-1][0] == doc_name: # if last doc of the list for the word is the current doc
                # add 1 to the number of time the word appear in the doc
                wordDict[word][-1] = (wordDict[word][-1][0], wordDict[word][-1][1]+1, 0)
            else:
                # create the tuple
                wordDict[word].append((doc_name, 1, 0))#doc_name, nb of word, FdIdf
        else:
            wordDict[word] = [(doc_name, 1, 0)] #doc_name, nb of word, FdIdf
    return (wordDict, distDocDict)

In [17]:
def calcTFIDF(wordDict, nbDoc):
    """Calculate the TFIDF for every word and every document
    Input : 
        wordDict : inverted index without the TFIDF
        nbDoc : number of documents
    
    Output : 
        wordDict : inverted index with the TFIDF"""
    for word, listDoc in wordDict.items():
        # for every list of tuple (document name, number of apearence of the word in the document, TFIDF)
        listDoc2 = [(t[0], t[1], math.log10(nbDoc/len(listDoc))*t[1]) for t in listDoc]
        wordDict[word] = listDoc2
    return wordDict

In [18]:
def calcDistDoc(distDocDict):
    """Calculate the distance of every documents
    Input : Dictionary 
            key : document name
            value : list (int) of number of apearance of every word in the document
            
    Output : Dictionary
            key : document name
            value : distance of the document"""
    distDocDict2 = {doc : math.sqrt(sum(map(lambda x:x**2,l))) for doc,l in distDocDict.items()}
    return distDocDict2

In [19]:
def saveVoc2(wordDict):
    """Save the inverted index and the vocabulary"""
    # save the inverted index to json
    with open('wordDict2.json', 'w') as fp:
        json.dump(wordDict, fp)
    # save all the words in a vocabulary file
    with open('vocabulary2.csv', 'w') as fv:
        wr = csv.writer(fv, quoting=csv.QUOTE_ALL)
        vocabulary = wordDict.keys()
        wr.writerow(vocabulary)

In [20]:
def saveDocDist(distDocDict):
    """Save the dictionary of the distance of the documents"""
    with open('distDocDict.json', 'w') as fp:
        json.dump(distDocDict, fp)

In [21]:
def preprocessingData2():
    """Preprocessing the data"""
    fcsv = open('Airbnb_Texas_Rentals.csv','r',errors='ignore')
    wordDict = {} # inverted index
    distDocDict = {} # dictionary of the distance for every file
    nbDoc = 0 # number of documents
    with fcsv as csvin:
        csvin = csv.reader(csvin)
        for i,row in enumerate(csvin):
            # we don't take the first line of the CSV file
            if i == 0: continue
            doc_name = "doc_"+str(i)+".tsv"
            createTSV2(row, doc_name) # create the TSV files
            wordDict, distDocDict = createWordDict2(row, doc_name, wordDict, distDocDict) # fill the dictionaries
            nbDoc = i
        wordDict = calcTFIDF(wordDict, nbDoc) # compute the TFIDF
        distDocDict = calcDistDoc(distDocDict) # compute the distance of every documents
    saveVoc2(wordDict) # save the inverted index
    saveDocDist(distDocDict) # save the dictionary of distance

In [22]:
preprocessingData2()

In [23]:
def getQuery2(query):
    """Remove ponctuation and stopwords from the query and return it under the form of a list"""
    listQuery = tokme(query)
    return(listQuery)

In [24]:
def loadJSONDict2():
    """Load the inverted index"""
    json1_file = open('wordDict2.json')
    json1_str = json1_file.read()
    json1_data = json.loads(json1_str)
    return json1_data

In [25]:
def loadDocDist():
    """Load the file containing the distances"""
    json1_file = open('distDocDict.json')
    json1_str = json1_file.read()
    json1_data = json.loads(json1_str)
    return json1_data

In [26]:
def calcDist(listnbWord):
    """return the sum of square of all the element in a list. Use to calculate the distance of the query
    Input : list of int
    
    Output : int"""
    return math.sqrt(sum(map(lambda x:x**2,listnbWord)))

In [27]:
def searchQuery2(query, wordDict):
    """We search every documents that contain every word of the query. If a word is not in the documents, 
    we put it in a list and continue the search
    
    Input : query, inverted index
    
    Output : 
        result : list of documents that contain every word of the query
        wordsNotFound : list of the words of the query that are not in the inverted index
        distQuery : distance of the query"""
    result = [] # list of documents that contain every word of the query
    wordsNotFound = [] # list of the words of the query that are not in the inverted index
    nbWord = [] # list of the count of every word in the query
    querySet = list(set(query))
    for word in querySet:
        nbWord.append(query.count(word))
         # case word not in dict
        if word not in wordDict:
            wordsNotFound.append(word)
            continue
        if result != []:
            # get every document name of the list of the world in the inverted index
            lDoc = [t[0] for t in wordDict[word]]
            # the intersection between the list and the result
            result = list(set(lDoc) & set(result))
        else:
            result = [t[0] for t in wordDict[word]]
    # Compute the distance of the query
    distQuery = calcDist(nbWord)
    return(result, wordsNotFound, distQuery)

In [28]:
def calcAllCosineSim(results, query, distQuery, wordDict, distDocDict):
    """Compute the cosine similarity between the query and all the documents of the result
    Input : query
            distQuery : distance of the query
            wordDict : inverted index
            distDocDict : dictionary of the distance of every documents
            
    Output : dictionary of the cosine similarity (key : document name, value cosine similarity)"""
    cosineSimDict = {} # dictionary with key is the document name and the value is the cosine similarity
    for doc in results:
        lValue = sum([wordDict[word] for word in list(set(query))], [])
        TFIDF = [t[2] for t in lValue if t[0]==doc] # get the TFIDF of the document
        cosineSim = calcCosineSim(TFIDF, distDocDict[doc], distQuery) # compute the cosine similarity
        cosineSimDict[doc] = cosineSim # put it in the dictionary
    return cosineSimDict

In [29]:
def calcCosineSim(TFIDF, distDoc, distQuery):
    """return the cosine similarity between a document and the query"""
    return(sum(TFIDF)/(distDoc*distQuery))

In [30]:
def printResult2(results, wordsNotFound, cosineSimDict):
    """Print the best k results (here k=10) and the worlds not found"""
    # Create a DataFrame which will contains all the results
    df = pd.DataFrame(index=range(len(results)), columns=["City", "Description", "Title", "Link", "cosineSim"])
    for i,r in enumerate(results):
        # load the document in a DataFrame (dfd)
        dfd = pd.read_csv('Docs/'+r, sep='\t', usecols=[3, 5, 8, 9], header=None, names=["City", "Description", "Title", "Link"], encoding="latin-1")
        dfd['cosineSim'] = pd.Series(cosineSimDict[r]) # add the cosine similarity to the dfd
        df.loc[i] = dfd.loc[0] # add it to the big DataFrame
    df = df.sort_values(by='cosineSim', ascending=False) # Sort the DataFrame by the cosine Similarity
    print(df.head(10)) # print the k best results, here k=10
    # print the words that have not been found
    if wordsNotFound != []:
        print ("Words that the documents don't contain : " + ", ".join(wordsNotFound))

In [31]:
def SearchEngine2(query):
    """Search the result for a query and print it
    Input : query"""
    wordDict = loadJSONDict2() # load the inverted index
    distDocDict = loadDocDist() # load the dictionary of the distances
    query = getQuery2(query)
    results, wordsNotFound, distQuery = searchQuery2(query, wordDict) # research for the query
    # calculate the cosine similarity between the documents of the result and the query
    cosineSimDict = calcAllCosineSim(results, query, distQuery, wordDict, distDocDict)
     # print the result
    printResult2(results, wordsNotFound, cosineSimDict)

In [33]:
query = input()
SearchEngine2(query)

a beautiful house with garden
           City                                        Description  \
10       Austin  A beautiful, modern home, surrounded by a beau...   
13       Austin  A beautiful, modern home, surrounded by a beau...   
15  Duncanville  Welcome to Alla's Garden House! \nBeautiful pr...   
7        Spring  Attractions: The Woodlands, incredible views, ...   
6        Spring  Beautiful 2100 sq. ft house! Three bedroom and...   
11       Dallas  Backyard paradise close to Plano. I have creat...   
4    Fort Worth  Explore the City of Cowboys and Culture from t...   
8    Fort Worth  Explore the City of Cowboys and Culture from t...   
1       Helotes  Renovated historic barn with concrete floors, ...   
0          Kyle  Three room house situated in the Hill Country ...   

                                                Title  \
10   Stay at an architect's gorgeous home and garden!   
13   Stay at an architect's gorgeous home and garden!   
15                Alla's Gar

# Step 4: Define a new score!


In this part we will try to create new score to evaluate the compliance of documents with the request.
According to the conditions of this task first we should obtain a set of documents using __Search engine__ from the step __3.1__. Then compute the new score for each of this documents. We are not allowed to use the "description" and the "title" in each doc in the way we did before.

In [15]:
def loadWordDict():
    with open('wordDict.json', 'r', encoding='latin-1') as fh:
        return json.load(fh)

This function below computes the new score for the documents obtained from the output of SearchQuery() function. We are using 
information about the average price per night, number of rooms, city, date of publication and title.
New score for each docment is computed with further formula:
<br><br>$TotalScore = CityScore*0.1 + PriceScore*0.15 + TitleScore*0.2 + BedScore*0.25 + DateScore*0.3$ <br><br>
    __CityScore__ gives weigth of 0.1 to the apartments which are located in the city mentioned in the query. This is a binaru score.<br>
    __PriceScore__ computes score according to average price of the apartments:
<br>$1-\frac{averege-price-of-the-apartment-per-night}{sum-of-avprice-for-all-apartemnts-in-output}$<br><br>
    __TitleScore__ computes score based on the similarity between the clients request (query) and the documents title:<br><br>$\frac{len(Query\, \cap\, Title)}{len(Query\, \cap\, Title)}$<br><br>
    __BedScore__ is a binary score which adds 0.25 to the total weight if the apartment consist of the necessary number of beds or bedrooms indicated in the query.<br>
    __DateScore__ gives more weight to the new publications rather than old:
<br><br>$\frac{index-of-the-publication-in-the-sorted-list-of-publications\,+\,1}{len(list-of-publications)}$<br><br>

In [16]:
def newScore(result, query):
    resdict={}
    for r in result:# take doc name from result list
        with open ('Docs/'+r, "r") as myfile: #read string from the tsv fileto list
            lst = myfile.readlines()[0].split(sep="\t") 
            lst[1]=lst[1].replace('$','').replace(' ','') # preprocess data
            lst[2]=lst[2].replace('Studio','0')
            if len(lst[1])>0:lst[1]=int(lst[1])
            else:lst[1]=0
            resdict[r]=[lst[1], lst[2],lst[3],lst[4],lst[8]] #save it in dictionary
    # convert dict to Dataframe    
    resdf = pd.DataFrame.from_dict(resdict, orient='index', columns=["price","bedroom","city","date","title"])
    #create dataframe for score computing
    nsdf = pd.DataFrame(0, index=result, columns=["price","bedroom","city","date","title","totalscore"])
    newresult=[]
    for r in result:#compute all the elements of the totalscore in this loop
        #CityScore
        if len(set(tokme(resdf.loc[r, "city"]))&set(query))>0: nsdf.loc[r, "city"] = 0.1
        #PriceScore
        if resdf.loc[r, "price"]!=0: nsdf.loc[r, "price"] = (1-resdf.loc[r, "price"]/resdf['price'].sum())*0.15 
        #TitleScore
        nsdf.loc[r, "title"] = len(set(query)&set(tokme(resdf.loc[r, "title"])))/len(set(query)|set(tokme(resdf.loc[r, "title"]))) * 0.2
        #BedScore
        #Here we should preprocess the query
        listquery = [i.replace("studio", "0").replace("one", "1").replace("two", "2").replace("three", "3").replace("four", "4")\
                     .replace("five", "5").replace("six", "6").replace("seven", "7").replace("eight", "8")\
                     .replace("nine", "9").replace("ten", "10").replace("elven", "11").replace("twelve", "12")\
                     .replace("thirteen", "13") for i in query]
        p = re.compile('[0-13]\sbed*').findall(" ".join(listquery))
        if len(p)>0 and str(resdf.loc[r,"bedroom"]) == (p[0].split(" "))[0]:  nsdf.loc[r,"bedroom"]=0.25
        #DateScore
        my_date = (resdf.sort_values("date", axis=0, ascending=False, inplace=False, na_position='last'))["date"].tolist()
        nsdf.loc[r,"date"] = (my_date.index(resdf.loc[r,"date"])+1)/len(my_date)*0.3
    #TotalScore
    nsdf["totalscore"] = nsdf.sum(axis=1)
    nsdf = nsdf.sort_values(by='totalscore', ascending=False, kind="heapsort")
    #Let`s see what was coputed...
    display(HTML(nsdf.to_html()))
    result=[(doc, nsdf.loc[doc,"totalscore"]) for doc in result]
    k=len(result)
    if k>5:k=5
    return nlargest(k, result, key=lambda e:e[1])
#As a result we return a list of tuples which include no more than 5 best matches computed with new score
#Sorting is implented with heap data structure

In [17]:
def searchQuery(query, wordDict):
    result = []
    for word in query:
        if result != []:
            result = list(set(wordDict[word]) & set(result))
        else:
            result = wordDict[word]
    return result

In [18]:
def printResult(results):
    df = pd.DataFrame(index=range(len(results)), columns=["Title", "Description", "City", "Link"])
    for i,r in enumerate(results):
        dfd = pd.read_csv('Docs/'+r, sep='\t', usecols=[8, 5, 3, 9], header=None, names=["City", "Description", "Title", "Link"])
        df.loc[i] = dfd.loc[0]
    display(HTML(df.to_html()))

In [19]:
def SearchEngine(query):
    wordDict = loadWordDict()
    query = tokme(query)
    results = searchQuery(query, wordDict)
    newresult = newScore(results, query)
    printResult(results)

In [20]:
query = "one bedroom beautiful spacious Houston"
try:
    SearchEngine(query)
except:
    print('Sorry...Cannot find apartments for your query.')

Unnamed: 0,price,bedroom,city,date,title,totalscore
doc_15405.tsv,0.107547,0.25,0.1,0.2,0.044444,0.701992
doc_16307.tsv,0.107547,0.25,0.1,0.2,0.044444,0.701992
doc_11861.tsv,0.084906,0.25,0.1,0.1,0.0,0.534906


Unnamed: 0,Title,Description,City,Link
0,Very Large Beautiful En-Suite Master Bedroom,"This listing is for the Master En-Suite, featu...",Houston,https://www.airbnb.com/rooms/14500988?location...
1,Very Large Beautiful En-Suite Master Bedroom,"This listing is for the Master En-Suite, featu...",Houston,https://www.airbnb.com/rooms/14500988?location...
2,Modern Uptown Apartment,This is a new apartment with spacious living r...,Houston,https://www.airbnb.com/rooms/19364154?location...


# Bonus Step: 

***Make a nice visualization!***

In [54]:
import folium
import geopy
import numpy as np
from geopy import distance
from geopy import Point
import math

In [55]:
#give coordinates as input and a maximum distance from the coordinates (radius)
lat = float(input())
long = float(input())
coordinates = [lat,long]
radius = float(input())

30
-95
2


In [56]:
#Generate a map, with a circle of the given radius, where the center is represented by the coordinates given in input.
m = folium.Map(
    location=[lat, long]
)

folium.Marker(
    [lat, long], 
    popup = 'Input Position' , 
    icon=folium.Icon(icon='icon', color='blue')
).add_to(m)

folium.Circle(
    location = [lat,long], 
    radius = radius*1000, 
    color = '#3186cc', 
    fill = True, 
    fill_color='#3186cc'
).add_to(m)


<folium.vector_layers.Circle at 0x1a2842ed30>

In [57]:
#Shows the houses that are inside the circle of the given radius. 
#We need to calculate the distance between our coordinates and all other positions, 
#and to add a mark only on the houses inside our radius.
Fcsv = pd.read_csv('Airbnb_Texas_Rentals.csv', sep = ",")
Fcsv = Fcsv[np.isfinite(Fcsv['latitude'])]
Fcsv = Fcsv[np.isfinite(Fcsv['longitude'])]
for index, row in Fcsv.iterrows():
        coord = [row['latitude'], row['longitude']]
        dist = distance.distance(coord, coordinates).kilometers
        if dist <= radius:
            folium.Marker(coord, popup = f"{row['title']}", icon=folium.Icon(icon='icon', color='blue')).add_to(m)
m

    

In [58]:
m.save('index.html')