# Libraries

In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict
import re
import nltk

from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer

# Step 1: Data

In [2]:
airbnb_data=pd.read_csv("Airbnb_Texas_Rentals.csv",usecols=['average_rate_per_night', 'bedrooms_count', 'city',
       'date_of_listing', 'description', 'latitude', 'longitude', 'title','url'],parse_dates=['date_of_listing'])

In [3]:
airbnb_data.columns

Index(['average_rate_per_night', 'bedrooms_count', 'city', 'date_of_listing',
       'description', 'latitude', 'longitude', 'title', 'url'],
      dtype='object')

# Step 2: Create documents

In [4]:
airbnb_data.head()

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$27,2,Humble,2016-05-01,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,$149,4,San Antonio,2010-11-01,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
2,$59,1,Houston,2017-01-01,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...
3,$60,1,Bryan,2016-02-01,Private bedroom in a cute little home situated...,30.637304,-96.337846,Private Room Close to Campus,https://www.airbnb.com/rooms/11839729?location...
4,$75,2,Fort Worth,2017-02-01,Welcome to our original 1920's home. We recent...,32.747097,-97.286434,The Porch,https://www.airbnb.com/rooms/17325114?location...


In [5]:
airbnb_data.shape

(18259, 9)

# Clean data

In [6]:
# Check null values of the dataset
airbnb_data.isnull().sum()
#average_rate_per_night -> replace NAN with 0, convert to int
#bedrooms_count -> There are only 3 records so we decided to replace NAN with a category based on the desciption if it's possible. 
#description, latitude, longitude, title -> replace NAN to 'Unknown'

average_rate_per_night    28
bedrooms_count             3
city                       0
date_of_listing            0
description                2
latitude                  34
longitude                 34
title                      3
url                        0
dtype: int64

In [7]:
airbnb_data.dtypes

average_rate_per_night            object
bedrooms_count                    object
city                              object
date_of_listing           datetime64[ns]
description                       object
latitude                         float64
longitude                        float64
title                             object
url                               object
dtype: object

In [8]:
def clean(airbnb_data):
    """
    Method that removes nan values and imputes them
    
    Input: dataframe
    Output: cleaned dataframe
    
    """
    #replace NAN with 0
    airbnb_data.average_rate_per_night.replace(np.nan, '$0',inplace=True)
    #convert to int and remove $
    airbnb_data.average_rate_per_night=airbnb_data.average_rate_per_night.replace('[\$]', '', regex=True).astype(int)

    #replace NAN with'unknown'

    airbnb_data.description.replace(np.nan,'unknown',inplace=True)
    airbnb_data.title.replace(np.nan,'unknown',inplace=True)

    airbnb_data.latitude.replace(np.nan,'unknown',inplace=True)
    airbnb_data.longitude.replace(np.nan,'unknown',inplace=True)

    #check where bedrooms_count doesn't have a value and save indexes of those records to a list
    null_value_idx=airbnb_data[airbnb_data.bedrooms_count.isnull()].index
    #if the word studio is mentioned in description then it is a studio otherwise 'unknown'
    for idx in null_value_idx:
        if 'studio' in airbnb_data.iloc[idx].description.split():
            airbnb_data.bedrooms_count[idx]='Studio'
        else:
            airbnb_data.bedrooms_count[idx]='unknown'
        
    return airbnb_data

In [9]:
airbnb_data=clean(airbnb_data)
airbnb_data.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


average_rate_per_night    0
bedrooms_count            0
city                      0
date_of_listing           0
description               0
latitude                  0
longitude                 0
title                     0
url                       0
dtype: int64

In [10]:
airbnb_data.shape

(18259, 9)

In [11]:
def create_tsv_documents(airbnb_data):
    """
    Method that creates different .tsv files for each record in the airbnb_data 
    
    Input: dataframe
    """   
    #clean data
    airbnb_data=clean(airbnb_data)
    
    #for each index make a dataframe of airbnb_data and store it into new tsv file
    for i in airbnb_data.index:
        pd.DataFrame(airbnb_data.loc[i]).transpose().to_csv('data/doc_'+str(i)+'.tsv',sep='\t')

#method is run only once at the beginning to make separate .tsv files
create_tsv_documents(airbnb_data)

# Preprocessing

1) Removing stop words

2) Removing punctuation

3) Stemming

##### remove non english words and words Giulia chooses (room, price, airbnb) MOST often ones_?
##### should we remove numbers__?

# 3.1) Conjunctive query

## 3.1.1) Create your index!

In [12]:
#Building a vocabulary

#set for vocabulary (values of the set will be the keys fo vocabulary_dict)
vocabulary_set=set()
#building a dictionary which will be used for making an inverted index
vocabulary_dict=defaultdict(list)

In [13]:
def preprocessing_text(df):
    #remove upper cases
    df=df.lower()
    #replacing new line sign '\n' with a whitespace ' '    
    df=df.replace('\\n',' ')

    #removing stop words and punctuation
    stop_words = set(stopwords.words('english')) 

    #for removing punctuations
    tokenizer = RegexpTokenizer(r'\w+')
    
    #to tokenize the string
    word_tokens = tokenizer.tokenize(df) 

    #stemming
    ps = PorterStemmer()
    filtered_words = [ps.stem(w) for w in word_tokens if not w in stop_words] 

    #remove non-english words
    
    return filtered_words

In [14]:
#Building a vocabulary

#set for vocabulary (values of the set will be the keys fo vocabulary_dict)
vocabulary_lst=[]
#building a dictionary which will be used for making an inverted index
doc_vocabs=defaultdict(list)

doc_vocabs=defaultdict(list)

for i in airbnb_data.index:
    #take one file
    df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title'],encoding='ISO-8859-1')
    #preprocessing 
    df=df.description[0]+' '+df.title[0]
    filtered_words=preprocessing_text(df)
    temp_vocabulary_set=set()
    for word in filtered_words:
        temp_vocabulary_set.add(word)
    vocabulary_lst.append(temp_vocabulary_set)
    doc_vocabs[i]=list(temp_vocabulary_set)
vocabulary_set=set.union(*vocabulary_lst)

In [15]:
len(vocabulary_set)

11717

In [16]:
#mapping words into integers
vocabulary={}
for k,v in enumerate(vocabulary_set):
    #just for testing
    #vocabulary[v]='id'+str(k)
    vocabulary[v]= k

In [17]:
len(vocabulary)

11717

In [18]:
def save_vocabulary(vocabulary): 
    """
    method that converts vocabulary into a dataframe and saves it into a csv file
    
    input: vocabulary(dictionary, key='term',value='term_id')
    """
    vocabulary_dataframe=pd.DataFrame()
    vocabulary_dataframe['word']=vocabulary.keys()
    vocabulary_dataframe.to_csv('vocabulary.csv')
    del vocabulary_dataframe

In [19]:
save_vocabulary(vocabulary)

# Compute an inverted index

In [20]:
def compute_inverted_idx(doc_vocabs,vocabulary):
    """
    method that computes an inverted index
    
    input: doc_vocabs(dictionary), vocabulary(dictionary of all unique words, key=term, value=term_id)
    output: inverted_idx(dictionary, key=term_id, value=list of document_ids) 
    """
    #initialize defaultdict for making an inverted index
    inverted_idx = defaultdict(list)
    #in every document look for every word and assign document id to the words which belong to it
    for idx in doc_vocabs.keys():
        for word in doc_vocabs[idx]:
            inverted_idx[vocabulary[word]].append(idx)
    return inverted_idx

In [21]:
#compute an inverted index
inverted_idx=compute_inverted_idx(doc_vocabs,vocabulary)

## problematico -- saving a file

In [23]:
#Hint: Since you do not want to compute the inverted 
#index every time you use the Search Engine, 
#it is worth to think to store it in a separate file and load it in memory when needed.

# Save a dictionary into a pickle file.
import pickle

pickle.dump(inverted_idx, open("save.p", "wb"))  # save it into a file named save.p

# Load the dictionary back from the pickle file.

inverted_index = pickle.load(open("save.p", "rb"))

In [32]:
#for example number of documents containing word whose id is 11010
len(inverted_index[11010])

2478

In [37]:
for k in vocabulary.keys():
    if vocabulary[k]==11010:
        print(k)

guest


In [38]:
# we can clean our vocab more for 2nd part

# 3.1.2) Execute the query

In [50]:
def finalize_output(result_set):
    df=pd.DataFrame()
    for i,val in enumerate(result_set):
        pd.set_option('display.max_colwidth', -1)
        df=df.append(pd.read_csv('data/doc_'+str(val)+'.tsv',sep='\t',usecols=['description','title','city','url']
                                 ,encoding='ISO-8859-1',index_col=False))
        df.reset_index().drop('index',axis=1,inplace=True)
    return df

In [51]:
def search_engine():
    user_query=str(input())
    #input()

    user_query=preprocessing_text(user_query)

    list_term_idx=[]
    result_set=[]
    for word in user_query:
        #if word exist in the vocabulary
        if word in vocabulary.keys():
            list_term_idx.append(set(inverted_idx[vocabulary[word]]))
        else:
            list_term_idx.append({'x'})
            break
    result_set=list(set.intersection(*list_term_idx))
    if 'x' in result_set or not result_set:
        result_set='No results! Try again!'
        return result_set
        
    print(result_set)
    result_set=finalize_output(result_set)
    return result_set

In [52]:
search_engine()

room sun beach
[17153, 13025, 3367, 16007, 17832, 17865, 4363, 872, 5805, 239, 18041, 18163, 17652, 13205, 17271, 12024, 14713, 378, 13275, 9087]


Unnamed: 0,city,description,title,url
0,Corpus Christi,"3 bedroom beach condo not but a block from Gulf of Mexico. Spacious rooms, TVs, washer and dryer and 2 car garage. Fully furnished and equipped kitchen. 2 outside patios, one on upper floor with view of water. Hardwood floors which are easy maintenance with trips to and from beach. Bike and chairs in garage for usage. Information manual in entrance hallway counter for directions to all points of interest.",Wilson Beach House. Fun in the SUN,https://www.airbnb.com/rooms/17573576?location=Aransas%20Pass%2C%20TX
0,Aransas Pass,"Vacation rental house is in a quiet central location, perfect for winter Texans. Minutes away from Conn Brown Harbor, Rockport and Port Aransas Beaches. 1700 + sq ft, 3 Bdrms plus Sun Room, 2 Baths, and Sleeps 10. Relax and Enjoy the View from the Screened-in-Porch. Booking by week only June thru Oct , Nov. thru March booking by month only - competitively priced.",Aransas Pass Get-a-Way,https://www.airbnb.com/rooms/16916649?location=Aransas%20Pass%2C%20TX
0,Corpus Christi,"Newly remodeled in March of 2014, our beautiful luxury condo can sleep up to 6 persons (4 adults Max). It has 1 private bedroom with a New Queen bed, 1 bath,two bunkbeds located in the hallway (suitable for children ) and a plush leather sofa sleleper. It is designed for maximum comfort. New stainless steel full size refrigerator and stove, a microwave, New towels, rugs and includes memory foam toppers on all beds including the Sofa Sleeper.\n\nThis is a Christian retreat that offers all of the luxuries that you would wish for while on vacation. All bedding is plush, new, and makes for excellent sleeping after a full day in the sun! A kitchen with a full size stainless steel refrigerator, stove with oven and includes all of the cooking tools you require for a home cooked meal while on the road. The living room has plush large comfortable furniture for relaxing. Your Condo has two wall mounted Flat Screened TV's, one smart TV in the bedroom and one 40 in the living room. Last, but certainly not least is an excellent balcony with 2 bar height Chairs, 2 chairs and a table, so you may sit and relax while overlooking the ocean and beach. The Condo community has two full sized swimming pools, and one kiddie pool. One of the pools is heated during the cooler winter months. There are 3 covered BBQ areas, and plenty of parking. Corpus Christi boasts of World Class Kite Surfing, World Class Fishing, and excellent shopping, bars, and restaurants. Villa del Sol Condominiums are located on the North Beach and with a short walk on the beach you will find quality restaurants, bars, and shopping. You and your travel companions will marvel at the beauty and splendor of the Texas Gulf Coast. Come relax on the Beach, enjoy the Sun, and Ocean, but enjoy your stay in a true HOME away from home in this Tropical Blue Getaway located right on the beach!",Beach Front Condo-Corpus Christi,https://www.airbnb.com/rooms/2857736?location=Corpus%20Christi%2C%20TX
0,Corpus Christi,"Soak up the sun at this spacious and bright 2-bedroom, 2-bathroom Corpus Christi vacation rental condo, just minutes from the beach. With room to comfortably sleep 4, this fully equipped rental features gorgeous balcony views in addition to all of the comforts of home. Enjoy some fishing, take a stroll on the boardwalk, or enjoy a meal at one of the many nearby restaurants. No matter your vacation plans, this condo will help you enjoy Corpus Christi to the fullest!",NEW! 2BR Corpus Christi Condo - Minutes from Beach!,https://www.airbnb.com/rooms/19192987?location=Baffin%20Bay%2C%20TX
0,Corpus Christi,"3 bedroom beach condo not but a block from Gulf of Mexico. Spacious rooms, TVs, washer and dryer and 2 car garage. Fully furnished and equipped kitchen. 2 outside patios, one on upper floor with view of water. Hardwood floors which are easy maintenance with trips to and from beach. Bike and chairs in garage for usage. Information manual in entrance hallway counter for directions to all points of interest.",Wilson Beach House. Fun in the SUN,https://www.airbnb.com/rooms/17573576?location=Baffin%20Bay%2C%20TX
0,Corpus Christi,"Stay with us in this 3 bedroom, 2.5 bath townhouse with your very own personal hot tub oasis on the back patio on North Padre Island. Luxury leather sofa, Lounge on the upper deck in the oversized hammock and electric blinds. Full size kitchen, laundry room, plenty of flat screen TV's throughout. The beach is literally a walk away and you can hear the water call your name. Community pool for those who would rather sun bath all day. Schliterrbahn, Packery Channel for fishing and tons of shops.",Oasis with Hot Tub North Padre 3 bdrm & Loft,https://www.airbnb.com/rooms/16799245?location=Aransas%20Pass%2C%20TX
0,Corpus Christi,"My cozy,well kept, comfortable beach Condo # 1316 can sleep up to 6 persons (4 adults Max). It has1 bedroom, 1 bath, along with two bunkbeds (suitable for children ) and a comfortable sofa sleleper. It is designed for maximum comfort! \n\nThis is an awesome place to retreat and it offers all of the luxuries that you would wish for while on vacation. Newly updated in March 2016, it has new bath towels, rugs, memory foam toppers on beds makes for excellent sleeping after a full day in the sun! Fully stocked kitchen with a full size refrigerator and a stove with an oven. A kitchen that includes all of the cooking tools you require for a home cooked meal while on the road. The living room has comfortable furniture for relaxing. Your Condo has two wall mounted Flat Screened TV's, one in the bedroom and one in the living room. Last, but certainly not least is an excellent balcony with Stunning views!! It has bar height Chairs so you may sit and relax while overlooking the ocean and beach. The Condo community has two full sized swimming pools, and one kiddie pool. One of the pools is heated during the cooler winter months. There are 3 covered BBQ areas, and plenty of parking. \n\nCorpus Christi boasts of World Class Kite Surfing, World Class Fishing, and excellent shopping, bars, and restaurants. Villa del Sol Condominiums are located on the North Beach and with a short walk on the beach you will find quality restaurants, bars, and shopping. You and your travel companions will marvel at the beauty and splendor of the Texas Gulf Coast. Come relax on the Beach, enjoy the Sun, and Ocean, but enjoy your stay in a true HOME away from home in this cozy, charming condo located right on the beach!",Cozy Corpus Beach Condo #1316,https://www.airbnb.com/rooms/13029704?location=Corpus%20Christi%2C%20TX
0,Corpus Christi,"Classy beach front pad. Two balcony views of Gulf of Mexico. Observation nest on third patio with amazing panoramic view of entire beach front. This lovely town home has a private patio with hot tub, grill and room for entertainment. Nestled right across the street from SeaWall. You will fall in love waking up to waves, sand and sun.",E Lee Paradise Pad,https://www.airbnb.com/rooms/19431862?location=Aransas%20Pass%2C%20TX
0,Whitney,"Waterview Cabin has a large fenced yard lot with lots of room for kids and pets, a fire pit, a deck with a small lake view perfect for watching the sun set, and a cozy, comfortable interior. Boat ramps and beaches are just a few minutes drive away!",Family (& Pet) Friendly Lake House,https://www.airbnb.com/rooms/10648577?location=Clifton%2C%20TX
0,Corpus Christi,"My cozy,well kept, comfortable beach Condo # 1316 can sleep up to 6 persons (4 adults Max). It has1 bedroom, 1 bath, along with two bunkbeds (suitable for children ) and a comfortable sofa sleleper. It is designed for maximum comfort! \n\nThis is an awesome place to retreat and it offers all of the luxuries that you would wish for while on vacation. Newly updated in March 2016, it has new bath towels, rugs, memory foam toppers on beds makes for excellent sleeping after a full day in the sun! Fully stocked kitchen with a full size refrigerator and a stove with an oven. A kitchen that includes all of the cooking tools you require for a home cooked meal while on the road. The living room has comfortable furniture for relaxing. Your Condo has two wall mounted Flat Screened TV's, one in the bedroom and one in the living room. Last, but certainly not least is an excellent balcony with Stunning views!! It has bar height Chairs so you may sit and relax while overlooking the ocean and beach. The Condo community has two full sized swimming pools, and one kiddie pool. One of the pools is heated during the cooler winter months. There are 3 covered BBQ areas, and plenty of parking. \n\nCorpus Christi boasts of World Class Kite Surfing, World Class Fishing, and excellent shopping, bars, and restaurants. Villa del Sol Condominiums are located on the North Beach and with a short walk on the beach you will find quality restaurants, bars, and shopping. You and your travel companions will marvel at the beauty and splendor of the Texas Gulf Coast. Come relax on the Beach, enjoy the Sun, and Ocean, but enjoy your stay in a true HOME away from home in this cozy, charming condo located right on the beach!",Cozy Corpus Beach Condo #1316,https://www.airbnb.com/rooms/13029704?location=Aransas%20Pass%2C%20TX
