# Libraries

In [9]:
#%% LIBRARIES
import pandas as pd
import numpy as np

from collections import defaultdict
#import re
#import nltk
import functions
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
#from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer

from sklearn.metrics.pairwise import cosine_similarity
from heapq import heappush, nlargest

#import enchant 
import pickle

import importlib

# Re-import the local `functions` module so that edits made to it take effect
# without restarting the kernel.
importlib.reload(functions)

# Step 1: Data

In [10]:
# Load the Texas rentals dataset, keeping only the columns used in this
# notebook and parsing the listing date while reading.
airbnb_data = pd.read_csv(
    "Airbnb_Texas_Rentals.csv",
    usecols=[
        "average_rate_per_night",
        "bedrooms_count",
        "city",
        "date_of_listing",
        "description",
        "latitude",
        "longitude",
        "title",
        "url",
    ],
    parse_dates=["date_of_listing"],
)

In [11]:
# Confirm which columns were loaded.
airbnb_data.columns

Index(['average_rate_per_night', 'bedrooms_count', 'city', 'date_of_listing',
       'description', 'latitude', 'longitude', 'title', 'url'],
      dtype='object')

# Step 2: Create documents

In [12]:
# Preview the first two listings.
airbnb_data.head(2)

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$27,2,Humble,2016-05-01,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,$149,4,San Antonio,2010-11-01,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...


In [13]:
# Dataset size as (rows, columns) before cleaning.
airbnb_data.shape

(18259, 9)

# Clean data

In [14]:
# Count the missing values in each column of the dataset.
airbnb_data.isnull().sum()
# Planned handling of the missing values:
#   average_rate_per_night -> replace NaN with 0 and convert to int
#   bedrooms_count -> only 3 records are affected, so replace NaN with a category inferred from the description where possible
#   description, latitude, longitude, title -> replace NaN with 'Unknown'

average_rate_per_night    28
bedrooms_count             3
city                       0
date_of_listing            0
description                2
latitude                  34
longitude                 34
title                      3
url                        0
dtype: int64

In [15]:
# Run the project's cleaning routine, then re-count missing values per column
# to verify that none remain.
airbnb_data = functions.clean(airbnb_data)
airbnb_data.isnull().sum()

average_rate_per_night    0
bedrooms_count            0
city                      0
date_of_listing           0
description               0
latitude                  0
longitude                 0
title                     0
url                       0
dtype: int64

In [16]:
# Shape after cleaning: the row count matches the raw dataset, so no records were dropped.
airbnb_data.shape

(18259, 9)

# One-off step: creates the separate .tsv document files.
# It only needs to be run once, at the beginning.
functions.create_tsv_documents(airbnb_data)

# Preprocessing

1) Removing stop words

2) Removing punctuation

3) Stemming

##### TODO: remove non-English words and the most frequent words chosen by Giulia (e.g. room, price, airbnb)?
##### TODO: should we also remove numbers?

# 3.1) Conjunctive query

## 3.1.1) Create your index!

# Build the vocabulary from the listings. The call also returns `doc_vocabs`,
# which is passed to the inverted-index construction further down.
vocabulary, doc_vocabs = functions.build_vocabulary(airbnb_data)

In [21]:
len(vocabulary) # number of vocabulary entries (11717 when this cell was last run)

11717

# Persist the vocabulary so later sessions can load it instead of rebuilding;
# this only needs to be called once.
functions.save_vocabulary(vocabulary, 'vocabulary')

# Compute an inverted index

# Build the inverted index from `doc_vocabs` and the vocabulary.
inverted_idx = functions.compute_inverted_idx(doc_vocabs, vocabulary)

In [23]:
#functions.save_inverted_idx(inverted_idx)
#inverted_index=functions.load_inverted_idx()

In [24]:
# Load the cached vocabulary and inverted index from disk.
# The files are opened with context managers so the handles are closed as soon
# as unpickling finishes (the bare `pickle.load(open(...))` form leaks them).
# NOTE: "inverted_idx.p" must already exist on disk -- the save call in the
# previous cell is commented out, so a fresh run does not create it.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that were produced by this project.
with open("vocabulary.p", "rb") as vocabulary_file:
    vocabulary = pickle.load(vocabulary_file)
with open("inverted_idx.p", "rb") as inverted_idx_file:
    inverted_idx = pickle.load(inverted_idx_file)

In [25]:
# TODO: the vocabulary could be cleaned further for the second part (ranked search)

# 3.1.2) Execute the query

In [27]:
# Run the conjunctive search; the query used for the output below was:
#   sun bedroom private HDTV
functions.search_engine(vocabulary, inverted_idx)

sun bedroom private HDTV
[9058, 8140, 9988, 15438]


Unnamed: 0,city,description,title,url
0,Dallas,"Spacious Bedroom w Queen-size bed, sun room, and private bathroom in a century-old house near the center of Dallas. 4 miles from Convention Center and beyond via free Dlink, or buses &amp; light rail. 3 Blocks from Bishop Arts District. HDTV/Internet",Vintage House Near Center of Dallas,https://www.airbnb.com/rooms/1488809?location=Cedar%20Hill%2C%20TX
0,Dallas,"Spacious Bedroom w Queen-size bed, sun room, and private bathroom in a century-old house near the center of Dallas. 4 miles from Convention Center and beyond via free Dlink, or buses &amp; light rail. 3 Blocks from Bishop Arts District. HDTV/Internet",Vintage House Near Center of Dallas,https://www.airbnb.com/rooms/1488809?location=Carrollton%2C%20TX
0,Dallas,"Spacious Bedroom w Queen-size bed, sun room, and private bathroom in a century-old house near the center of Dallas. 4 miles from Convention Center and beyond via free Dlink, or buses &amp; light rail. 3 Blocks from Bishop Arts District. HDTV/Internet",Vintage House Near Center of Dallas,https://www.airbnb.com/rooms/1488809?location=Brazos%20River%2C%20TX
0,Dallas,"Spacious Bedroom w Queen-size bed, sun room, and private bathroom in a century-old house near the center of Dallas. 4 miles from Convention Center and beyond via free Dlink, or buses &amp; light rail. 3 Blocks from Bishop Arts District. HDTV/Internet",Vintage House Near Center of Dallas,https://www.airbnb.com/rooms/1488809?location=Arlington%2C%20TX


# 3.2) Conjunctive query & Ranking score 

## 3.2.1) Inverted index 

### Calculation of tf-idf values

In [None]:
# One-off build of the tf-idf weighted inverted index. Presumably left commented
# out because the result is cached in inverted_idx2.p, which is loaded in the
# next section.
# NOTE(review): these helpers are called without the `functions.` prefix used
# elsewhere in this notebook; confirm where they are defined before uncommenting.
#tf_idf_dic=calculate_tf_idf(airbnb_data,inverted_idx,vocabulary)

#inverted_idx2=compute_inverted_idx2(inverted_idx,vocabulary,tf_idf_dic)

#inverted_idx2[0]

#pickle.dump(inverted_idx2, open("inverted_idx2.p", "wb"))  # save it to a file so that later runs can simply load it

## 3.2.2) Execute the query

In [69]:
# Load the cached tf-idf inverted index. A context manager is used so the file
# handle is closed once unpickling finishes (the bare `pickle.load(open(...))`
# form leaks it).
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that were produced by this project.
with open("inverted_idx2.p", "rb") as inverted_idx2_file:
    inverted_idx2 = pickle.load(inverted_idx2_file)

In [70]:
# Run the ranked search; the query used for the output below was:
#   sun private room
# The first argument is presumably the number of results to show (5 rows are
# displayed below) -- confirm against functions.search_engine2.
functions.search_engine2(5, vocabulary, inverted_idx, inverted_idx2)

sun private room
[17539, 9988, 16391, 17416, 2569, 269, 13205, 2847, 8997, 3367, 8872, 8747, 15918, 6705, 4405, 7102, 12867, 8140, 13773, 15438, 5454, 6875, 11362, 9058, 2151, 872, 1899, 13047, 18163, 17271, 12024]


Unnamed: 0,title,description,city,url,similarity
0,Spacious Private guest room w/sunroom - woman only,"3 minute drive to Mopac and 15 minutes to Downtown, close to Bike Trails and Lady Bird Johnson's Bird Park. My place is a 5 minute walk to boutique-style Escarpment Village Shopping with grocery, a Starbucks, higher end dining, or grab an ice cream or burger.\n\nYoull enjoy the coziness of the private master bedroom, private bath, and French door entry to the sun room, with view of beautiful trees in backyard. My place is great for couples, solo adventurers, or business travelers.",Austin,https://www.airbnb.com/rooms/18868758?location=Bee%20Cave%2C%20TX,0.97
0,Private Room in Warm Boutique Home,"Congratulations on finding the perfect place to stay. We have a Private room with a ceiling fan, full size closet, wireless access, access to swimming pool, washer and dryer, private patio for relaxing in the shade or laying out in the sun, close to Hwy (PHONE NUMBER HIDDEN), and the Dell Campus. austin is 15 minutes away and COTA is 30 minutes.",Round Rock,https://www.airbnb.com/rooms/7672843?location=Coupland%2C%20TX,0.97
0,"Cozy/Private Room, Minutes to Downtown!","Cozy private room, very close to the airport and 7 miles from downtown Austin. The room is simple but equipped with everything you would need for a comfortable stay. Double bed, two bed side tables, full closet to tuck away your belongings in. The full bath is shared with another air bnb listing. Bathroom has toiletries and hairdryer. Feel free to use our living room, dining area and kitchen. We also have a nice back patio great for soaking up sun or night drinking.",Austin,https://www.airbnb.com/rooms/17423956?location=Austin%2C%20TX,0.97
0,"Comfy, Quiet Private Room with Late Check-Out","Our place is very quiet and clean. Comfy queen sized bed and black-out curtain on the window so you can sleep in without with sun waking you.\n\nWe are off the beaten path, in a quiet neighborhood east of Austin. A short drive 15-30 minutes (depending on where you are headed and when) to a ton of great Austin sites and sounds. \n\nOur place is perfect if you are looking for a comfortable, clean, quiet private room.",Austin,https://www.airbnb.com/rooms/16827574?location=Bastrop%20County%2C%20TX,0.96
0,"Comfy, Quiet Private Room with Late Check-Out","Our place is very quiet and clean. Comfy queen sized bed and black-out curtain on the window so you can sleep in without with sun waking you.\n\nWe are off the beaten path, in a quiet neighborhood east of Austin. A short drive 15-30 minutes (depending on where you are headed and when) to a ton of great Austin sites and sounds. \n\nOur place is perfect if you are looking for a comfortable, clean, quiet private room.",Austin,https://www.airbnb.com/rooms/16827574?location=Cedar%20Creek%2C%20TX,0.96


In [75]:
# Count the rows that refer to listing 16827574. The comparison is made on the
# url with its '?location=...' query string stripped, so the result is the same
# whether or not the url column has already been normalised. The original
# equality test against the raw column only matched after the normalisation
# cell further down had been executed, i.e. it relied on out-of-order execution.
airbnb_data[
    airbnb_data["url"].str.split("?", n=1).str[0] == "https://www.airbnb.com/rooms/16827574"
].shape

(2, 9)

In [None]:
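# TODO: decide whether to strip the '?location=...' query string from the urls and drop the resulting duplicate listings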

In [38]:
# Try the query-string removal on a single url (row label 1): keep everything
# before the first '?'.
airbnb_data["url"][1].partition("?")[0]

'https://www.airbnb.com/rooms/17481455'

In [41]:
# Shape before any duplicate handling, for comparison with the de-duplicated shape.
airbnb_data.shape

(18259, 9)

In [71]:
# Normalise the url column by dropping the '?location=...' query string, so the
# same listing found through different search locations gets an identical url.
# The vectorised .str accessor replaces the per-element lambda and leaves
# missing values as NaN instead of raising; bracket assignment is used because
# attribute-style assignment only works for columns that already exist.
# Re-running this cell is harmless: an already-stripped url is left unchanged.
airbnb_data["url"] = airbnb_data["url"].str.split("?", n=1).str[0]

In [68]:
# Shape the data would have if rows sharing the same url were dropped.
# The result is only displayed; airbnb_data itself is not modified.
airbnb_data.drop_duplicates(subset=["url"]).shape

(11532, 9)

In [53]:
# Scratch series holding every url with its query string removed; the
# dataframe column itself is left untouched by this cell.
x = airbnb_data["url"].map(lambda raw_url: raw_url.split("?")[0])

In [54]:
# Show the url stored at row label 1.
# NOTE: the recorded output below still contains the '?location=...' query
# string because this cell (In [54]) was executed before the url column was
# normalised (In [71]); on a top-to-bottom run it shows the stripped url.
airbnb_data.url[1]

'https://www.airbnb.com/rooms/17481455?location=Cibolo%2C%20TX'