# Libraries

In [1]:
#%% LIBRARIES
import pandas as pd
import numpy as np

from collections import defaultdict
#import re
#import nltk
import functions
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
#from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer

from sklearn.metrics.pairwise import cosine_similarity
from heapq import heappush, nlargest

#import enchant 
import pickle

In [None]:
# Re-import the local `functions` module so that edits made to it are picked up
# without restarting the kernel.
import importlib
importlib.reload(functions)

# Step 1: Data

In [3]:
# Load the nine columns of interest from the rentals CSV and parse the listing
# date into a datetime column.
airbnb_data = pd.read_csv(
    "Airbnb_Texas_Rentals.csv",
    usecols=[
        "average_rate_per_night",
        "bedrooms_count",
        "city",
        "date_of_listing",
        "description",
        "latitude",
        "longitude",
        "title",
        "url",
    ],
    parse_dates=["date_of_listing"],
)

In [4]:
# List the loaded columns to confirm the selection made through `usecols`.
airbnb_data.columns

Index(['average_rate_per_night', 'bedrooms_count', 'city', 'date_of_listing',
       'description', 'latitude', 'longitude', 'title', 'url'],
      dtype='object')

# Step 2: Create documents

In [5]:
# Preview the first two listings.
airbnb_data.head(2)

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$27,2,Humble,2016-05-01,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,$149,4,San Antonio,2010-11-01,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...


In [6]:
# (rows, columns) of the dataset as loaded, before any cleaning.
airbnb_data.shape

(18259, 9)

# Clean data

In [7]:
# Count the missing values in each column of the dataset.
airbnb_data.isnull().sum()
# Cleaning plan:
# average_rate_per_night -> replace NaN with 0, convert to int
# bedrooms_count -> only 3 records are missing, so replace NaN with a category based on the description where possible
# description, latitude, longitude, title -> replace NaN with 'Unknown'

average_rate_per_night    28
bedrooms_count             3
city                       0
date_of_listing            0
description                2
latitude                  34
longitude                 34
title                      3
url                        0
dtype: int64

In [8]:
# Apply the cleaning step from the local `functions` module, then re-check the
# per-column count of missing values.
# NOTE(review): this overwrites `airbnb_data`, so re-running the cell passes the
# already-cleaned frame back into clean() — confirm that clean() tolerates this.
airbnb_data=functions.clean(airbnb_data)
airbnb_data.isnull().sum()

average_rate_per_night    0
bedrooms_count            0
city                      0
date_of_listing           0
description               0
latitude                  0
longitude                 0
title                     0
url                       0
dtype: int64

In [9]:
# (rows, columns) after cleaning.
# NOTE(review): the saved outputs show the row count going from 18259 to 11532,
# so clean() presumably also removes rows — confirm in functions.clean.
airbnb_data.shape

(11532, 9)

# Run only once, at the beginning, to create the separate .tsv document files.
functions.create_tsv_documents(airbnb_data)

# Preprocessing

1) Removing stop words

2) Removing punctuation

3) Stemming

##### TODO: remove non-English words?
##### TODO: should we remove numbers?

# 3.1) Conjunctive query

## 3.1.1) Create your index!

# Build the vocabulary (and the per-document vocabularies) from the cleaned data.
# This is meant to be called only once; the result is cached in vocabulary.p.
vocabulary, doc_vocabs = functions.build_vocabulary(airbnb_data)
len(vocabulary)  # 11717
# Write through a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open("vocabulary.p", "wb") as vocabulary_file:
    pickle.dump(vocabulary, vocabulary_file)

# Compute an inverted index

# Compute the inverted index from the per-document vocabularies and the vocabulary.
inverted_idx = functions.compute_inverted_idx(doc_vocabs, vocabulary)
# Write through a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open("inverted_idx.p", "wb") as inverted_idx_file:
    pickle.dump(inverted_idx, inverted_idx_file)

In [10]:
# Load the cached vocabulary and inverted index.
# pickle can execute arbitrary code while loading, so only load files that were
# produced by this notebook — never a .p file from an untrusted source.
with open("vocabulary.p", "rb") as vocabulary_file:
    vocabulary = pickle.load(vocabulary_file)
with open("inverted_idx.p", "rb") as inverted_idx_file:
    inverted_idx = pickle.load(inverted_idx_file)

# 3.1.2) Execute the query

In [11]:
# Run the conjunctive search using the vocabulary and the inverted index.
# Example query: "sun bedroom private HDTV".
# NOTE(review): the saved output below appears to come from the query
# "sun fun town", not from the example above.
functions.search_engine(vocabulary,inverted_idx)

sun fun town
[9265, 9188]


Unnamed: 0,city,description,title,url
0,Bluffton,"Enjoy some fun in the sun right on Lake Buchanan! Come enjoy great bird-watching, fishing, the local winery, small-town antiquing, kayaking, or just relax in this perfect tranquil weekend getaway. Youll love staying in this serene waterfront location. Hop in a boat and paddle 15 minutes to the nearby island!\n\nThe Hancock Lake Cottage is an ideal spot for downtime and a fun-filled vacation in the hill country.","Hancock Lake Cottage ""Mermaid Ave"" on the Water!",https://www.airbnb.com/rooms/13509184
0,Kingsland,"Beautifully decorated home on Lake LBJ just 3-4 minutes from BIG water. Located on a no-wake cove near section of lake where the Colorado and Llano come together making it an ideal location to enjoy fishing, boating, skiing or simply relax and enjoy the lake. The town of Kingsland and nearby areas offer popular activities such as a wine tour, 18 holes of golf, horseback riding, ATV trails, visiting a state park, spelunking at Longhorn Caverns.",Fun In The Sun,https://www.airbnb.com/rooms/17264458


# 3.2) Conjunctive query & Ranking score 

## 3.2.1) Inverted index 

### Calculation of tf-idf values

In [12]:
# Compute the tf-idf values from the cleaned data, the inverted index and the vocabulary.
tf_idf_dic=functions.calculate_tf_idf(airbnb_data,inverted_idx,vocabulary)

In [13]:
# Build the second inverted index from the first index, the vocabulary and the
# tf-idf values, then inspect its entry at key/index 0.
# NOTE(review): the saved output suggests each entry is a list of
# (document id, tf-idf score) pairs — confirm in functions.compute_inverted_idx2.
inverted_idx2=functions.compute_inverted_idx2(inverted_idx,vocabulary,tf_idf_dic)
inverted_idx2[0]

[(6002, 0.13173071913744075)]

In [None]:
# Save inverted_idx2 to a file so that future sessions can simply load it.
# The following cell reads inverted_idx2.p, so this save must actually run for
# a fresh "Restart & Run All" to succeed.
with open("inverted_idx2.p", "wb") as inverted_idx2_file:
    pickle.dump(inverted_idx2, inverted_idx2_file)

## 3.2.2) Execute the query

In [None]:
# Load the cached tf-idf inverted index.
# pickle can execute arbitrary code while loading, so only load a file that was
# produced by this notebook — never a .p file from an untrusted source.
with open("inverted_idx2.p", "rb") as inverted_idx2_file:
    inverted_idx2 = pickle.load(inverted_idx2_file)

In [22]:
# Run the ranked conjunctive search using both inverted indexes.
# NOTE(review): the first argument (5) is presumably the number of top results
# to return — confirm in functions.search_engine2.
# Example query: "sun private room"; the saved output below appears to come
# from the query "sun fun town" instead.
functions.search_engine2(5,vocabulary,inverted_idx,inverted_idx2)

sun fun town
[9265, 9188]


Unnamed: 0,title,description,city,url,similarity
0,Fun In The Sun,"Beautifully decorated home on Lake LBJ just 3-4 minutes from BIG water. Located on a no-wake cove near section of lake where the Colorado and Llano come together making it an ideal location to enjoy fishing, boating, skiing or simply relax and enjoy the lake. The town of Kingsland and nearby areas offer popular activities such as a wine tour, 18 holes of golf, horseback riding, ATV trails, visiting a state park, spelunking at Longhorn Caverns.",Kingsland,https://www.airbnb.com/rooms/17264458,0.98
0,"Hancock Lake Cottage ""Mermaid Ave"" on the Water!","Enjoy some fun in the sun right on Lake Buchanan! Come enjoy great bird-watching, fishing, the local winery, small-town antiquing, kayaking, or just relax in this perfect tranquil weekend getaway. Youll love staying in this serene waterfront location. Hop in a boat and paddle 15 minutes to the nearby island!\n\nThe Hancock Lake Cottage is an ideal spot for downtime and a fun-filled vacation in the hill country.",Bluffton,https://www.airbnb.com/rooms/13509184,0.94


In [None]:
# TODO: consider the rating and whether the host is verified