# Libraries

In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict
import re
import nltk

from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer

# Step 1: Data

In [2]:
airbnb_data=pd.read_csv("Airbnb_Texas_Rentals.csv",usecols=['average_rate_per_night', 'bedrooms_count', 'city',
       'date_of_listing', 'description', 'latitude', 'longitude', 'title','url'],parse_dates=['date_of_listing'])

In [3]:
airbnb_data.columns

Index(['average_rate_per_night', 'bedrooms_count', 'city', 'date_of_listing',
       'description', 'latitude', 'longitude', 'title', 'url'],
      dtype='object')

# Step 2: Create documents

In [166]:
airbnb_data.head(1)

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,27,2,Humble,2016-05-01,Welcome to stay in private room with queen bed...,30.0201,-95.294,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...


In [5]:
airbnb_data.shape

(18259, 9)

# Clean data

In [6]:
# Check null values of the dataset
airbnb_data.isnull().sum()
#average_rate_per_night -> replace NAN with 0, convert to int
#bedrooms_count -> There are only 3 records so we decided to replace NAN with a category based on the desciption if it's possible. 
#description, latitude, longitude, title -> replace NAN to 'Unknown'

average_rate_per_night    28
bedrooms_count             3
city                       0
date_of_listing            0
description                2
latitude                  34
longitude                 34
title                      3
url                        0
dtype: int64

In [7]:
airbnb_data.dtypes

average_rate_per_night            object
bedrooms_count                    object
city                              object
date_of_listing           datetime64[ns]
description                       object
latitude                         float64
longitude                        float64
title                             object
url                               object
dtype: object

In [8]:
def clean(airbnb_data):
    """
    Method that removes nan values and imputes them
    
    Input: dataframe
    Output: cleaned dataframe
    
    """
    #replace NAN with 0
    airbnb_data.average_rate_per_night.replace(np.nan, '$0',inplace=True)
    #convert to int and remove $
    airbnb_data.average_rate_per_night=airbnb_data.average_rate_per_night.replace('[\$]', '', regex=True).astype(int)

    #replace NAN with'unknown'

    airbnb_data.description.replace(np.nan,'unknown',inplace=True)
    airbnb_data.title.replace(np.nan,'unknown',inplace=True)

    airbnb_data.latitude.replace(np.nan,'unknown',inplace=True)
    airbnb_data.longitude.replace(np.nan,'unknown',inplace=True)

    #check where bedrooms_count doesn't have a value and save indexes of those records to a list
    null_value_idx=airbnb_data[airbnb_data.bedrooms_count.isnull()].index
    #if the word studio is mentioned in description then it is a studio otherwise 'unknown'
    for idx in null_value_idx:
        if 'studio' in airbnb_data.iloc[idx].description.split():
            airbnb_data.bedrooms_count[idx]='Studio'
        else:
            airbnb_data.bedrooms_count[idx]='unknown'
        
    return airbnb_data

In [9]:
airbnb_data=clean(airbnb_data)
airbnb_data.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


average_rate_per_night    0
bedrooms_count            0
city                      0
date_of_listing           0
description               0
latitude                  0
longitude                 0
title                     0
url                       0
dtype: int64

In [10]:
airbnb_data.shape

(18259, 9)

In [11]:
def create_tsv_documents(airbnb_data):
    """
    Method that creates different .tsv files for each record in the airbnb_data 
    
    Input: dataframe
    """   
    #clean data
    airbnb_data=clean(airbnb_data)
    
    #for each index make a dataframe of airbnb_data and store it into new tsv file
    for i in airbnb_data.index:
        pd.DataFrame(airbnb_data.loc[i]).transpose().to_csv('data/doc_'+str(i)+'.tsv',sep='\t')

#method is run only once at the beginning to make separate .tsv files
create_tsv_documents(airbnb_data)

# Preprocessing

1) Removing stop words

2) Removing punctuation

3) Stemming

##### remove non english words and words Giulia chooses (room, price, airbnb) MOST often ones_?
##### should we remove numbers__?

# 3.1) Conjunctive query

## 3.1.1) Create your index!

In [12]:
#Building a vocabulary

#set for vocabulary (values of the set will be the keys fo vocabulary_dict)
vocabulary_set=set()
#building a dictionary which will be used for making an inverted index
vocabulary_dict=defaultdict(list)

In [13]:
def preprocessing_text(df):
    #remove upper cases
    df=df.lower()
    #replacing new line sign '\n' with a whitespace ' '    
    df=df.replace('\\n',' ')

    #removing stop words and punctuation
    stop_words = set(stopwords.words('english')) 

    #for removing punctuations
    tokenizer = RegexpTokenizer(r'\w+')
    
    #to tokenize the string
    word_tokens = tokenizer.tokenize(df) 

    #stemming
    ps = PorterStemmer()
    filtered_words = [ps.stem(w) for w in word_tokens if not w in stop_words] 

    #remove non-english words
    
    return filtered_words

In [167]:
#Building a vocabulary

#set for vocabulary (values of the set will be the keys fo vocabulary_dict)
vocabulary_set=set()
#building a dictionary which will be used for making an inverted index
doc_vocabs=defaultdict(list)

for i in range((airbnb_data.shape[0])//8900):
    #take one file
    df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title'],encoding='ISO-8859-1')
    #preprocessing 
    df=df.description[0]+' '+df.title[0]
    filtered_words=preprocessing_text(df)
    for word in filtered_words:
        vocabulary_set.add(word)
    doc_vocabs[i]=list(vocabulary_set)

In [168]:
len(vocabulary_set)

72

In [169]:
vocabulary={}
for k,v in enumerate(vocabulary_set):
    vocabulary[v]='id'+str(k)

In [170]:
len(vocabulary)

72

In [173]:
vocabulary_dataframe=pd.DataFrame()

In [174]:
vocabulary_dataframe['word']=vocabulary.keys()

In [175]:
vocabulary_dataframe.to_csv('vocabulary.csv')

# Compute an inverted index

In [176]:
def compute_inverted_idx(doc_vocabs,vocabulary):
    """
    method that computes an inverted index
    
    input: doc_vocabs(dictionary), vocabulary(dictionary of all unique words, key=term, value=term_id)
    output: inverted_idx(dictionary, key=term_id, value=list of document_ids) 
    """
    #initialize defaultdict for making an inverted index
    inverted_idx = defaultdict(list)
    #in every document look for every word and assign document id to the words which belong to it
    for idx in doc_vocabs.keys():
        for word in doc_vocabs[idx]:
            inverted_idx[vocabulary[word]].append(idx)
    return inverted_idx

## problematico -- saving a file

In [177]:
#Hint: Since you do not want to compute the inverted 
#index every time you use the Search Engine, 
#it is worth to think to store it in a separate file and load it in memory when needed.

# Save a dictionary into a pickle file.
import pickle

pickle.dump(inverted_idx, open("save.p", "wb"))  # save it into a file named save.p

# Load the dictionary back from the pickle file.

inverted_index = pickle.load(open("save.p", "rb"))

# FOR saving the vocabulary into a csv file

# 3.1.2) Execute the query

In [240]:
def search_engine():
    user_query=str(input())
    #input()

    user_query=preprocessing_text(user_query)

    list_term_idx=[]
    result_set=[]
    for word in user_query:
        if word in vocabulary.keys():
            list_term_idx.append(set(inverted_idx[vocabulary[word]]))
        else:
            list_term_idx.append({'x'})
            break
    result_set=list(set.intersection(*list_term_idx))
    if 'x' in result_set or not result_set:
        result_set='No results! Try again!'
    print(result_set)
    result_set=finalize_output(result_set)
    return result_set

In [243]:
search_engine()

room


[0, 1]

In [218]:
user_query='room'
#input()

user_query=preprocessing_text(user_query)

list_term_idx=[]
result_set=[]
for word in user_query:
    if word in vocabulary.keys():
        list_term_idx.append(set(inverted_idx[vocabulary[word]]))
    else:
        list_term_idx.append('x')
        break
result_set=list(set.intersection(*list_term_idx))
if not (result_set):
    result_set='No results! Try again!'
print(result_set)

[0, 1]


In [247]:
df=pd.DataFrame()
for i,val in enumerate(result_set):
    #take one file
    df=df.append(pd.read_csv('data/doc_'+str(val)+'.tsv',sep='\t',usecols=['description','title','city','url'],encoding='ISO-8859-1'))

In [254]:
from tabulate import tabulate

df = pd.DataFrame(np.random.random((4,3)), columns=['A','B','C'])

print(tabulate(df, headers="keys", tablefmt="orgtbl"))

foo
|    |        A |        B |        C |
|----+----------+----------+----------|
|  0 | 0.703786 | 0.563027 | 0.578585 |
|  1 | 0.855807 | 0.795622 | 0.131662 |
|  2 | 0.543194 | 0.163419 | 0.313004 |
|  3 | 0.138488 | 0.836076 | 0.78933  |


In [249]:
df.set_index

Unnamed: 0,index,city,description,title,url
0,0,Humble,Welcome to stay in private room with queen bed...,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,0,San Antonio,"Stylish, fully remodeled home in upscale NW  ...",Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...


In [189]:
vocabulary[word]

'id30'

In [187]:
result_set

{'d', 'i'}

# Testing method on the whole dataframe

In [None]:
vocabulary_set=set()

filtered_words=airbnb_data.description.apply(preprocessing_text)+airbnb_data.title.apply(preprocessing_text)

In [None]:
def make_voc(x):
    x=set(x)
    for word in x:
        vocabulary_set.add(word)

In [None]:
#we should have 11935 words in our voc_set for all files
len(vocabulary_set)

In [None]:
vocabulary_set