# Libraries

In [None]:
import pandas as pd
import numpy as np
import functions

from collections import defaultdict

import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
#from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
import enchant 

import pickle

# Step 1: Data

In [None]:
# Load only the listing columns used in this notebook and parse the
# listing date into a datetime column.
airbnb_data = pd.read_csv(
    "Airbnb_Texas_Rentals.csv",
    usecols=[
        'average_rate_per_night',
        'bedrooms_count',
        'city',
        'date_of_listing',
        'description',
        'latitude',
        'longitude',
        'title',
        'url',
    ],
    parse_dates=['date_of_listing'],
)

In [None]:
# Show the names of the columns that were loaded.
airbnb_data.columns

# Step 2: Create documents

In [None]:
# Preview the first two rows of the dataset.
airbnb_data.head(2)

In [None]:
# (number of rows, number of columns) before cleaning.
airbnb_data.shape

# Clean data

In [None]:
# Count the missing (NaN) values in each column of the dataset.
airbnb_data.isnull().sum()
# Planned handling of the missing values:
# average_rate_per_night -> replace NaN with 0, convert to int
# bedrooms_count -> only 3 records are affected, so replace NaN with a
#   category derived from the description where possible
# description, latitude, longitude, title -> replace NaN with 'Unknown'

In [None]:
# Clean the dataset with the project helper functions.clean (defined outside
# this notebook; presumably it applies the NaN handling planned above —
# TODO confirm), then re-count the missing values per column to verify.
airbnb_data=functions.clean(airbnb_data)
airbnb_data.isnull().sum()

In [None]:
# (number of rows, number of columns) after cleaning.
airbnb_data.shape

# Create the separate .tsv document files from the dataset. This is meant to
# be run only once, at the beginning.
# NOTE(review): nothing in this cell prevents the call from running again
# every time the cell is re-executed — confirm that repeating it is harmless.
functions.create_tsv_documents(airbnb_data)

# Preprocessing

1) Removing stop words

2) Removing punctuation

3) Stemming

##### TODO: remove non-English words and the words Giulia chooses (room, price, airbnb) — the most frequent ones?
##### TODO: should we remove numbers?

# 3.1) Conjunctive query

## 3.1.1) Create your index!

In [None]:
def build_vocabulary(airbnb_data):
    """Build the word -> integer-id vocabulary from the per-listing .tsv files.

    For every index label ``i`` of ``airbnb_data`` the file
    ``data/doc_<i>.tsv`` is read, the description and title of its first row
    are joined with a space and passed through ``preprocessing_text``; every
    word that comes back is added to the vocabulary.

    Parameters
    ----------
    airbnb_data : pandas.DataFrame
        Only its ``index`` is used, to locate the document files.

    Returns
    -------
    dict
        Maps each distinct preprocessed word to a unique integer id in
        ``range(len(vocabulary))``. Ids follow the sorted order of the words,
        so the mapping is identical on every run. An empty dict is returned
        when ``airbnb_data`` has no rows.

    Notes
    -----
    ``preprocessing_text`` is not defined in this block; it must be available
    in the enclosing namespace (presumably it comes from the project's
    ``functions`` module — TODO confirm).
    Per-document word lists are not produced here: they were previously
    collected into a local that was never returned, so that dead code was
    removed.
    """
    # Union of the words of all documents; this is the set of vocabulary keys.
    vocabulary_set = set()

    for i in airbnb_data.index:
        # Take one document file.
        doc = pd.read_csv('data/doc_' + str(i) + '.tsv', sep='\t',
                          usecols=['description', 'title'],
                          encoding='ISO-8859-1')
        # Preprocess description and title together as a single text.
        text = doc.description[0] + ' ' + doc.title[0]
        vocabulary_set.update(preprocessing_text(text))

    # Map words to integers. Sorting first makes the ids reproducible: plain
    # set iteration order for strings can change between interpreter runs.
    # Assumes the preprocessed words are mutually comparable (e.g. all str).
    return {word: word_id
            for word_id, word in enumerate(sorted(vocabulary_set))}

In [None]:
# Build the vocabulary.
# Note: this calls build_vocabulary from the functions module, not the
# build_vocabulary defined in the notebook cell above.
vocabulary=functions.build_vocabulary(airbnb_data)

In [None]:
# Number of distinct words in the vocabulary (11717 was noted on the
# original run). `vocabulary_set` only exists as a local inside
# build_vocabulary, so the size is taken from `vocabulary` itself.
len(vocabulary)

In [None]:
# Persist the vocabulary via the project helper ('vocabulary' is presumably
# the output file name — confirm against functions.save_vocabulary).
functions.save_vocabulary(vocabulary,'vocabulary')

# Compute an inverted index

In [None]:
# Compute the inverted index from the per-document vocabularies and the
# word -> id vocabulary.
# NOTE(review): `doc_vocabs` is not defined in any visible notebook cell; the
# build_vocabulary defined above builds a local with this name but does not
# return it — confirm where this variable is supposed to come from.
inverted_idx=functions.compute_inverted_idx(doc_vocabs,vocabulary)

In [None]:
# Save the inverted index, then load it back under the name `inverted_index`
# (the in-memory result computed above stays available as `inverted_idx`).
functions.save_inverted_idx(inverted_idx)
inverted_index=functions.load_inverted_idx()

In [None]:
# Example: number of documents containing the word whose id is 11010
# (the length of that word's entry in the reloaded inverted index).
len(inverted_index[11010])

In [None]:
# Reverse lookup: print every word that the vocabulary maps to id 11010.
for word, word_id in vocabulary.items():
    if word_id == 11010:
        print(word)

In [None]:
# TODO: the vocabulary could be cleaned further for the second part

# 3.1.2) Execute the query

In [None]:
# Run the conjunctive-query search engine from the functions module.
# Note: this passes the in-memory `inverted_idx`, not the `inverted_index`
# reloaded from disk in an earlier cell.
functions.search_engine(vocabulary,inverted_idx)

In [None]:
# For future use the saved vocabulary can simply be loaded from its CSV file
# instead of being rebuilt.
# NOTE(review): this rebinds `vocabulary` to the DataFrame returned by
# pd.read_csv; earlier cells use `vocabulary` as a word -> id mapping
# (vocabulary.keys(), vocabulary[k]), so re-running them after this cell will
# behave differently — confirm this is intended.
vocabulary=pd.read_csv('vocabulary.csv',encoding='ISO-8859-1')
vocabulary.head()