# Libraries

In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict
import re
import nltk

from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer

# Step 1: Data

In [2]:
airbnb_data=pd.read_csv("Airbnb_Texas_Rentals.csv",usecols=['average_rate_per_night', 'bedrooms_count', 'city',
       'date_of_listing', 'description', 'latitude', 'longitude', 'title','url'],parse_dates=['date_of_listing'])

In [3]:
airbnb_data.columns

Index(['average_rate_per_night', 'bedrooms_count', 'city', 'date_of_listing',
       'description', 'latitude', 'longitude', 'title', 'url'],
      dtype='object')

# Step 2: Create documents

In [4]:
airbnb_data.head()

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$27,2,Humble,2016-05-01,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,$149,4,San Antonio,2010-11-01,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
2,$59,1,Houston,2017-01-01,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...
3,$60,1,Bryan,2016-02-01,Private bedroom in a cute little home situated...,30.637304,-96.337846,Private Room Close to Campus,https://www.airbnb.com/rooms/11839729?location...
4,$75,2,Fort Worth,2017-02-01,Welcome to our original 1920's home. We recent...,32.747097,-97.286434,The Porch,https://www.airbnb.com/rooms/17325114?location...


In [5]:
airbnb_data.shape

(18259, 9)

# Clean data

In [6]:
# Check null values of the dataset
airbnb_data.isnull().sum()
#average_rate_per_night -> replace NAN with 0, convert to int
#bedrooms_count -> There are only 3 records so we decided to replace NAN with a category based on the desciption if it's possible. 
#description, latitude, longitude, title -> replace NAN to 'Unknown'

average_rate_per_night    28
bedrooms_count             3
city                       0
date_of_listing            0
description                2
latitude                  34
longitude                 34
title                      3
url                        0
dtype: int64

In [7]:
airbnb_data.dtypes

average_rate_per_night            object
bedrooms_count                    object
city                              object
date_of_listing           datetime64[ns]
description                       object
latitude                         float64
longitude                        float64
title                             object
url                               object
dtype: object

In [8]:
def clean(airbnb_data):
    """
    Method that removes nan values and imputes them
    
    Input: dataframe
    Output: cleaned dataframe
    
    """
    #replace NAN with 0
    airbnb_data.average_rate_per_night.replace(np.nan, '$0',inplace=True)
    #convert to int and remove $
    airbnb_data.average_rate_per_night=airbnb_data.average_rate_per_night.replace('[\$]', '', regex=True).astype(int)

    #replace NAN with'unknown'

    airbnb_data.description.replace(np.nan,'unknown',inplace=True)
    airbnb_data.title.replace(np.nan,'unknown',inplace=True)

    airbnb_data.latitude.replace(np.nan,'unknown',inplace=True)
    airbnb_data.longitude.replace(np.nan,'unknown',inplace=True)

    #check where bedrooms_count doesn't have a value and save indexes of those records to a list
    null_value_idx=airbnb_data[airbnb_data.bedrooms_count.isnull()].index
    #if the word studio is mentioned in description then it is a studio otherwise 'unknown'
    for idx in null_value_idx:
        if 'studio' in airbnb_data.iloc[idx].description.split():
            airbnb_data.bedrooms_count[idx]='Studio'
        else:
            airbnb_data.bedrooms_count[idx]='unknown'
        
    return airbnb_data

In [9]:
airbnb_data=clean(airbnb_data)
airbnb_data.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


average_rate_per_night    0
bedrooms_count            0
city                      0
date_of_listing           0
description               0
latitude                  0
longitude                 0
title                     0
url                       0
dtype: int64

In [10]:
airbnb_data.shape

(18259, 9)

In [11]:
def create_tsv_documents(airbnb_data):
    """
    Method that creates different .tsv files for each record in the airbnb_data 
    
    Input: dataframe
    """   
    #clean data
    airbnb_data=clean(airbnb_data)
    
    #for each index make a dataframe of airbnb_data and store it into new tsv file
    for i in airbnb_data.index:
        pd.DataFrame(airbnb_data.loc[i]).transpose().to_csv('data/doc_'+str(i)+'.tsv',sep='\t')

#method is run only once at the beginning to make separate .tsv files
create_tsv_documents(airbnb_data)

# Preprocessing

1) Removing stop words

2) Removing punctuation

3) Stemming

##### remove non english words and words Giulia chooses (room, price, airbnb) MOST often ones_?
##### should we remove numbers__?

# 3.1) Conjunctive query

## 3.1.1) Create your index!

In [12]:
#Building a vocabulary

#set for vocabulary (values of the set will be the keys fo vocabulary_dict)
vocabulary_set=set()
#building a dictionary which will be used for making an inverted index
vocabulary_dict=defaultdict(list)

In [13]:
def preprocessing_text(df):
    #remove upper cases
    df=df.lower()
    #replacing new line sign '\n' with a whitespace ' '    
    df=df.replace('\\n',' ')

    #removing stop words and punctuation
    stop_words = set(stopwords.words('english')) 

    #for removing punctuations
    tokenizer = RegexpTokenizer(r'\w+')
    
    #to tokenize the string
    word_tokens = tokenizer.tokenize(df) 

    #stemming
    ps = PorterStemmer()
    filtered_words = [ps.stem(w) for w in word_tokens if not w in stop_words] 

    #remove non-english words
    
    return filtered_words

In [14]:
#Building a vocabulary

#set for vocabulary (values of the set will be the keys fo vocabulary_dict)
vocabulary_set=set()
#building a dictionary which will be used for making an inverted index
#vocabulary_dict=defaultdict(list)

doc_vocabs=defaultdict(list)

temp=0
for i in airbnb_data.index:
    #take one file
    df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title'],encoding='ISO-8859-1')
    #preprocessing 
  #  df=df.description[0]+' '+df.title[0]

    df=preprocessing_text(df.description[0])+preprocessing_text(df.title[0])

   # filtered_words=preprocessing_text(df)
    filtered_words=df

    for word in filtered_words:
        vocabulary_set.add(word)
    doc_vocabs[i]=list(vocabulary_set)

In [15]:
len(vocabulary_set)

11717

In [16]:
vocabulary={}
for k,v in enumerate(vocabulary_set):
    vocabulary[v]=k

In [17]:
len(vocabulary)

11717

In [18]:
vocabulary_dataframe=pd.DataFrame()

In [19]:
vocabulary_dataframe['word']=vocabulary.keys()

In [20]:
vocabulary_dataframe.to_csv('vocabulary.csv')

# Compute an inverted index

In [21]:
#compute an inverted index

inverted_idx = defaultdict(list)
for idx, vocab in enumerate(doc_vocabs):
    for word in doc_vocabs[vocab]:
        inverted_idx[vocabulary[word]].append(idx)

In [23]:
# Save a dictionary into a pickle file.
import pickle

pickle.dump(inverted_idx, open("save.p", "wb"))  # save it into a file named save.p

# Load the dictionary back from the pickle file.
import pickle

inverted_index = pickle.load(open("save.p", "rb"))

MemoryError: 

# FOR saving the vocabulary into a csv file

# 3.1.2) Execute the query

In [None]:
user_query=input()

# Testing method on the whole dataframe

In [None]:
vocabulary_set=set()

filtered_words=airbnb_data.description.apply(preprocessing_text)+airbnb_data.title.apply(preprocessing_text)

In [None]:
def make_voc(x):
    x=set(x)
    for word in x:
        vocabulary_set.add(word)

In [None]:
#we should have 11935 words in our voc_set for all files
len(vocabulary_set)

In [None]:
vocabulary_set

#testing on 5 files


vocabulary_set=set()
vocabulary_dict=defaultdict(list)

temp=0
for i in range(5):
    df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',encoding='ISO-8859-1')
    df=df.description[0]+' '+df.title[0]

    filtered_words=preprocessing_text(df)
    
    for word in filtered_words:
        if temp == 0:
            vocabulary_dict[word].append(i)
        elif word in vocabulary_set:
            vocabulary_dict[word].append(i)
        vocabulary_set.add(word)
    temp+=1