# Libraries

In [56]:
import pandas as pd
import numpy as np

from collections import defaultdict
import re
import nltk

from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer

In [57]:
sample=10

# Step 1: Data

In [58]:
airbnb_data=pd.read_csv("Airbnb_Texas_Rentals.csv",usecols=['average_rate_per_night', 'bedrooms_count', 'city',
       'date_of_listing', 'description', 'latitude', 'longitude', 'title','url'],parse_dates=['date_of_listing'])

In [59]:
airbnb_data.columns

Index(['average_rate_per_night', 'bedrooms_count', 'city', 'date_of_listing',
       'description', 'latitude', 'longitude', 'title', 'url'],
      dtype='object')

# Step 2: Create documents

In [60]:
airbnb_data.head(1)

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$27,2,Humble,2016-05-01,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...


In [61]:
airbnb_data.shape

(18259, 9)

# Clean data

In [62]:
# Check null values of the dataset
airbnb_data.isnull().sum()
#average_rate_per_night -> replace NAN with 0, convert to int
#bedrooms_count -> There are only 3 records so we decided to replace NAN with a category based on the desciption if it's possible. 
#description, latitude, longitude, title -> replace NAN to 'Unknown'

average_rate_per_night    28
bedrooms_count             3
city                       0
date_of_listing            0
description                2
latitude                  34
longitude                 34
title                      3
url                        0
dtype: int64

In [63]:
airbnb_data.dtypes

average_rate_per_night            object
bedrooms_count                    object
city                              object
date_of_listing           datetime64[ns]
description                       object
latitude                         float64
longitude                        float64
title                             object
url                               object
dtype: object

In [64]:
def clean(airbnb_data):
    """
    Method that removes nan values and imputes them
    
    Input: dataframe
    Output: cleaned dataframe
    
    """
    #replace NAN with 0
    airbnb_data.average_rate_per_night.replace(np.nan, '$0',inplace=True)
    #convert to int and remove $
    airbnb_data.average_rate_per_night=airbnb_data.average_rate_per_night.replace('[\$]', '', regex=True).astype(int)

    #replace NAN with'unknown'

    airbnb_data.description.replace(np.nan,'unknown',inplace=True)
    airbnb_data.title.replace(np.nan,'unknown',inplace=True)

    airbnb_data.latitude.replace(np.nan,'unknown',inplace=True)
    airbnb_data.longitude.replace(np.nan,'unknown',inplace=True)

    #check where bedrooms_count doesn't have a value and save indexes of those records to a list
    null_value_idx=airbnb_data[airbnb_data.bedrooms_count.isnull()].index
    #if the word studio is mentioned in description then it is a studio otherwise 'unknown'
    for idx in null_value_idx:
        if 'studio' in airbnb_data.iloc[idx].description.split():
            airbnb_data.bedrooms_count[idx]='Studio'
        else:
            airbnb_data.bedrooms_count[idx]='unknown'
        
    return airbnb_data

In [65]:
airbnb_data=clean(airbnb_data)
airbnb_data.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


average_rate_per_night    0
bedrooms_count            0
city                      0
date_of_listing           0
description               0
latitude                  0
longitude                 0
title                     0
url                       0
dtype: int64

In [66]:
airbnb_data.shape

(18259, 9)

In [67]:
def create_tsv_documents(airbnb_data):
    """
    Method that creates different .tsv files for each record in the airbnb_data 
    
    Input: dataframe
    """   
    #clean data
    airbnb_data=clean(airbnb_data)
    
    #for each index make a dataframe of airbnb_data and store it into new tsv file
    for i in airbnb_data.index:
        pd.DataFrame(airbnb_data.loc[i]).transpose().to_csv('data/doc_'+str(i)+'.tsv',sep='\t')

#method is run only once at the beginning to make separate .tsv files
create_tsv_documents(airbnb_data)

# Preprocessing

1) Removing stop words

2) Removing punctuation

3) Stemming

##### remove non english words and words Giulia chooses (room, price, airbnb) MOST often ones_?
##### should we remove numbers__?

# 3.1) Conjunctive query

## 3.1.1) Create your index!

In [68]:
def preprocessing_text(df):
    #remove upper cases
    df=df.lower()
    #replacing new line sign '\n' with a whitespace ' '    
    df=df.replace('\\n',' ')

    #removing stop words and punctuation
    stop_words = set(stopwords.words('english')) 

    #for removing punctuations
    tokenizer = RegexpTokenizer(r'\w+')
    
    #to tokenize the string
    word_tokens = tokenizer.tokenize(df) 

    #stemming
    ps = PorterStemmer()
    filtered_words = [ps.stem(w) for w in word_tokens if not w in stop_words] 

    #remove non-english words
    
    return filtered_words

In [69]:
#Building a vocabulary

#set for vocabulary (values of the set will be the keys fo vocabulary_dict)
vocabulary_lst=[]
#building a dictionary which will be used for making an inverted index
doc_vocabs=defaultdict(list)

for i in range(sample):
    #take one file
    df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title','city'],encoding='ISO-8859-1')
    #preprocessing 
    df=df.description[0]+' '+df.title[0]+' '+df.city[0]
    filtered_words=preprocessing_text(df)
    temp_vocabulary_set=set()
    for word in filtered_words:
        temp_vocabulary_set.add(word)
    vocabulary_lst.append(temp_vocabulary_set)
    doc_vocabs[i]=list(temp_vocabulary_set)
vocabulary_set=set.union(*vocabulary_lst)

In [70]:
len(vocabulary_set)

194

In [71]:
vocabulary={}
for k,v in enumerate(vocabulary_set):
    #just for testing
    #vocabulary[v]='id'+str(k)
    vocabulary[v]= k

In [72]:
len(vocabulary)

194

In [73]:
def save_vocabulary(vocabulary): 
    """
    method that converts vocabulary into a dataframe and saves it into a csv file
    
    input: vocabulary(dictionary, key='term',value='term_id')
    """
    vocabulary_dataframe=pd.DataFrame()
    vocabulary_dataframe['word']=vocabulary.keys()
    vocabulary_dataframe.to_csv('vocabulary_sample.csv')
    del vocabulary_dataframe

In [74]:
save_vocabulary(vocabulary)

# Compute an inverted index

In [75]:
def compute_inverted_idx(doc_vocabs,vocabulary):
    """
    method that computes an inverted index
    
    input: doc_vocabs(dictionary), vocabulary(dictionary of all unique words, key=term, value=term_id)
    output: inverted_idx(dictionary, key=term_id, value=list of document_ids) 
    """
    #initialize defaultdict for making an inverted index
    inverted_idx = defaultdict(list)
    #in every document look for every word and assign document id to the words which belong to it
    for idx in doc_vocabs.keys():
        for word in doc_vocabs[idx]:
            inverted_idx[vocabulary[word]].append(idx)
    return inverted_idx

In [76]:
inverted_idx=compute_inverted_idx(doc_vocabs,vocabulary)

In [77]:
# Save a dictionary into a pickle file.
import pickle

pickle.dump(inverted_idx, open("inv_idx_sample.p", "wb"))  # save it into a file named save.p

# Load the dictionary back from the pickle file.

inverted_index = pickle.load(open("inv_idx_sample.p", "rb"))

# 3.1.2) Execute the query

In [78]:
def finalize_output(result_set):
    df=pd.DataFrame()
    for i,val in enumerate(result_set):
        pd.set_option('display.max_colwidth', -1)
        df=df.append(pd.read_csv('data/doc_'+str(val)+'.tsv',sep='\t',usecols=['description','title','city','url']
                                 ,encoding='ISO-8859-1',index_col=False))
        df.reset_index().drop('index',axis=1)
    return df

In [79]:
def search_engine():
    user_query=str(input())
    
    user_query=preprocessing_text(user_query)

    list_term_idx=[]
    result_set=[]
    for word in user_query:
        #if word exist in the vocabulary
        if word in vocabulary.keys():
            list_term_idx.append(set(inverted_idx[vocabulary[word]]))
        else:
            list_term_idx.append({'x'})
            break
    result_set=list(set.intersection(*list_term_idx))
    if 'x' in result_set or not result_set:
        result_set='No results! Try again!'
        return result_set
        
    print(result_set)
    result_set=finalize_output(result_set)
    return result_set

In [80]:
search_engine()

room
[0, 1, 2, 3, 7]


Unnamed: 0,city,description,title,url
0,Humble,Welcome to stay in private room with queen bed and detached private bathroom on the second floor. Another private bedroom with sofa bed is available for additional guests. 10$ for an additional guest.\n10min from IAH airport\nAirport pick-up/drop off is available for $10/trip.,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location=Cleveland%2C%20TX
0,San Antonio,"Stylish, fully remodeled home in upscale NW  Alamo Heights Area. \n\nAmazing location - House conveniently located in quiet street, with beautiful seasoned trees, prestigious neighborhood and very close to the airport, 281, 410 loop and down-town area. \n\nFeaturing an open floor plan, original hardwood floors, 3 bedrooms, 3 FULL bathrooms + an independent garden-TV room which can sleep 2 more\n\nEuropean inspired kitchen and top of the line decor. Driveway can park 4 cars.",Unique Location! Alamo Heights - Designer Inspired,https://www.airbnb.com/rooms/17481455?location=Cibolo%2C%20TX
0,Houston,'River house on island close to the city' \nA well maintained river house off the San Jacinto river with extra room for temporary visitors.,River house near the city,https://www.airbnb.com/rooms/16926307?location=Beach%20City%2C%20TX
0,Bryan,Private bedroom in a cute little home situated in the coveted Garden Acres neighborhood in Bryan. The bedroom has its own private access and its own private bathroom.,Private Room Close to Campus,https://www.airbnb.com/rooms/11839729?location=College%20Station%2C%20TX
0,Fort Worth,"This is a beautiful bedroom with a queen size bed and closet. We do not have pets and the house is always clean. The bathroom is shared and supplies such as towels and shampoo are available. We are only some miles from Downtown, TCU, TCC, and Stockyards.",Friendly Private Room in ?Quiet Neighborhood,https://www.airbnb.com/rooms/18977363?location=Cleburne%2C%20TX


# 3.2) Conjunctive query & Ranking score
### 3.2.1) Inverted index
### 3.2.2) Execute the query

# calculate tfldf

1) tf=term frequency -- the frequency of the word in each document in the corpus.

2) idf


In [81]:
#Calc tfidf and cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Calculate IDF

In [82]:
#IDF(t) = log_e(Total number of documents / Number of documents with term t in it)
Total_number_of_documents=airbnb_data.shape[0]

In [83]:
#TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
#IDF(t) = log_e(Total number of documents / Number of documents with term t in it)
#idf=

In [84]:
# TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
tf_dic=dict()
for i in range(sample):
    #take one file
    df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title','city'],encoding='ISO-8859-1')
    #preprocessing 
    df=df.description[0]+' '+df.title[0]+' '+df.city[0]
    filtered_words=preprocessing_text(df)
    tf_series=pd.Series(filtered_words)
    tf_series=(tf_series.value_counts())/len(tf_series)
    for index,value in tf_series.iteritems():
        tf_dic[index,i]=value

In [85]:
#IDF(t) = log_e(Total number of documents / Number of documents with term t in it)
idf_dic=dict()
total_num_docs_sample=sample
for i in range(sample):
    #take one file
    df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title','city'],encoding='ISO-8859-1')
    #preprocessing 
    df=df.description[0]+' '+df.title[0]+' '+df.city[0]
    
    filtered_words=preprocessing_text(df)
 #   for word_k in filtered_words_2: 
    idf_series=pd.Series(list(set(filtered_words)))
    idf_calc=idf_series.apply(lambda x: np.log(total_num_docs_sample/len(inverted_idx[vocabulary[x]])) )
    for idx in range(len(idf_series)):
        idf_dic[idf_series[idx],i]=idf_calc[idx]      
        
  #      lis.append(np.log(total_num_docs_sample/len(inverted_idx[vocabulary[word_k]])))
 

In [86]:
df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title','city'],encoding='ISO-8859-1')
#preprocessing 
df=df.description[0]+' '+df.title[0]+' '+df.city[0]
filtered_words=preprocessing_text(df)

tf_series=pd.Series(filtered_words)
tf_series=((tf_series.value_counts())/len(tf_series)).sort_index()
idf_series=pd.Series(list(set(filtered_words))).sort_values()
idf_calc=idf_series.apply(lambda x: np.log(total_num_docs_sample/len(inverted_idx[vocabulary[x]])))
#idf_calc

In [87]:
#idf_calc.sort_index()
result_df=pd.concat([pd.Series(idf_series.values),pd.Series(tf_series.values),idf_calc],axis=1)#.reset_index()
result_df['res']=result_df[1]*result_df[2]

In [88]:
result_df

Unnamed: 0,0,1,2,res
0,antonio,0.125,0.916291,0.114536
1,cozi,0.125,1.609438,0.20118
2,entranc,0.125,2.302585,0.287823
3,histor,0.125,2.302585,0.287823
4,privat,0.25,2.302585,0.575646
5,san,0.125,1.609438,0.20118
6,studio,0.125,1.203973,0.150497


In [89]:
# TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
tf_idf_dic=dict()
#IDF(t) = log_e(Total number of documents / Number of documents with term t in it)
idf_dic2={}
tf_dic2={}
proba={}
#OVDE
total_num_docs_sample=sample
result_df=pd.DataFrame()
for i in range(sample):
    #take one file
    df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title','city'],encoding='ISO-8859-1')
    #preprocessing 
    df=df.description[0]+' '+df.title[0]+' '+df.city[0]
    filtered_words=preprocessing_text(df)
    tf_series=pd.Series(filtered_words)
    tf_series=((tf_series.value_counts())/len(tf_series)).sort_index()
    idf_series=pd.Series(list(set(filtered_words))).sort_values()
    idf_calc=idf_series.apply(lambda x: np.log(total_num_docs_sample/len(inverted_idx[vocabulary[x]])))
    #idf_calc.sort_index()
    result_df=pd.concat([pd.Series(idf_series.values),pd.Series(tf_series.values),pd.Series(idf_calc.values)],axis=1)#.reset_index()
#    result_df=pd.concat([pd.Series(idf_series.values),pd.Series(tf_series.values),idf_calc],axis=1)#.reset_index()
    result_df['tf_idf']=result_df[1]*result_df[2]
   # result_df=result_df.loc[:,['res',0]]
    
    for idx in range(result_df.shape[0]):
        tf_idf_dic[result_df[0][idx],i]=result_df['tf_idf'][idx]
#del result_df
    for idx in range(len(tf_series)):
#        tf_idf_dic[tf_series.index[idx],i]=tf_series[idx]*idf_calc[idx]
        idf_dic2[idf_series[idx],i]=idf_calc[idx] 
    for index,value in tf_series.iteritems():
        tf_dic2[index,i]=value
    for k in tf_dic2.keys():
        proba[k]=tf_dic2[k]*idf_dic2[k]

In [90]:
result_df

Unnamed: 0,0,1,2,tf_idf
0,antonio,0.125,1.609438,0.20118
1,cozi,0.125,1.609438,0.20118
2,entranc,0.125,2.302585,0.287823
3,histor,0.125,2.302585,0.287823
4,privat,0.25,0.916291,0.229073
5,san,0.125,1.203973,0.150497
6,studio,0.125,2.302585,0.287823


In [91]:
idf_calc.values

array([1.60943791, 1.60943791, 2.30258509, 2.30258509, 0.91629073,
       1.2039728 , 2.30258509])

In [92]:
for key in tf_dic.keys():
    if(tf_dic[key]*idf_dic[key]!=proba[key]):
 #       print(key,tf_dic[key])
#        print(key,idf_dic[key])
        print(key,tf_dic[key]*idf_dic[key])
        print(key,tf_idf_dic[key])
        print("!!!!!!!!!!!!!!!!!!!!!!!")

In [93]:
proba['san',2]

0.05733203830123505

In [94]:
for key in tf_dic.keys():
    if(tf_dic[key]*idf_dic[key]!=tf_idf_dic[key]):
 #       print(key,tf_dic[key])
#        print(key,idf_dic[key])
        print(key,tf_dic[key]*idf_dic[key])
        print(key,tf_idf_dic[key])
        print("!!!!!!!!!!!!!!!!!!!!!!!")

In [96]:
# First way
#TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
#IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

def calculate_tf_idf(airbnb_data):
    tf_idf_dic=dict()
    total_num_docs_sample=sample
    result_df=pd.DataFrame()
    for i in range(sample):
        #take one file
        df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title','city'],encoding='ISO-8859-1')
        #preprocessing 
        df=df.description[0]+' '+df.title[0]+' '+df.city[0]
        filtered_words=preprocessing_text(df)
        tf_series=pd.Series(filtered_words)
        tf_series=((tf_series.value_counts())/len(tf_series)).sort_index()
        idf_series=pd.Series(list(set(filtered_words))).sort_values()
        idf_calc=idf_series.apply(lambda x: np.log(total_num_docs_sample/len(inverted_idx[vocabulary[x]])))
        result_df=pd.concat([pd.Series(idf_series.values),pd.Series(tf_series.values),pd.Series(idf_calc.values)],axis=1)#.reset_index()
        result_df['tf_idf']=result_df[1]*result_df[2]

        for idx in range(result_df.shape[0]):
            tf_idf_dic[result_df[0][idx],i]=result_df['tf_idf'][idx]
    return tf_idf_dic        

In [103]:
# Second way--to check if it is the same like the 1st-for double checking the results
def calculate_tf_idf2(airbnb_data):
    idf_dic2={}
    tf_dic2={}
    proba={}
    for i in range(sample):
        #take one file
        df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title','city'],encoding='ISO-8859-1')
        #preprocessing 
        df=df.description[0]+' '+df.title[0]+' '+df.city[0]
        filtered_words=preprocessing_text(df)
        tf_series=pd.Series(filtered_words)
        tf_series=((tf_series.value_counts())/len(tf_series)).sort_index()
        idf_series=pd.Series(list(set(filtered_words))).sort_values()
        idf_calc=idf_series.apply(lambda x: np.log(total_num_docs_sample/len(inverted_idx[vocabulary[x]])))
       
        for idx in range(len(tf_series)):
            idf_dic2[idf_series[idx],i]=idf_calc[idx] 
        for index,value in tf_series.iteritems():
            tf_dic2[index,i]=value
        for k in tf_dic2.keys():
            proba[k]=tf_dic2[k]*idf_dic2[k]
    return proba        

In [109]:
a=calculate_tf_idf(airbnb_data)

In [110]:
b=calculate_tf_idf2(airbnb_data)

In [114]:
for k in a.keys():
    if(a[k]!=b[k]):
        print('њет')