In [1]:
import os, os.path
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, regexp_tokenize
import pandas as pd
from collections import Counter
import numpy as np

These are all paths used in notebook ;)

In [90]:
path_all_files = r'C:\Users\Asia\Documents\all_files_1'
path_all_files_doc = r'C:\Users\Asia\Documents\all_files_1\doc_'
path_vocabulary = r'C:\Users\Asia\Documents\vocabulary.txt'
path_inverted_indx = 'C:\Users\Asia\Documents\inverted_indx.txt'
path_inv_indx_tfid = r'C:\Users\Asia\Documents\inverted_indx_tfid.txt'

#  Create documents

Creating the .tsv documents:

In [65]:
with open('Airbnb_Texas_Rentals.tsv', 'r', encoding = "utf8") as all_rev:
    csv_reader = csv.reader(all_rev)
    ind = -1
    for row in all_rev:
        ind += 1
        if ind == 0:
            continue
        # skipping the empty lines
        elif row == '\n':
            ind -= 1
            # we manipulate 'ind' in order to have files named doc_i where i = 1,2,3,..
        else:
            # we store the documents inside a new folder
            new_title = r'fileAIR\doc_' + str(ind-1) + '.tsv'
            csv_writer = csv.writer(open(new_title, 'w', encoding = "utf8"), delimiter ='\t')
            # we skip the first column
            row1 = row.split("\t")[1:]
            csv_writer.writerow(row1)

# Search Engine 1

The **preprocess** function converts words in files.
* removing '\n'
* removing punctuation
* filter the non stopwords
* removing the stem

In [2]:
def preprocess(text):
    text = text.lower()
    # removing '\n'
    text = text.replace('\\n', ' ')
    # removing punctuation
    tokenizer = regexp_tokenize(text, "[\w'\$]+")
    # filter the non stopwords
    filtered = [w for w in tokenizer if not w in stopwords.words('english')]
    ps = PorterStemmer()
    # removing the stem
    filtered = [ps.stem(word) for word in filtered]
    return filtered

In [91]:
len_file = len([x for x in os.scandir(path_all_files)])

In [36]:
import datetime
start = datetime.datetime.now()

vocabulary_set = set()
docs_list = []

for i in range(1, len_file):
    with open(path_all_files_doc + str(i) + '.tsv', 'r', encoding = 'utf8') as csvfile:
        file1 = csv.reader(csvfile, delimiter = '\t')
        columns = [i for i in file1]
        # we want to focus only on description and title
        description = columns[0][4]
        description = preprocess(description)
        title = columns[0][7]
        title = preprocess(title)
        tit_desc = title + description
        # creating the vocabulary
        vocabulary_set.update(tit_desc)
        docs_list.append(set(tit_desc))

print(datetime.datetime.now() - start)

0:03:24.449997


In [37]:
vocabulary = {k:v for v, k in enumerate(vocabulary_set)}

Saving vocabulary to the file 'vocabulary.txt'

In [38]:
voc_file = open(path_vocabulary, 'w', encoding = "utf8")
for term in vocabulary:
    voc_file.write('{0}\t{1}\n'.format(term, vocabulary[term]))

### Creating Inverted Index

In [41]:
from collections import defaultdict 

inv_indx = defaultdict(set)
for idx, text_dict in enumerate(docs_list):
    for word in text_dict:
        id_word = vocabulary[word]
        inv_indx[id_word].add(idx)

Previously we operated on sets as values in dictionary. Now we want to have a list as the value.

In [42]:
for key, value in inv_indx.items():
    inv_indx[key] = list(value)

Saving inv_indx to the .txt file

In [43]:
inv_file = open(path_inverted_indx, 'w', encoding = "utf8")
for id_term in inv_indx:
    docks = inv_indx[id_term]
    d = '\t'.join(map(str, docks))
    inv_file.write('{0}\t{1}\n'.format(id_term, d))
inv_file.close()

The **findTheBestDocuments** is a function which is searching for documents with the all words from a query 

In [44]:
def findTheBestDocuments(docs_list, pre_query):
    test = {k:1 for k in docs_list[0]}
    for sublist_ind in range(1, len(docs_list)):
        for k in docs_list[sublist_ind]:
            try:
                test[k] += 1
            except:
                test[k] = 1
    return [doc_id for doc_id in test if test[doc_id] == len(pre_query)]

**SearchEngine** is a function with the arguments:
* query - The input text from user
* vocab - dictionary (saved in 'vocabulary.txt' file)
* inv_indx - inverted index dictionary (saved in 'inv_ind.txt' file)

The **output** is a list of doc_id's for the best documents. 

In [47]:
def SearchEngine(query, vocabulary, inv_indx):
    pre_query = preprocess(query)
    word_list = []
    for item in pre_query:
        if item not in vocabulary:
            print('No documents found')
            break
        word_id = vocabulary[item]
        word_list.append(word_id)
    #word_list contains the id's of words according to vocabulary file
    
    result_list = []
    for term_id in word_list:
        result_list.append(inv_indx[term_id])
    #result_list contains the id's of documents which contain at least one word from the query
    best_docs = findTheBestDocuments(result_list, pre_query)
    return best_docs

**Test of the SearchEngine:**

In [26]:
query = input()

room garden kitchen


In [48]:
test = SearchEngine(query, vocabulary, inv_indx)
print("Selected documents: ", test)

Selected documents:  [6139, 14591, 15558, 15642]


Table with the whole information from selected files: #TODO: [Someone has to run it]

In [58]:
docs_files = []
for i in test:
    docs_files.append(pd.read_csv(path_all_files_doc + str(i) + '.tsv', sep = '\t'))
docs_list = [[row for row in doc_i] for doc_i in docs_files]
cols = ['1', '2', 'City', '4','Description','6','7','Title','Url']
pd.DataFrame(docs_list, columns = cols)[['Title','Description','City','Url']]

# Search Engine 2

In [60]:
global inv_indx_tfid
inv_indx_tfid = {} #Inverted Index dictionary with the TFIDF scores

def computeTFIDF(freq_dict, doc_id):
    numWords = len(freq_dict)
    for word in freq_dict.keys():
        word_id = vocabulary[word]
        try:
            inv_indx_tfid[word_id].append((doc_id, round(float(freq_dict[word])/numWords, 3)))
        except:
            inv_indx_tfid[word_id] = [(doc_id, round(float(freq_dict[word])/numWords, 3))]
    return 

The **occurenceNum** function calculates the words frequencity in one document and calls **computeTFIDF** to compute the TFID score.

ARGS:
* **index** - the document id number

In [74]:
def occurenceNum(index):
    with open(path_all_files_doc + str(index) + '.tsv', 'r', encoding = "utf8") as doc:
        file = csv.reader(doc, delimiter = '\t')
        columns = [i for i in file]
        description = columns[0][4]
        description = preprocess(description)
        title = columns[0][7]
        title = preprocess(title)
        tit_desc = title + description
        freq_dict = {}
        for w in tit_desc:
            try:
                freq_dict[w] += 1
            except:
                freq_dict[w] = 1
        return computeTFIDF(freq_dict, index)

When you call the **occurenceNum** function inside the loop, the dictionary 'inv_indx_tfid' is created at the same time:

In [76]:
start = datetime.datetime.now()        

for file in range(1, len_file):
    occurenceNum(file)
    
print(datetime.datetime.now() - start)

0:03:17.674112


Saving **the Inverted Index file with TFIDF score** (inverted_indx_tfid.txt):

In [77]:
inv_file_score = open(path_inv_indx_tfid, 'w', encoding = "utf8")
for id_term, docks in inv_indx_tfid.items():
    d = '\t'.join(map(str, docks))
    inv_file_score.write('{0}\t{1}\n'.format(id_term, d))
inv_file_score.close()

Computing **the Cosine Similarity**

In [78]:
def cosine_sim(x, y):
    print('x', x)
    print('y', y)
    dot_product = np.dot(x, y)
    norm_x = np.linalg.norm(x)
    norm_y = np.linalg.norm(y)
    return dot_product / (norm_x * norm_y)

The function **query_tfid** calculates the TFID score for words in a query:

In [79]:
def query_tfid(prep_query_list):
    len_list = len(prep_query_list) 
    return [prep_query_list.count(word)/len_list for word in prep_query_list]

The function **get_tfid** returns the TFID score.

ARGS:
* arg_list - the list which is a value for a given word in 'inv_indx_tfid' dictionary.
* doc_id - the document id for which function returns the tfid score

In [80]:
def get_tfid(arg_list, doc_id):
    for tuple_ in arg_list:
        if tuple_[0] == doc_id:
            return tuple_[1]

**SearchEngine_new** is a function with the arguments:

* query - The input text from user
* vocabulary - dictionary (saved in 'vocabulary.txt' file)
* inv_indx - inverted index dictionary (saved in 'inv_ind.txt' file)
* inv_indx_tfid  - inverted index with the tfid score dictionary (saved in 'inv_ind_tfid.txt' file)

The output is a list of (doc_ids, Cosine Similarity) for the best documents.

In [85]:
def SearchEngine_new(query, vocabulary, inv_indx, inv_indx_tfid):
    query_list = preprocess(query)
    word_list = []
    for item in query_list:
        if item not in vocabulary:
            print('No documents found')
            break
        word_id = vocabulary[item]
        word_list.append(word_id)
        
    result_list = []
    for term_id in word_list:
        result_list.append(inv_indx[term_id])
    selected_docs = findTheBestDocuments(result_list, query_list)
    
    # Calculating the Cosine Similarities
    cos_sim_list = []
    for doc_id in selected_docs:
        
        # Creating the TFID vector for a document
        tfid_vector = []
        
        for word in word_list: 
            g = get_tfid(inv_indx_tfid[word], doc_id)
            tfid_vector.append(g)    
        #tfid_vector = [(test, doc_id) for word in pos_list]
        
        cos_sim_list.append(cosine_sim(query_tfid(word_list), tfid_vector))
    return cos_sim_list

**Test of the SearchEngine_new:**

In [83]:
query = input()

private bedroom


In [86]:
SearchEngine_new(query, vocabulary, inv_indx, inv_indx_tfid)

x [0.5, 0.5]
y [None, None]


TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'