In [75]:
import datetime
import os, os.path
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, regexp_tokenize
from collections import defaultdict 
import pandas as pd
import numpy as np

In [43]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\alice\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

These are all paths used in notebook.

In [35]:
path_all_files = r'C:\Users\alice\Desktop\ADM_HW3\file_unique'
path_all_files_doc = r'C:\Users\alice\Desktop\ADM_HW3\file_unique\doc_'
path_vocabulary = r'C:\Users\alice\Desktop\ADM_HW3\vocabulary.txt'
path_inverted_indx = r'C:\Users\alice\Desktop\ADM_HW3\inverted_indx.txt'
path_inv_indx_tfid = r'C:\Users\alice\Desktop\ADM_HW3\inverted_indx_tfid.txt'

#  Create documents

In [23]:
file_ = pd.read_csv('Airbnb_Texas_Rentals.csv')

In [24]:
unique_file = file_.drop_duplicates(['description', 'title'])
unique_len = len(unique_file)
print(unique_len)

11517


In [62]:
unique_csv = unique_file.to_csv('filtered_dataframe.csv', sep = '\t')

Creating the .tsv documents:

In [63]:
with open('filtered_dataframe.csv', 'r', encoding = "utf8") as all_rev:
    csv_reader = csv.reader(all_rev)
    ind = -1
    for row in all_rev:
        ind += 1
        if ind == 0:
            continue
        # skipping the empty lines
        elif row == '\n':
            ind -= 1
            # we manipulate 'ind' in order to have files named doc_i where i = 0,1,2,3,..
        else:
            # we store the documents inside a new folder
            new_title = r'file_unique\doc_' + str(ind-1) + '.tsv'
            fil_ = open(new_title, 'w', encoding = "utf8")
            csv_writer = csv.writer(fil_, delimiter ='\t')
            # we skip the first column
            row1 = row.split("\t")[2:]
            csv_writer.writerow(row1)
            fil_.close()

The **preprocess** function converts words in files.
* removing '\n'
* removing punctuation
* filter the non stopwords
* removing the stem

In [64]:
def preprocess(text):
    text = text.lower()
    # removing '\n'
    text = text.replace('\\n', ' ')
    # removing punctuation
    tokenizer = regexp_tokenize(text, "[\w\$]+")
    # filter the non stopwords
    filtered = [w for w in tokenizer if not w in stopwords.words('english')]
    #words = set(nltk.corpus.words.words())
    #filtered1 = [w for w in filtered if w in words]
    ps = PorterStemmer()
    # removing the stem
    filtered = [ps.stem(word) for word in filtered]
    return filtered

In [65]:
len_file = len([x for x in os.scandir(path_all_files)])
len_file

11517

In [66]:
start = datetime.datetime.now()

vocabulary_set = set()
docs_list = []

for i in range(len_file - 1):
    with open(path_all_files_doc + str(i) + '.tsv', 'r', encoding = 'utf8') as csvfile:
        file1 = csv.reader(csvfile, delimiter = '\t')
        columns = [i for i in file1]
        # we want to focus only on description and title
        description = columns[0][4]
        description = preprocess(description)
        title = columns[0][7]
        title = preprocess(title)
        tit_desc = title + description
        # creating the vocabulary
        vocabulary_set.update(tit_desc)
        docs_list.append(set(tit_desc))

print(datetime.datetime.now() - start)

0:03:45.081679


In [67]:
vocabulary = {k:v for v, k in enumerate(vocabulary_set)}

Saving vocabulary to the file 'vocabulary.txt'

In [68]:
voc_file = open(path_vocabulary, 'w', encoding = "utf8")
for term in vocabulary:
    voc_file.write('{0}\t{1}\n'.format(term, vocabulary[term]))

### Creating Inverted Index

In [69]:
inv_indx = defaultdict(set)
for idx, text_dict in enumerate(docs_list):
    for word in text_dict:
        id_word = vocabulary[word]
        inv_indx[id_word].add(idx)

Previously we operated on sets as values in dictionary. Now we want to have a list as the value.

In [70]:
for key, value in inv_indx.items():
    inv_indx[key] = list(value)

Saving inv_indx to the .txt file

In [71]:
inv_file = open(path_inverted_indx, 'w', encoding = "utf8")
for id_term in inv_indx:
    docks = inv_indx[id_term]
    d = '\t'.join(map(str, docks))
    inv_file.write('{0}\t{1}\n'.format(id_term, d))
inv_file.close()

In [72]:
global inv_indx_tfid
inv_indx_tfid = {} 
# inverted Index dictionary with the TFIDF scores

def computeTFIDF(freq_dict, doc_id, tot_num_docs, inv_indx):
    numWords = len(freq_dict)
    for word in freq_dict.keys():
        word_id = vocabulary[word]
        log_part = np.log(float(tot_num_docs)/len(inv_indx[word_id]))
        try:
            inv_indx_tfid[word_id].append((doc_id, round(float(freq_dict[word])/numWords * log_part, 3)))
        except:
            inv_indx_tfid[word_id] = [(doc_id, round(float(freq_dict[word])/numWords * log_part, 3))]
    return 

The **occurenceNum** function calculates the words frequency in one document and calls **computeTFIDF** to compute the TFID score.

ARGS:
* **index** - the document id number

In [73]:
def occurenceNum(index):
    with open(path_all_files_doc + str(index) + '.tsv', 'r', encoding = "utf8") as doc:
        file = csv.reader(doc, delimiter = '\t')
        columns = [i for i in file]
        description = columns[0][4]
        description = preprocess(description)
        title = columns[0][7]
        title = preprocess(title)
        tit_desc = title + description
        freq_dict = {}
        for w in tit_desc:
            try:
                freq_dict[w] += 1
            except:
                freq_dict[w] = 1
        return (freq_dict, index)

When you call the **occurenceNum** function inside the loop, the dictionary 'inv_indx_tfid' is created at the same time:

In [76]:
start = datetime.datetime.now()        

for file in range(0, len_file-1):
    (freq_dict, index) = occurenceNum(file)
    computeTFIDF(freq_dict, index, len_file, inv_indx)
    
print(datetime.datetime.now() - start)

0:03:50.687818


Saving **the Inverted Index file with TFIDF score** (inverted_indx_tfid.txt):

In [77]:
inv_file_score = open(path_inv_indx_tfid, 'w', encoding = "utf8")
for id_term, docks in inv_indx_tfid.items():
    d = '\t'.join(map(str, docks))
    inv_file_score.write('{0}\t{1}\n'.format(id_term, d))
inv_file_score.close()

In [78]:
#wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

In [79]:
#wordlist

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aardvark',
 'aardwolf',
 'aba',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'abaptiston',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'abastardize',
 'abatable',
 'abate',
 'abatement',
 'abater',
 'abatis',
 'abatised',
 'abaton',
 'abator',
 'abattoir',
 'abature',
 'abave',
 'abaxial',
 'abaxile',
 'abaze',
 'abb',
 'abbacomes',
 'abbacy',
 'abbas',
 'abbasi',
 'abbassi',
