In [None]:
# Configuration, shared index structures and the stop-word list
import os

from nltk.corpus import stopwords

# Load the run settings from the plain-text `config` file in the working
# directory. Every setting is stored as one `key=value` line.
with open('config', mode='r', encoding='utf-8') as cfile:
    conf = cfile.readlines()


def _config_value(key, position):
    """Return the value configured for `key`.

    The setting is looked up by name, so blank lines, additional settings,
    leading whitespace or a different line order no longer produce a silently
    wrong value. When no line starts with `key=`, the lookup falls back to the
    previous behaviour of slicing the line at index `position`.

    Args:
        key: setting name as written in the config file (without the `=`).
        position: zero-based line index the setting historically lived on.

    Returns:
        The setting value with surrounding whitespace removed.

    Raises:
        IndexError: if the key is absent and the file has no line `position`.
    """
    prefix = key + '='
    for line in conf:
        entry = line.strip()
        if entry.startswith(prefix):
            return entry[len(prefix):].strip()
    return conf[position][len(prefix):].strip()


path_root = _config_value('path_root', 0)
dir_source = _config_value('dir_source', 1)
dir_temp = _config_value('dir_temp', 2)  # folder name for the processed files
fn_indexer = _config_value('fn_indexer', 3)    # new file to store the indexer
fn_doc_id = _config_value('fn_doc_id', 4)
fn_doc_list = _config_value('fn_doc_list', 5)
fn_doc_len = _config_value('fn_doc_len', 6)


path_source = os.path.join(path_root, dir_source)
path_temp = os.path.join(path_root, dir_temp)

# Shared state filled by process_directions() and written out by dump().
doc_id = {}    # file path -> integer document id
doc_list = []  # file paths; the list position is the document id
doc_len = {}   # document id -> weighted document length
indexer = {}   # term -> list of (document id, weighted frequency) tuples

# Hard-coded English stop-word vocabulary, assembled in rough groups for
# readability. The NLTK corpus could be used instead:
# stop_words = set(stopwords.words('english'))
stop_words = set()

# pronouns and their possessive / reflexive forms
stop_words.update((
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd",
    'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
))

# question words and demonstratives
stop_words.update((
    'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those',
))

# forms of "be", "have" and "do"
stop_words.update((
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
))

# articles and conjunctions
stop_words.update((
    'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
))

# prepositions and place / time / manner adverbs
stop_words.update((
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
))

# quantifiers, negations and intensifiers
stop_words.update((
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
    'some', 'such', 'no', 'nor', 'not', 'only', 'own',
    'same', 'so', 'than', 'too', 'very',
))

# short auxiliaries and the fragments left over when contractions are split
stop_words.update((
    's', 't', 'can', 'will', 'just', 'don', "don't",
    'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
))

# contracted negations, with and without the trailing "'t"
stop_words.update((
    'ain',
    'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't",
    'haven', "haven't", 'isn', "isn't", 'ma',
    'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
    'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
))

# Guarantees 's' is filtered whichever source the vocabulary came from.
stop_words.add('s')


In [None]:
# Some functions
import csv
from math import log10

from nltk import RegexpTokenizer

# Tokenize one document and compute its weighted term frequencies

def get_term_freq(textlines):
    """Compute weighted term frequencies for a single document.

    Only two lines of the document are indexed: the line at index 0, treated
    as the title, and the line at index 7, treated as the body text. Both are
    lower-cased and split into runs of ASCII letters; tokens found in the
    module-level `stop_words` set are ignored.

    Weighting:
        * every title occurrence adds 1 to the term's score;
        * a body occurrence sets the score to 10 when the term has not been
          seen yet, and adds 5 otherwise.

    Args:
        textlines: list of the document's lines, as returned by
            `file.readlines()`. Documents with fewer than eight lines are
            accepted; a missing line is treated as empty.

    Returns:
        tuple: `(term_freq, doc_len)` where `term_freq` maps term -> score and
        `doc_len` is `len(title tokens) + 5 * len(body tokens)`, counted
        before stop-word removal.
    """
    # Short or empty files must not abort the whole indexing run with an
    # IndexError, so a missing line simply contributes no tokens.
    title = textlines[0].lower() if len(textlines) > 0 else ''
    text = textlines[7].lower() if len(textlines) > 7 else ''

    word_tokenizer = RegexpTokenizer('[A-Za-z]+')
    words_title = word_tokenizer.tokenize(title)
    words_text = word_tokenizer.tokenize(text)

    term_freq = {}
    for w in words_title:
        if w not in stop_words:
            term_freq[w] = term_freq.get(w, 0) + 1

    for w in words_text:
        if w not in stop_words:
            # NOTE(review): the first body occurrence scores 10 while later
            # ones add 5, and doc_len counts each body token as 5 -- confirm
            # the initial 10 is intentional before changing it.
            if w in term_freq:
                term_freq[w] = term_freq[w] + 5
            else:
                term_freq[w] = 10
    return term_freq, len(words_title) + len(words_text) * 5


# The index and the document tables are built and stored as .csv files in the cells below

In [None]:
def process_directions():
    """Rebuild the in-memory inverted index from every file under `path_source`.

    The module-level containers `indexer`, `doc_list`, `doc_id` and `doc_len`
    are emptied and refilled in place, so calling this again starts from a
    clean state. Each file receives the next integer id in visiting order,
    is read as UTF-8 (undecodable bytes are ignored) and is passed to
    `get_term_freq`. The current id is printed every 1000 documents as a
    progress indicator.

    Side effects:
        Creates `path_temp` (including missing parent folders) if needed.

    Returns:
        dict: the module-level `indexer`, mapping each term to a list of
        `(document id, weighted frequency)` tuples.
    """
    indexer.clear()
    doc_list.clear()
    doc_id.clear()
    doc_len.clear()
    # makedirs with exist_ok avoids the exists()/mkdir() race and also works
    # when intermediate folders of path_temp are missing.
    os.makedirs(path_temp, exist_ok=True)
    for root, dirs, files in os.walk(path_source):
        # os.walk reports entries in arbitrary, filesystem-dependent order.
        # Sorting the sub-directories in place (which steers the walk) and the
        # file names keeps document ids reproducible between runs.
        dirs.sort()
        for file_name in sorted(files):
            fpn = os.path.join(root, file_name)
            file_cnt = len(doc_list)  # id of this document == its list index
            doc_list.append(fpn)
            doc_id[fpn] = file_cnt

            with open(fpn, mode='r', errors='ignore', encoding='utf-8') as f:
                text = f.readlines()
            term_freq, d_len = get_term_freq(text)
            doc_len[file_cnt] = d_len
            for term, value in term_freq.items():
                indexer.setdefault(term, []).append((file_cnt, value))
            if file_cnt % 1000 == 0:
                print(file_cnt)
    return indexer


def dump():
    """Write the index and the document tables to CSV files inside `path_temp`.

    Four files are produced, in this order:
        * `fn_indexer`  -- one row per term: the term followed by its postings
          (each `(document id, weight)` tuple is written in its string form);
        * `fn_doc_id`   -- one `path, id` row per document;
        * `fn_doc_len`  -- one `id, length` row per document;
        * `fn_doc_list` -- a single row holding every document path.

    Prints 'dump fin' once all files have been written.
    """
    def write_rows(file_name, rows):
        # newline='' leaves line-ending handling to the csv module.
        target = os.path.join(path_temp, file_name)
        with open(target, mode='w+', newline='', encoding='utf-8') as out:
            csv.writer(out).writerows(rows)

    write_rows(fn_indexer, ([term] + postings for term, postings in indexer.items()))
    write_rows(fn_doc_id, doc_id.items())
    write_rows(fn_doc_len, doc_len.items())
    write_rows(fn_doc_list, [doc_list])
    print('dump fin')


In [None]:
# Build the in-memory index from the source folder; progress is printed while it runs.
process_directions()
print('fin')

In [None]:
# Write the index and the document tables to the temp folder as .csv files.
dump()
