In [1]:
import nltk
import re
from collections import Counter
from itertools import islice
import io

## Split the training file into smaller chunks

In [2]:
def splitfile(infilepath, chunksize, outpattern="../data/train_v2-{}.txt"):
    """Split a UTF-8 text file into numbered chunk files of ``chunksize`` lines.

    Chunks are numbered from 1 and written to ``outpattern.format(index)``.
    The last chunk holds whatever lines remain. A chunk file is only created
    once there is at least one line to put in it, so no empty file is left
    behind for an empty input or when the number of lines is an exact
    multiple of ``chunksize``.

    Args:
        infilepath: Path of the UTF-8 text file to split.
        chunksize: Maximum number of lines per chunk; must be positive.
        outpattern: ``str.format`` template for the chunk paths. It receives
            the 1-based chunk index as its only positional argument.

    Raises:
        ValueError: If ``chunksize`` is smaller than 1.
    """
    if chunksize < 1:
        raise ValueError("chunksize must be a positive integer")

    with io.open(infilepath, encoding='utf-8') as infile:
        # readline() returns '' only at end of file, which ends this iterator.
        lines = iter(infile.readline, '')
        i = 1
        while True:
            # Peek one line first so an output file is never opened for nothing.
            first = next(lines, None)
            if first is None:
                break
            with io.open(outpattern.format(i), encoding='utf-8', mode='w') as outfile:
                outfile.write(first)
                # Stream the rest of the chunk without holding it in memory.
                outfile.writelines(islice(lines, chunksize - 1))
            i += 1

# Break the full training corpus into chunk files of 700,000 lines each.
splitfile(infilepath="../data/train_v2.txt", chunksize=700_000)

## train corpus
### ngrams frequency

In [3]:
def get_windows(seq, n, need_sorted=False):
    """Yield every run of ``n`` consecutive items of ``seq`` as a tuple.

    Windows are produced left to right, advancing by one item each time.
    Nothing is yielded when ``seq`` has fewer than ``n`` items.

    Args:
        seq: Any iterable; it is consumed once.
        n: Window length; must be at least 1.
        need_sorted: When true, the items inside each yielded tuple are
            sorted. Which items belong to a window is unaffected.

    Yields:
        Tuples of length ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised when iteration starts,
            because this is a generator).
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    it = iter(seq)
    window = list(islice(it, n))
    if len(window) < n:
        return

    # Sort a copy for output only. The sliding state must stay in sequence
    # order, otherwise the next slide drops the smallest item instead of the
    # oldest one.
    yield tuple(sorted(window)) if need_sorted else tuple(window)
    for elem in it:
        window = window[1:] + [elem]
        yield tuple(sorted(window)) if need_sorted else tuple(window)

## shelve 

In [4]:
import shelve

In [None]:
def flush(local, remote):
    """Add the counts held in ``local`` to ``remote``, then empty ``local``.

    Each key of ``local`` is stored under its ``repr()`` string. The value
    already present in ``remote`` for that string (0 when absent) is
    increased by the local count. Afterwards ``local`` is cleared and
    ``remote.sync()`` is called.

    Args:
        local: Mapping of key -> count providing ``items()`` and ``clear()``.
        remote: Store providing ``get``, item assignment and ``sync()``.
    """
    for ngram, count in local.items():
        stored_key = repr(ngram)
        previous = remote.get(stored_key, 0)
        remote[stored_key] = previous + count

    local.clear()
    remote.sync()

In [None]:
import time

In [None]:
# Count uni-/bi-/trigrams of the training corpus, accumulating them in
# in-memory Counters that are flushed into on-disk shelves every 10000 lines.
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

# The shelves are opened without writeback: the stored values are plain ints
# that flush() re-assigns on every update, so the writeback cache is not
# needed and would only keep every touched key in memory.
with io.open('../data/train_v2.txt', encoding='utf-8', mode='rt') as file, \
        shelve.open('unigram_db') as unigram_db, \
        shelve.open('bigram_db') as bigram_db, \
        shelve.open('trigram_db') as trigram_db:

    for step, line in enumerate(file):

        if step % 10000 == 0:
            # Record the batch sizes before flush() empties the counters.
            pending = (len(unigram_counter), len(bigram_counter), len(trigram_counter))
            start = time.time()
            flush(unigram_counter, unigram_db)
            flush(bigram_counter, bigram_db)
            flush(trigram_counter, trigram_db)
            elapsed = time.time() - start
            print(f"working on {step // 1000}kth line,\n"
                  f" count of unigram - {pending[0]}, \n"
                  f" count of bigram - {pending[1]},\n"
                  f" count of trigram - {pending[2]}\n"
                  f" time = {elapsed}", end='\r')

        # Everything after the first comma is the sentence text (the whole
        # line when there is no comma); double quotes become spaces.
        comma_pos = line.find(',')
        sentence = line[comma_pos + 1:].replace('"', ' ').lower()
        sentence = nltk.word_tokenize(sentence)

        unigram_counter.update(sentence)
        bigram_counter.update(get_windows(sentence, 2))
        trigram_counter.update(get_windows(sentence, 3))

    # Persist the lines counted since the last periodic flush; without this
    # the tail of the corpus never reaches the shelves.
    flush(unigram_counter, unigram_db)
    flush(bigram_counter, bigram_db)
    flush(trigram_counter, trigram_db)

## Database (MongoDB)

In [7]:
from pymongo import MongoClient

In [8]:
# Smoke test of the MongoDB connection: store three key/value pairs and read
# one of them back.
d = {'string1' : 1, 'string2' : 2, 'string3' : 3}
# NOTE(review): port 50000 is not MongoDB's default (27017), and the counting
# cell further down connects with MongoClient() defaults. Confirm which
# server is intended.
cl = MongoClient('localhost', 50000)
db = cl['example_db']
example_collection = db['example-collection']
for key, value in d.items():
    # Collection.save() is deprecated and removed in PyMongo 4. An upsert
    # keyed on 'key' also keeps the cell idempotent when it is re-run.
    example_collection.update_one({'key': key}, {'$set': {'value': value}}, upsert=True)

obj = example_collection.find_one({'key':'string1'})
print(obj['value'])

  


ServerSelectionTimeoutError: localhost:50000: [WinError 10061] Подключение не установлено, т.к. конечный компьютер отверг запрос на подключение

In [None]:
def flush(local, remote):
    """Add the counts held in ``local`` to a MongoDB collection, then clear ``local``.

    For every ``key -> count`` pair, the document whose ``'key'`` field
    equals ``key`` has its ``'value'`` field incremented by ``count``. The
    document is created when it does not exist yet (upsert), so each key
    maps to exactly one document however often it is flushed.

    Args:
        local: Mapping of key -> count providing ``items()`` and ``clear()``.
        remote: Collection-like object providing
            ``update_one(filter, update, upsert=...)``.
    """
    for key, value in local.items():
        # A single server-side $inc upsert replaces the former
        # find_one() + save() pair. save() without an _id inserted a new
        # document on every flush, so totals were spread over duplicates.
        remote.update_one({'key': key}, {'$inc': {'value': value}}, upsert=True)

    local.clear()

In [None]:
# Count uni-/bi-/trigrams of the training corpus, accumulating them in
# in-memory Counters that are flushed into MongoDB collections every 10000
# lines.
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()
client = MongoClient()
db = client['ngram_db']
unigram_db = db['unigram-db']
bigram_db = db['bigram-db']
trigram_db = db['trigram-db']

with io.open('../data/train_v2.txt', encoding='utf-8', mode='rt') as file:

    for step, line in enumerate(file):

        if step % 10000 == 0:
            # Record the batch sizes before flush() empties the counters.
            pending = (len(unigram_counter), len(bigram_counter), len(trigram_counter))
            start = time.time()
            flush(unigram_counter, unigram_db)
            flush(bigram_counter, bigram_db)
            flush(trigram_counter, trigram_db)
            elapsed = time.time() - start
            print(f"working on {step // 1000}kth line,\n"
                  f" count of unigram - {pending[0]}, \n"
                  f" count of bigram - {pending[1]},\n"
                  f" count of trigram - {pending[2]}\n"
                  f" time = {elapsed}", end='\r')

        # Everything after the first comma is the sentence text (the whole
        # line when there is no comma); double quotes become spaces.
        comma_pos = line.find(',')
        sentence = line[comma_pos + 1:].replace('"', ' ').lower()
        sentence = nltk.word_tokenize(sentence)

        unigram_counter.update(sentence)
        bigram_counter.update(get_windows(sentence, 2))
        # Trigrams were never counted here although they are flushed above.
        trigram_counter.update(get_windows(sentence, 3))

# Persist the lines counted since the last periodic flush; without this the
# tail of the corpus never reaches the database.
flush(unigram_counter, unigram_db)
flush(bigram_counter, bigram_db)
flush(trigram_counter, trigram_db)


## simple counting

In [None]:
# Count uni-/bi-/trigrams of the training corpus entirely in memory.
# Fresh counters are created here so the totals reported by the summary
# cells come from this pass only, not from leftovers of earlier cells.
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

with io.open('../data/train_v2.txt', encoding='utf-8', mode='rt') as file:
    for step, line in enumerate(file):
        if not (step % 1000):
            print(f"working on {step // 1000}kth line,\n count of unigram - {len(unigram_counter)}, \
            \n count of bigram - {len(bigram_counter)},\n count of trigram - {len(trigram_counter)}", end='\r')

        # Everything after the first comma is the sentence text (the whole
        # line when there is no comma); double quotes are removed.
        comma_pos = line.find(',')
        sentence = line[comma_pos+1:].replace('"', '').lower()
        sentence = nltk.word_tokenize(sentence)

        # The tokens used to be computed and then thrown away, because every
        # counting variant in this cell was commented out.
        unigram_counter.update(sentence)
        bigram_counter.update(get_windows(sentence, 2))
        trigram_counter.update(get_windows(sentence, 3))

In [None]:
# Report how many distinct unigrams were counted, then show the 30 most frequent.
unigram_summary = f'number of all unigram in train corpus: {len(unigram_counter)}'
print(unigram_summary)
unigram_counter.most_common(30)

In [None]:
# Report how many distinct bigrams were counted, then show the 30 most frequent.
bigram_summary = f'number of all bigram in train corpus: {len(bigram_counter)}'
print(bigram_summary)
bigram_counter.most_common(30)

In [None]:
# Report how many distinct trigrams were counted, then show the 30 most frequent.
trigram_summary = f'number of all trigram in train corpus: {len(trigram_counter)}'
print(trigram_summary)
trigram_counter.most_common(30)

##