In [18]:
def is_string(string):
    """Tell whether ``string`` is a ``str`` instance (subclasses included)."""
    verdict = isinstance(string, str)
    return verdict

def is_int(num):
    """
    Tell whether ``num`` is an ``int`` instance.

    Note that ``bool`` is a subclass of ``int``, so ``True`` and ``False``
    are reported as integers as well.
    """
    verdict = isinstance(num, int)
    return verdict

def is_positive_int(num):
    """
    Tell whether ``num`` is an integer that is not negative.

    Despite the name, zero is accepted: the check is ``num >= 0``.
    Anything rejected by ``is_int`` yields ``False`` without being compared.
    """
    # guard clause: non-integers are never compared against zero
    if not is_int(num):
        return False
    return num >= 0

In [19]:
import re
import pickle

def collocations(file_name, output_path='./support_files/Collocations.pickle'):
    """
    Build a dictionary of word collocations from a text file.

    Every line is lower-cased, stripped of all characters that are neither
    word characters nor whitespace, and split into words. Two words are
    collocated when they are direct neighbours; the last word of a line is
    also treated as the neighbour of the first word of the next non-empty
    line. The resulting dictionary is pickled (protocol 0) to ``output_path``
    and returned.

    input:
        file_name: name of the text file to read (must be a ``str``)
        output_path: where the pickled dictionary is written; defaults to
            './support_files/Collocations.pickle'
    output:
        dict_colloc: dictionary mapping each word to the set of words
            that appear directly before or after it
    raises:
        TypeError: if ``file_name`` is not a string
        FileExistsError: if ``file_name`` cannot be opened. The exception
            type is kept for backward compatibility with existing callers;
            the underlying error is attached as ``__cause__``.
    """

    # check input
    if not isinstance(file_name, str):
        raise TypeError('Input file should be in string format')

    # open file; only failures of open() itself are translated, so that
    # KeyboardInterrupt / SystemExit are no longer swallowed
    try:
        file_read = open(file_name)
    except (OSError, ValueError) as err:
        raise FileExistsError("File " + file_name + " does not exist") from err

    # compiled once instead of on every line
    punctuation = re.compile(r'[^\w\s]')

    dict_colloc = {}
    prev_word = ""

    # the context manager closes the file even if processing fails
    with file_read:
        for line in file_read:
            # split string to get list of words
            words = punctuation.sub('', line.lower()).split()

            # if no elements left after preprocessing, continue
            # (prev_word is deliberately kept across empty lines)
            if not words:
                continue

            # link the last word of the previous line to this line
            if prev_word:
                words.insert(0, prev_word)
            prev_word = words[-1]

            # every adjacent pair is a collocation in both directions
            for left, right in zip(words, words[1:]):
                dict_colloc.setdefault(left, set()).add(right)
                dict_colloc.setdefault(right, set()).add(left)

    with open(output_path, 'wb') as handle:
        pickle.dump(dict_colloc, handle, protocol=0)

    return dict_colloc


In [20]:
# build the collocation dictionary; the function pickles it to disk
collocations('./support_files/text_template.txt')

# read the pickled dictionary back in
with open('./support_files/Collocations.pickle', 'rb') as pickle_file:
    dict_pickled = pickle.load(pickle_file)

# turn the (word, neighbours) pairs into a list so they can be indexed
list_of_items = list(dict_pickled.items())

# show three sample entries taken at fixed positions,
# each followed by the same blank-line padding as before
for position in (0, 300, 1000):
    print(list_of_items[position], "\n\n", sep="\n")

('war', {'always', 'finnish', 'replied', 'popular', 'he', 'thats', 'let', 'also', 'count', 'and', 'needlessly', 'a', 'commit', 'declaring', 'half', 'desire', 'russian', 'one', 'footing', 'guerrilla', 'on', 'wretched', 'real', 'national', 'began', 'for', 'has', 'if', 'patriotic', 'council', 'of', 'napoleon', 'austroprussian', 'be', 'andrew', 'bolkonski', 'there', 'cannot', 'which', 'gigantic', 'says', 'went', 'was', 'crimesmake', 'are', 'old', 'news', 'with', 'no', 'we', 'greatest', 'five', 'kutuzovs', 'presented', 'now', 'anticipate', 'tactical', 'obviously', 'last', 'believe', 'between', 'in', 'how', 'less', 'waged', 'nor', 'said', 'is', 'should', 'they', 'new', 'unlike', 'all', 'that', 'carried', 'without', 'beyond', 'thing', 'terrible', 'reproaches', 'scythian', 'trophy', 'comrade', 'impending', 'recommenced', 'undoubtedly', 'or', 'even', 'fresh', 'latest', 'what', 'plan', 'the', 'neither', 'asked', 'conducting', 'added', 'turkish', 'appears', 'will', 'prince', 'not', 'want', 'would