In [1]:
def is_string(string):
    '''
        Tell whether the given value is a text string.

        input:
            string: value to check
        output:
            True if the value is an instance of str (subclasses included), False otherwise
    '''
    is_text = isinstance(string, str)
    return is_text

def is_int(num):
    '''
        Tell whether the given value is an integer.

        input:
            num: value to check
        output:
            True if the value is an instance of int, False otherwise.
            bool values also pass, because bool is a subclass of int.
    '''
    is_integer = isinstance(num, int)
    return is_integer

def is_positive_int(num):
    '''
        Tell whether the given value is an integer that is not negative.

        input:
            num: value to check
        output:
            True if is_int accepts the value and it is greater than or equal to 0
            (zero is accepted), False if is_int rejects it
    '''
    # reject non-integers first so the comparison below only sees integers
    if not is_int(num):
        return False
    return num >= 0

In [9]:
import re
import pickle

def count_bigrams_in_file(file_name):
    '''
        Count word bigrams in a text file, save the counts to 'Bigrams.pickle' and return them.

        Every line is lower-cased and every character that is neither a word character
        nor whitespace is removed before the line is split into words. Bigrams are taken
        over the whole word stream, so the last word of one line and the first word of
        the next non-empty line also form a bigram.

        input:
            file_name: name of a given file
        output:
            dict_bigram: dictionary mapping (first_word, second_word) tuples to the number
                         of times that pair appears in the text. The same dictionary is
                         pickled (protocol 0) to 'Bigrams.pickle' in the current directory.
        raises:
            TypeError: if file_name is not a string
            FileExistsError: if the file cannot be opened
    '''

    # check input
    if not is_string(file_name):
        raise TypeError('Input file should be in string format')

    # open file
    # NOTE: FileNotFoundError would describe this failure better, but FileExistsError
    # is kept so that existing callers catching it keep working. ValueError is caught
    # as well because open() raises it for names it cannot use (e.g. an embedded null).
    try:
        file_read = open(file_name)
    except (OSError, ValueError) as err:
        raise FileExistsError("File " + file_name + " does not exist") from err

    dict_bigram = dict()
    prev_word = None

    # the with-block closes the file even if reading or decoding fails midway
    with file_read:
        for line in file_read:
            # strip punctuation, then split the line to get a list of words
            for word in re.sub(r'[^\w\s]', '', line.lower()).split():
                # prev_word carries over between lines, which yields the
                # cross-line bigrams; only the very first word has no partner
                if prev_word is not None:
                    elem = (prev_word, word)
                    dict_bigram[elem] = dict_bigram.get(elem, 0) + 1
                prev_word = word

    with open('Bigrams.pickle', 'wb') as handle:
        pickle.dump(dict_bigram, handle, protocol=0)

    return dict_bigram


In [10]:
# count the bigrams of the sample text; this (re)writes Bigrams.pickle
count_bigrams_in_file('text_template.txt')

# load the pickled dictionary back from disk
with open('Bigrams.pickle', 'rb') as handle:
    dic_pickled = pickle.load(handle)

# show the bigram counts that were read back
print(dic_pickled)

