## Counting Words

In [None]:
%%writefile input.txt

As I was waiting, a man came out of a side room, and at a glance I was sure he must be Long John. His left leg was cut off close by the hip, and under the left shoulder he carried a crutch, which he managed with wonderful dexterity, hopping about upon it like a bird. He was very tall and strong, with a face as big as a ham—plain and pale, but intelligent and smiling. Indeed, he seemed in the most cheerful spirits, whistling as he moved about among the tables, with a merry word or a slap on the shoulder for the more favoured of his guests.

In [None]:
with open("input.txt", "r") as f:
        text = f.read()
        text= text.lower()
        print(text)
        
        import re
        pattern= "[^a-zA-Z0-9]"
        text= re.split(pattern, text)
        print(text)
        
        

In [None]:
"""Count words."""

def count_words(text):
    """Count how many times each unique word occurs in text."""
    counts = dict()  # dictionary of { <word>: <count> } pairs to return
    
    # TODO: Convert to lowercase
    text= text.lower()
    
    # TODO: Split text into tokens (words), leaving out punctuation
    # (Hint: Use regex to split on non-alphanumeric characters)
    import re
    pattern= "[^a-z0-9]"
    text= re.split(pattern, text)
    # remove pesky empty strings
    text= filter(None, text)
    
    # TODO: Aggregate word counts using a dictionary
    import collections
    counts= collections.Counter()
    counts.update(text)
    
    return counts


def test_run(my_input_text):
    with open(my_input_text, "r") as f:
        text = f.read()
        counts = count_words(text)
        sorted_counts = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        
        print("10 most common words:\nWord\tCount")
        for word, count in sorted_counts[:10]:
            print("{}\t{}".format(word, count))
        
        print("\n10 least common words:\nWord\tCount")
        for word, count in sorted_counts[-10:]:
            print("{}\t{}".format(word, count))


if __name__ == "__main__":
    test_run("input.txt")


In [None]:
%%writefile test_case2.txt
Buffalo buffalo Buffalo, buffalo buffalo!

In [None]:
my_string= "Buffalo buffalo Buffalo, buffalo buffalo!"

In [None]:
my_string= my_string.lower()
my_string

In [None]:
pattern= "[^a-z0-9]"
my_string= re.search(pattern, my_string)
my_string.groups()

In [None]:
test_run("test_case2.txt")

## Read Text Files

In [None]:
"""Reading in text files."""

def read_file(filename):
    """Read a plain text file and return the contents as a string."""
    with open(filename, 'r') as f:
        text= f.read()
        return text


def read_files(path):
    """Read all files that match given path and return a dict with their contents."""
    import glob
    import os
    
    #import pdb;pdb.set_trace()
    # Get a list of all files and filenames
    file_list= glob.glob(path)   
    filenames= [os.path.basename(file_path) for file_path in file_list]
    
    
    # Read each file using read_file()  and place assaemble dict of form { <filename>: <contents> }
    contents= [read_file(file_path) for file_path in file_list]
    
    contents_dict= dict(zip(filenames, contents))
    
    return contents_dict


def test_run():
    # Test read_file()
    print(read_file("data/hieroglyph.txt"))
    
    # Test read_files()
    texts = read_files("data/*.txt")
    for name in texts:
        print("\n***", name, "***")
        print(texts[name])


if __name__ == '__main__':
    test_run()


## Bigram Model

In [91]:
# %load bigram.py
"""Bigram Model."""

import os
import re
import random
import collections


def read_file(filename):
    """Read a plain text file and return the contents as a string."""
    with open(filename, 'r') as f:
        text= f.read()
        return text

    

def compute_bigram_model(path, files):
    """Compute a bigram model for a given corpus, including unigram probabilities.

    Params
    ======
        path: directory where input files are located
        files: list of files, or a single string specifying regex pattern to match (e.g. r'.*\.txt')

    Returns
    =======
        p_unigrams: dict with frequency of single words (need not be normalized to [0, 1])
        p_bigrams: dict of dicts with frequency of bigrams (need not be normalized to [0, 1])

    """

    # Grab a list of all files in specified corpus
    import pdb;pdb.set_trace()
    if isinstance(files, str):
        files = [f for f in os.listdir(path) if re.match(files, f)]  # collect all matching filenames
    files = [os.path.join(path, f) for f in files]  # prepend path to each filename

    # Read in text from each file and combine into a single string
    filenames= [os.path.basename(file_path) for file_path in files]
    contents= [read_file(file_path) for file_path in files]    
    contents_dict= dict(zip(filenames, contents))    
    

    # Clean and tokenize text (note that you may want to retain case and sentence delimiters)
    # Remove punctuation characters
    import pdb;pdb.set_trace()
    for key, val in contents_dict.items():
        contents_dict[key]= re.sub(r"[^a-zA-Z0-9]", " ", val).split()
        words= contents_dict[key]
        unigram_counts= collections.Counter()
        unigram_counts.update(words)
        total_counts= float(sum(unigram_counts.values()))
        bigrams= collections.defaultdict(list)
        
        
        # unigram probabilities       
        #unigrams= [words[i] for i in range(len(words) - 1)]
        for key, val in unigram_counts.items():
            unigram_counts[key]
            unigram_vals= unigram_counts.values()
            unigram_elements= unigram_counts.elements()
            unigram_probs= [value/total_counts for value in unigram_vals]
        p_unigrams= dict(zip(unigram_elements, unigram_probs))
        
        # assemble bigrams dictionary
        for i in range(len(words) - 1):
            current_word= words[i]
            next_word= words[i + 1]
            bigrams[current_word].append(next_word)
            
        # convert to probabilities
        p_bigrams= dict()
        for word in bigrams.keys():
            bigram_counts= collections.Counter()
            bigram_counts.update(bigrams[word])
            bigram_elements= bigram_counts.elements()
            bigram_values= bigram_counts.values()
            word_sum= float(sum(bigram_values))
            bigram_probs= [value/word_sum for value in bigram_values]
            p_bigrams[word]= dict(zip(bigram_elements, bigram_probs))
            
            

    return p_unigrams, p_bigrams

In [89]:
def generate_sequence(p_unigrams, p_bigrams, num_words=100, seed_word=None):
    """Generate a random sequence of words, given unigram and bigram probabilities."""

    # If seed_word is not given, pick one randomly based on unigram probabilities
    if seed_word is None:
        seed_word = random.choices(list(p_unigrams.keys()), weights=list(p_unigrams.values()))[0]
    seq = [seed_word]
    for i in range(num_words):
        seq.append(random.choices(list(p_bigrams[seq[-1]].keys()), weights=list(p_bigrams[seq[-1]].values()))[0])
    return seq

In [90]:
p_uni, p_bi= compute_bigram_model(path='.', files=['carroll-alice.txt'])

> <ipython-input-88-0cada4a6daf4>(35)compute_bigram_model()
-> if isinstance(files, str):
(Pdb) c
> <ipython-input-88-0cada4a6daf4>(48)compute_bigram_model()
-> for key, val in contents_dict.items():
(Pdb) c


In [None]:
def test_run():
    # Compute bigram model
        p_unigrams, p_bigrams = compute_bigram_model(path='.', files=['carroll-alice.txt'])

    # Check most common unigrams (single words)
    print("10 most common unigrams:")
    sorted_unigrams = sorted(p_unigrams.items(), key=lambda item: item[1], reverse=True)  # each item = (i, count)
    for word, count in sorted_unigrams[:10]:
        print("{}\t{}".format(word, count))

    # Check most common bigrams (pairs of words)
    all_bigrams = [(i, j, count) for i in p_bigrams.keys() for j, count in p_bigrams[i].items()]
    sorted_bigrams = sorted(all_bigrams, key=lambda item: item[2], reverse=True)  # each item = (i, j, count)
    print("10 most common bigrams:")
    for i, j, count in sorted_bigrams[:10]:
        print("{}\t{}\t{}".format(i, j, count))

    # Generate a sample sequence of words
    seq = generate_sequence(p_unigrams, p_bigrams, seed_word="Alice")
    print(" ".join(seq))


if __name__ == "__main__":
    test_run()

## Example from Mentor

In [8]:
from collections import defaultdict


corpus = "the cat is red the cat is green the cat is blue the dog is red"

tokenized_string = corpus.split()
previous_word = ""
dictionary = defaultdict(list)

# Step 1: build Bigram dictionary
for current_word in tokenized_string:
  if previous_word != "":
    dictionary[previous_word].append(current_word)
  previous_word = current_word

print(tokenized_string)
print(dictionary)

# Step 2: compute conditional probability
for key in dictionary.keys():
  next_words = dictionary[key]
  unique_words = set(next_words)  # removes duplicated
  nb_words = len(next_words)
  cond_prob = {}
  for unique_word in unique_words:
    cond_prob[unique_word] = float(next_words.count(unique_word)) / nb_words
  dictionary[key] = cond_prob

print(dictionary)

['the', 'cat', 'is', 'red', 'the', 'cat', 'is', 'green', 'the', 'cat', 'is', 'blue', 'the', 'dog', 'is', 'red']
defaultdict(<type 'list'>, {'blue': ['the'], 'is': ['red', 'green', 'blue', 'red'], 'dog': ['is'], 'cat': ['is', 'is', 'is'], 'green': ['the'], 'the': ['cat', 'cat', 'cat', 'dog'], 'red': ['the']})
defaultdict(<type 'list'>, {'blue': {'the': 1.0}, 'is': {'blue': 0.25, 'green': 0.25, 'red': 0.5}, 'dog': {'is': 1.0}, 'cat': {'is': 1.0}, 'green': {'the': 1.0}, 'the': {'dog': 0.25, 'cat': 0.75}, 'red': {'the': 1.0}})


In [55]:
from collections import defaultdict


corpus = "the cat is red the cat is green the cat is blue the dog is red"

words = corpus.split()
previous_word = ""
bigrams = defaultdict(list)

# Step 1: build Bigram dictionary
#bigrams = [(words[i], words[i + 1]) for i in range(len(words) - 1)]

for i in range(len(words) - 1):
    current_word= words[i]
    next_word= words[i + 1]
    bigrams[current_word].append(next_word)
    
# Step 2: compute conditional probability
p_bigrams= dict()
for word in bigrams.keys():
    counts= collections.Counter()
    counts.update(bigrams[word])
    elements= counts.elements()
    values= counts.values()
    word_sum= float(sum(values))
    probs= [value/word_sum for value in values]
    p_bigrams[word]= dict(zip(elements, probs))



print(words)
print(bigrams)
print(p_bigrams)

['the', 'cat', 'is', 'red', 'the', 'cat', 'is', 'green', 'the', 'cat', 'is', 'blue', 'the', 'dog', 'is', 'red']
defaultdict(<type 'list'>, {'blue': ['the'], 'is': ['red', 'green', 'blue', 'red'], 'dog': ['is'], 'cat': ['is', 'is', 'is'], 'green': ['the'], 'the': ['cat', 'cat', 'cat', 'dog'], 'red': ['the']})
{'blue': {'the': 1.0}, 'is': {'blue': 0.25, 'green': 0.25, 'red': 0.5}, 'dog': {'is': 1.0}, 'cat': {'is': 1.0}, 'green': {'the': 1.0}, 'the': {'dog': 0.25, 'cat': 0.75}, 'red': {'the': 1.0}}
