## Simple functions 

### Tokenization

**Way 1**

In [2]:
def word_count(string):
    """Count how often each whitespace-separated word occurs in ``string``.

    Returns a dict mapping every word, exactly as written (case and any
    attached punctuation included), to its number of occurrences.
    """
    tally = {}
    for token in string.split():
        try:
            tally[token] += 1
        except KeyError:
            # First time this word is seen.
            tally[token] = 1
    return tally

In [3]:
# Demo: count the words of a short sample sentence and print the resulting dict.
print(word_count("Carl knows Jason and Jason knows Carl"))

{'Carl': 2, 'knows': 2, 'Jason': 2, 'and': 1}


**Way 2**

In [5]:
def word_count1(string):
    """Return a dict mapping each word of ``string`` to its occurrence count.

    Words are the whitespace-separated pieces of ``string``, used as-is
    (no lowercasing and no punctuation stripping).
    """
    tally = {}
    for token in string.split():
        # Make sure the key exists, then bump its counter.
        tally.setdefault(token, 0)
        tally[token] += 1
    return tally

In [12]:
# Demo: try way 2 on a sentence that contains a comma.
print(word_count1("Carl knows Jason, and Jason knows Carl"))

{'Carl': 2, 'knows': 2, 'Jason,': 1, 'and': 1, 'Jason': 1}


***

### Clean the punctuation

In [9]:
def clean(word):
    """Return ``word`` lowercased with every non-letter character removed.

    Punctuation, digits and whitespace are dropped.  Letters are detected
    with ``str.isalpha`` so that accented and other non-ASCII letters are
    kept; a plain ``"a" <= c <= "z"`` range check would silently delete
    them (``"café"`` would become ``"caf"``).  For pure-ASCII input the
    result is the same as with the range check.
    """
    return "".join(c for c in word.lower() if c.isalpha())


def word_count1(string):
    """Return a dict of the counts of the cleaned words in ``string``.

    Each whitespace-separated token is normalised with ``clean`` before it
    is counted, so "Jason," and "jason" end up under the same key.
    Tokens that clean down to the empty string (for example a stand-alone
    "-" or "...") are skipped, so the result never contains a
    meaningless ``''`` key.
    """
    words = dict()
    for word in string.split():
        word = clean(word)
        if not word:
            # Nothing was left after cleaning: not a word, do not count it.
            continue
        words[word] = words.get(word, 0) + 1

    return words

In [11]:
# Demo: re-run the comma example now that every word goes through clean().
print(word_count1("Carl knows Jason, and Jason knows Carl"))

{'carl': 2, 'knows': 2, 'jason': 2, 'and': 1}


***

### Improved word_count1 so it can handle multi-line strings

In [13]:
def clean(word):
    """Return ``word`` in lowercase with all non-letter characters removed.

    Punctuation, digits and whitespace are dropped.  ``str.isalpha`` is
    used as the letter test so that non-ASCII letters such as "é" are
    kept instead of being deleted, which is what an ASCII-only
    ``"a" <= c <= "z"`` comparison would do.  Pure-ASCII input gives the
    same result either way.
    """
    lowered = word.lower()
    return "".join(c for c in lowered if c.isalpha())


def word_count1(string):
    """Return a dict of the counts of the cleaned words in a multi-line string.

    The text is processed line by line; every whitespace-separated token is
    normalised with ``clean`` and then counted.  Tokens that clean down to
    the empty string (e.g. a lone "-" or "...") are skipped so that
    no ``''`` key appears in the result.
    """
    words = dict()
    # NOTE: str.split() without arguments already treats newlines as
    # whitespace; the explicit per-line loop is kept for readability.
    for line in string.split("\n"):
        for word in line.split():
            word = clean(word)
            if not word:
                # Nothing was left after cleaning: not a word.
                continue
            words[word] = words.get(word, 0) + 1

    return words

In [14]:
# Demo: the sample text now spans two lines (note the embedded newline).
print(word_count1("Carl knows Jason, and Jason knows Carl \n and Amanda knows both Jason and Carl"))

{'carl': 3, 'knows': 3, 'jason': 3, 'and': 3, 'amanda': 1, 'both': 1}


***

### Improved to ignore the common words ["and", "a", "an", "the"]

In [15]:
def clean(word):
    """Return ``word`` lowercased with every non-letter character removed.

    Punctuation, digits and whitespace are dropped.  ``str.isalpha`` is the
    letter test, so non-ASCII letters (e.g. "é") are kept rather than
    deleted as an ASCII-only ``"a" <= c <= "z"`` check would do.
    """
    return "".join(c for c in word.lower() if c.isalpha())


def word_count1(string, ignore=()):
    """Return a dict of the counts of the cleaned words in ``string``.

    Args:
        string: Text to analyse; may span several lines.
        ignore: Optional iterable of words to leave out of the result
            (e.g. ``["and", "a", "an", "the"]``).  Entries are passed
            through ``clean`` first, so their case and punctuation do not
            matter.  Defaults to ignoring nothing.

    Returns:
        A dict mapping each cleaned word to its number of occurrences.
        Tokens that clean down to the empty string (a lone "-",
        "...", "42") are never counted.
    """
    # Normalise the ignore list once so each lookup in the loop is O(1).
    ignored = {clean(entry) for entry in ignore}

    words = dict()
    for line in string.split("\n"):
        for word in line.split():
            word = clean(word)
            if not word or word in ignored:
                # Either nothing was left after cleaning or the caller
                # asked for this word to be skipped.
                continue
            words[word] = words.get(word, 0) + 1

    return words

In [16]:
# Sample two-line text (note the embedded newline) used by the cells below.
email = "Carl knows Jason, and Jason knows Carl \n and Amanda knows both Jason and Carl"

In [21]:
# Count the words of the sample text; the bare name on the last line is there
# so the notebook displays the resulting dict.
words = word_count1(email)
words

{'amanda': 1, 'and': 3, 'both': 1, 'carl': 3, 'jason': 3, 'knows': 3}

In [22]:
# Remove common stop words from the counts in place.  pop(word, None) is used
# instead of del so that a stop word missing from the dict does not raise
# KeyError.
for word in ["and", "a", "an", "the"]:
    words.pop(word, None)
print(words)

{'carl': 3, 'knows': 3, 'jason': 3, 'amanda': 1, 'both': 1}
