# Imports

In [5]:
# Basics
import numpy as np
import pandas as pd
import re
import requests

# Text processing and utilities
import unicodedata
import json
from time import strftime
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# Custom
import acquire

# Exercise 1

**Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:**

   - Lowercase everything
   - Normalize unicode characters
   - Remove anything that is not a letter, number, whitespace or a single quote.



In [13]:
# Sample string: a multi-line lyric containing capitals, commas,
# double quotes and apostrophes, used to try out each cleaning step below.
og_string = '''One evening as the sun went down
And the jungle fire was burning
Down the track came a hobo hiking
And he said, "Boys, I'm not turning"
"I'm headed for a land that's far away
Besides the crystal fountains
So come with me, we'll go and see
The Big Rock Candy Mountains"
'''
# Step 1: lowercase everything (displays the result; og_string itself is unchanged).
og_string.lower()

'one evening as the sun went down\nand the jungle fire was burning\ndown the track came a hobo hiking\nand he said, "boys, i\'m not turning"\n"i\'m headed for a land that\'s far away\nbesides the crystal fountains\nso come with me, we\'ll go and see\nthe big rock candy mountains"\n'

In [14]:
# Step 2: normalize unicode characters.
# NFKD decomposes characters into their compatibility form, encoding to
# ASCII with 'ignore' drops whatever cannot be represented in ASCII, and
# decoding turns the resulting bytes back into a str.
string = unicodedata.normalize('NFKD', og_string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
# Display the normalized string.
string

'One evening as the sun went down\nAnd the jungle fire was burning\nDown the track came a hobo hiking\nAnd he said, "Boys, I\'m not turning"\n"I\'m headed for a land that\'s far away\nBesides the crystal fountains\nSo come with me, we\'ll go and see\nThe Big Rock Candy Mountains"\n'

In [15]:
# Step 3: remove every character that is not a word character (\w) or
# whitespace (\s), then lowercase the result.
# NOTE: this pattern also removes single quotes (e.g. "I'm" becomes "im"),
# because ' is neither a word character nor whitespace.
re.sub(r'[^\w\s]', '', string).lower()

'one evening as the sun went down\nand the jungle fire was burning\ndown the track came a hobo hiking\nand he said boys im not turning\nim headed for a land thats far away\nbesides the crystal fountains\nso come with me well go and see\nthe big rock candy mountains\n'

In [None]:
# Build the function
def basic_clean(string):
    '''
    Takes in any string and
    returns the string normalized.

    The text is lowercased, unicode characters are reduced to their
    ASCII equivalents (characters with no ASCII form are dropped), and
    every character that is not a letter, a digit, whitespace or a
    single quote is removed.
    '''
    # Lowercase first so the character class below only needs a-z.
    string = string.lower()
    # NFKD splits accented characters into base letter + combining mark;
    # the ASCII round-trip then discards the marks and any other
    # non-ASCII characters.
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    # Keep letters, digits, whitespace and single quotes; drop the rest.
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    return string

# Exercise 2
**Define a function named tokenize. It should take in a string and tokenize all the words in the string.**

In [16]:
# Create a ToktokTokenizer instance.
tokenizer = nltk.tokenize.ToktokTokenizer()
# Tokenize the sample string; return_str=True yields a single string
# rather than a list of tokens, which is then printed.
print(tokenizer.tokenize(og_string, return_str=True))

One evening as the sun went down
And the jungle fire was burning
Down the track came a hobo hiking
And he said , " Boys , I ' m not turning " 
 " I ' m headed for a land that ' s far away
Besides the crystal fountains
So come with me , we ' ll go and see
The Big Rock Candy Mountains "


In [None]:
# Build the function
def tokenize(string):
    '''
    Run the given string through NLTK's ToktokTokenizer and
    return the tokenized text as a single string.
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    # return_str=True asks the tokenizer for a string instead of a token list.
    return toktok.tokenize(string, return_str=True)

# Exercise 3
**Define a function named stem. It should accept some text and return the text after applying stemming to all the words.**

In [None]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.

    The string is split on whitespace, each word is passed through
    NLTK's Porter stemmer, and the stems are joined back together
    with single spaces.
    '''
    # Create porter stemmer via the documented nltk.stem namespace.
    ps = nltk.stem.PorterStemmer()
    # Stem each whitespace-separated word.
    stems = [ps.stem(word) for word in string.split()]
    # Join the stems into a single space-separated string.
    return ' '.join(stems)

# Exercise 4
**Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.**

In [None]:
def lemmatize(string):
    '''
    Lemmatize every whitespace-separated word of the given string
    with NLTK's WordNetLemmatizer and return the lemmas joined by
    single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # Apply the lemmatizer word by word, then rebuild one string.
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

# Exercise 5
**Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.**

**This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.**

# Exercise 6
**Use your data from the acquire module to produce a dataframe of the news articles. Name the dataframe news_df.**

# Exercise 7
**Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.**

# Exercise 8
**For each dataframe, produce the following columns:**

   - title to hold the title
   - original to hold the original article/post content
   - clean to hold the normalized and tokenized original with the stopwords removed.
   - stemmed to hold the stemmed version of the cleaned data.
   - lemmatized to hold the lemmatized version of the cleaned data.

# Exercise 9

**Ask yourself:**
    
   - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
   - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
   - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

