# Imports

In [1]:
# Basics
import numpy as np
import pandas as pd
import re
import requests

# Speciality
import unicodedata
import json
from time import strftime
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# Custom
import acquire

# Exercise 1

**Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:**

   - Lowercase everything
   - Normalize unicode characters
   - Replace anything that is not a letter, number, whitespace or a single quote.



In [2]:
# Sample String
og_string = '''One evening as the sun went down
And the jungle fire was burning
Down the track came a hobo hiking
And he said, "Boys, I'm not turning"
"I'm headed for a land that's far away
Besides the crystal fountains
So come with me, we'll go and see
The Big Rock Candy Mountains"
'''
og_string.lower()

'one evening as the sun went down\nand the jungle fire was burning\ndown the track came a hobo hiking\nand he said, "boys, i\'m not turning"\n"i\'m headed for a land that\'s far away\nbesides the crystal fountains\nso come with me, we\'ll go and see\nthe big rock candy mountains"\n'

In [3]:
# Unicodedata.normalize
string = unicodedata.normalize('NFKD', og_string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
string

'One evening as the sun went down\nAnd the jungle fire was burning\nDown the track came a hobo hiking\nAnd he said, "Boys, I\'m not turning"\n"I\'m headed for a land that\'s far away\nBesides the crystal fountains\nSo come with me, we\'ll go and see\nThe Big Rock Candy Mountains"\n'

In [4]:
# Replace anything that is not a letter, number, whitespace or a single quote
re.sub(r'[^\w\s]', '', string).lower()

'one evening as the sun went down\nand the jungle fire was burning\ndown the track came a hobo hiking\nand he said boys im not turning\nim headed for a land thats far away\nbesides the crystal fountains\nso come with me well go and see\nthe big rock candy mountains\n'

In [5]:
# Build the function
def basic_clean(string):
    '''
    Takes in any string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKD', og_string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string

# Exercise 2
**Define a function named tokenize. It should take in a string and tokenize all the words in the string.**

In [6]:
# Make the tokenizer
tokenizer = nltk.tokenize.ToktokTokenizer()
# Use it
print(tokenizer.tokenize(og_string, return_str=True))

One evening as the sun went down
And the jungle fire was burning
Down the track came a hobo hiking
And he said , " Boys , I ' m not turning " 
 " I ' m headed for a land that ' s far away
Besides the crystal fountains
So come with me , we ' ll go and see
The Big Rock Candy Mountains "


In [7]:
# Build the function
def tokenize(string):
    '''
    Takes in a string and
    returns a tokenized string
    '''
    # Create tokenizer.
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str = True)
    return string

# Exercise 3
**Define a function named stem. It should accept some text and return the text after applying stemming to all the words.**

In [8]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

# Exercise 4
**Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.**

In [9]:
# Create the lemmatizer
wnl = nltk.stem.WordNetLemmatizer()

In [10]:
# Use the lemmatizer on each word in the list of words we created by using split
lemmas = [wnl.lemmatize(word) for word in string.split()]
    
# Join our list of words into a string again and assign to a variable
string = ' '.join(lemmas)

In [11]:
# put it together


def lemmatize(string):
    '''
    Takes in string and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

# Exercise 5
**Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.**

**This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.**

In [12]:

def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))

    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

# Exercise 6
**Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.**

In [14]:
news_df = acquire.get_inshorts_articles()




  soup = BeautifulSoup(response.text)


In [15]:
news_df.head()

Unnamed: 0,title,author,content,date,category
0,Apple delays plan requiring employees to come ...,Pragya Swastik,Apple has delayed its plan that required its e...,"18 May 2022,Wednesday",business
1,Wheat shouldn't go the way of COVID-19 vaccine...,Apaar Sharma,"Calling out the West, India said that wheat sh...","19 May 2022,Thursday",business
2,"Price of domestic LPG cylinder crosses ₹1,000-...",Apaar Sharma,The price of a 14.2-kg domestic LPG cylinder w...,"19 May 2022,Thursday",business
3,Rupee closes at a new all-time low of 77.58 ag...,Anmol Sharma,The Indian rupee closed at a new all-time low ...,"18 May 2022,Wednesday",business
4,"Target's shares crash 26%, on track for their ...",Pragya Swastik,The shares of American department store chain ...,"18 May 2022,Wednesday",business


# Exercise 7
**Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.**

In [16]:
codeup_df = acquire.get_blog_articles()



  soup = BeautifulSoup(response.text)


  soup = BeautifulSoup(response.text)


In [17]:
codeup_df.head()

Unnamed: 0,title,published,content
0,Project Quest Info Session: IT Jumpstart on Ma...,"May 11, 2022",Join our grant partner Project Quest as they d...
1,From Bootcamp to Bootcamp | A Military Appreci...,"Apr 27, 2022","In honor of Military Appreciation Month, join ..."
2,Our Acquisition of the Rackspace Cloud Academy...,"Apr 14, 2022","Just about a year ago on April 16th, 2021 we a..."
3,Learn to Code: HTML & CSS on 4/30,"Apr 1, 2022",HTML & CSS are the design building blocks of a...
4,Learn to Code: Python Workshop on 4/23,"Mar 31, 2022","According to LinkedIn, the “#1 Most Promising ..."


# Exercise 8
**For each dataframe, produce the following columns:**

   - title to hold the title
   - original to hold the original article/post content
   - clean to hold the normalized and tokenized original with the stopwords removed.
   - stemmed to hold the stemmed version of the cleaned data.
   - lemmatized to hold the lemmatized version of the cleaned data.

In [21]:
# Set up original column
news_df.rename(columns={'content': 'original'}, inplace=True)
codeup_df.rename(columns={'content': 'original'}, inplace=True)

In [22]:
# Function to clean the articles
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords,
                                  extra_words=extra_words,
                                  exclude_words=exclude_words)
    
    df['stemmed'] = df['clean'].apply(stem)
    
    df['lemmatized'] = df['clean'].apply(lemmatize)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

In [23]:
# use the function defined above for news_df's content column.

prep_article_data(news_df, 'original', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Apple delays plan requiring employees to come ...,Apple has delayed its plan that required its e...,one evening sun went jungle fire burning track...,one even sun went jungl fire burn track came h...,one evening sun went jungle fire burning track...
1,Wheat shouldn't go the way of COVID-19 vaccine...,"Calling out the West, India said that wheat sh...",one evening sun went jungle fire burning track...,one even sun went jungl fire burn track came h...,one evening sun went jungle fire burning track...
2,"Price of domestic LPG cylinder crosses ₹1,000-...",The price of a 14.2-kg domestic LPG cylinder w...,one evening sun went jungle fire burning track...,one even sun went jungl fire burn track came h...,one evening sun went jungle fire burning track...
3,Rupee closes at a new all-time low of 77.58 ag...,The Indian rupee closed at a new all-time low ...,one evening sun went jungle fire burning track...,one even sun went jungl fire burn track came h...,one evening sun went jungle fire burning track...
4,"Target's shares crash 26%, on track for their ...",The shares of American department store chain ...,one evening sun went jungle fire burning track...,one even sun went jungl fire burn track came h...,one evening sun went jungle fire burning track...


In [24]:
# use the function defined above for codeup_df's content column.

prep_article_data(codeup_df, 'original', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Project Quest Info Session: IT Jumpstart on Ma...,Join our grant partner Project Quest as they d...,one evening sun went jungle fire burning track...,one even sun went jungl fire burn track came h...,one evening sun went jungle fire burning track...
1,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ...",one evening sun went jungle fire burning track...,one even sun went jungl fire burn track came h...,one evening sun went jungle fire burning track...
2,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a...",one evening sun went jungle fire burning track...,one even sun went jungl fire burn track came h...,one evening sun went jungle fire burning track...
3,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...,one evening sun went jungle fire burning track...,one even sun went jungl fire burn track came h...,one evening sun went jungle fire burning track...
4,Learn to Code: Python Workshop on 4/23,"According to LinkedIn, the “#1 Most Promising ...",one evening sun went jungle fire burning track...,one even sun went jungl fire burn track came h...,one evening sun went jungle fire burning track...


# Exercise 9

**Ask yourself:**
    
   - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
       - Lemmatized
   - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
       - Lemmatized
   - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
       - Lemmatized

