In [1]:
import requests
import bs4
import pandas as pd
import os


import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import acquire as a

In [2]:
# download the required corpora the first time (already-present packages are
# reported as up-to-date): wordnet is used by the lemmatizer, stopwords by
# remove_stopwords further down
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/forrestmccrosky/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Exercises

The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

### Acquire 

First, let's get some text to work with.

In [3]:
# make the http request and turn the response into a beautiful soup object

headers = {'User-Agent': 'Codeup Data Science'}

response = requests.get('https://codeup.com/codeups-data-science-career-accelerator-is-here/',
                        headers=headers,
                        timeout=30)  # don't let a stalled connection hang the notebook

# stop here with the HTTP status instead of silently parsing an error page below
response.raise_for_status()

html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')

In [4]:
## locate the post body and keep its first 300 characters as the working sample

article = soup.find('div', class_ = 'jupiterx-post-content')

original = article.text[:300] ## untouched sample, kept for reference
string_function = original ## same sample, used as the input when testing the functions

#### Question 1:

Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

 - Lowercase everything
 - Normalize unicode characters
 - Remove anything that is not a letter, number, whitespace or a single quote (i.e. replace it with an empty string).

In [5]:
original ## display the raw sample string that the exercises below work from

'The rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in\xa0Glassdoor’s #1 Best Job in America.\nData Science is a'

In [6]:
string = original.lower() ## lowercase the sample; `original` itself is left unchanged

string ## <-- quality assurance check

'the rumors are true! the time has arrived. codeup has officially opened applications to our new data science career accelerator, with only 25 seats available! this immersive program is one of a kind in san antonio, and will help you land a job in\xa0glassdoor’s #1 best job in america.\ndata science is a'

In [7]:
## normalize the unicode characters: NFKD-decompose, then drop whatever
## cannot be represented in ascii

string = (
    unicodedata.normalize('NFKD', string)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
)

string ## <-- quality assurance check

'the rumors are true! the time has arrived. codeup has officially opened applications to our new data science career accelerator, with only 25 seats available! this immersive program is one of a kind in san antonio, and will help you land a job in glassdoors #1 best job in america.\ndata science is a'

In [8]:
## remove every character that is not a lowercase letter, a digit, a single quote or whitespace
string = re.sub(r"[^a-z0-9'\s]", '', string)

string ## <-- quality assurance check

'the rumors are true the time has arrived codeup has officially opened applications to our new data science career accelerator with only 25 seats available this immersive program is one of a kind in san antonio and will help you land a job in glassdoors 1 best job in america\ndata science is a'

In [9]:
def basic_clean(string):
    '''
    Apply basic text cleaning to a string and return the cleaned string.

    Steps, in order:
      1. lowercase every character
      2. NFKD-normalize the text and drop anything that cannot be encoded as ascii
      3. remove every character that is not a-z, 0-9, a single quote or whitespace
    '''
    lowered = string.lower()

    ## decompose unicode characters, then discard the non-ascii remainder
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    ## keep only letters, digits, single quotes and whitespace
    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)

In [10]:
string_test = basic_clean(string_function) ## run the function on the untouched 300-character sample

string_test ## <-- quality assurance check

'the rumors are true the time has arrived codeup has officially opened applications to our new data science career accelerator with only 25 seats available this immersive program is one of a kind in san antonio and will help you land a job in glassdoors 1 best job in america\ndata science is a'

#### Question 2:

Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [11]:
#create the tokenizer
tokenizer = nltk.tokenize.ToktokTokenizer()

#use the tokenizer; return_str=True hands the result back as one string,
#and print() renders the embedded newline as an actual line break
print(tokenizer.tokenize(string_test, return_str=True))

the rumors are true the time has arrived codeup has officially opened applications to our new data science career accelerator with only 25 seats available this immersive program is one of a kind in san antonio and will help you land a job in glassdoors 1 best job in america
data science is a


In [12]:
def tokenize(string):
    '''
    Tokenize the words in a string with NLTK's ToktokTokenizer.

    A new tokenizer is built on every call, and the result is handed back
    as a single string (return_str=True).
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [13]:
string = tokenize(string_test) ## tokenize the cleaned sample; `string` now holds the tokenized text

string ## <-- quality assurance check

'the rumors are true the time has arrived codeup has officially opened applications to our new data science career accelerator with only 25 seats available this immersive program is one of a kind in san antonio and will help you land a job in glassdoors 1 best job in america\ndata science is a'

#### Question 3:

Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [14]:
# Create the nltk stemmer object, then use it
ps = nltk.porter.PorterStemmer()

# stem each whitespace-separated word, then glue the stems back together with
# single spaces (the newline in the input is therefore not preserved)
stems = [ps.stem(word) for word in string.split()]
strings_stemmed = ' '.join(stems)

strings_stemmed

'the rumor are true the time ha arriv codeup ha offici open applic to our new data scienc career acceler with onli 25 seat avail thi immers program is one of a kind in san antonio and will help you land a job in glassdoor 1 best job in america data scienc is a'

In [15]:
def stem(string):
    '''
    Return the input text with every word replaced by its Porter stem.

    The text is split on whitespace and the stems are re-joined with single
    spaces, so the original spacing and line breaks are not kept.
    '''
    stemmer = nltk.porter.PorterStemmer()

    ## stem word by word and rebuild the string in one pass
    return ' '.join(stemmer.stem(token) for token in string.split())

In [16]:
## using our function to stem the words
## NOTE: this overwrites `string`, so re-running the cell stems the already-stemmed text
string = stem(string)
 
string ## <-- quality assurance check

'the rumor are true the time ha arriv codeup ha offici open applic to our new data scienc career acceler with onli 25 seat avail thi immers program is one of a kind in san antonio and will help you land a job in glassdoor 1 best job in america data scienc is a'

#### Question 4:

Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [17]:
def lemmatize(string):
    '''
    Return the input text with every word passed through NLTK's WordNetLemmatizer.

    The text is split on whitespace and the lemmas are re-joined with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()

    ## lemmatize word by word and rebuild the string in one pass
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

In [18]:
## using our function
## NOTE(review): `string` already holds the stemmed text from the stem cell above,
## which is why the output below is identical to the stemmed output; to see the
## lemmatizer on its own, feed it the tokenized (un-stemmed) text instead
string = lemmatize(string)

string ## <-- quality assurance check

'the rumor are true the time ha arriv codeup ha offici open applic to our new data scienc career acceler with onli 25 seat avail thi immers program is one of a kind in san antonio and will help you land a job in glassdoor 1 best job in america data scienc is a'

#### Question 5: 

Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [19]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    Return the input text with all stopwords removed.

    The stopword set starts from NLTK's english stopword list; exclude_words
    are taken out of it first, then extra_words are added to it.

    Parameters
    ----------
    string : str
        Text to filter. It is split on whitespace and the remaining words are
        re-joined with single spaces.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words that should NOT be removed even though they are stopwords.
        Entries that are not stopwords to begin with are ignored.

    Returns
    -------
    str
        The text without stopwords.

    Side effects
    ------------
    Prints how many words were removed.
    '''
    ## a set gives constant-time membership checks in the filter below
    stopword_set = set(stopwords.words('english'))

    ## removing excluded words (set difference does not raise when a word is absent)
    stopword_set -= set(exclude_words or ())

    ## adding extra words
    stopword_set |= set(extra_words or ())

    ## creating a list of words from the original inputted string after splitting them
    words = string.split()

    ## removing stop words and setting it equal to filtered_words
    filtered_words = [w for w in words if w not in stopword_set]

    print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
    print('---')

    ## recreating our string without stopwords
    return ' '.join(filtered_words)

In [20]:
## using our function to remove stop words (default english list, no extras or exclusions);
## the function also prints how many words it removed
string = remove_stopwords(string)

string ## <-- quality assurance check

Removed 18 stopwords
---


'rumor true time ha arriv codeup ha offici open applic new data scienc career acceler onli 25 seat avail thi immers program one kind san antonio help land job glassdoor 1 best job america data scienc'

#### Question 6:

Use the data from your acquire module to produce a dataframe of the news articles. Name the dataframe news_df.