# Data Preparation Exercises

## Imports

In [1]:
# unicode, regex, json for text digestion
import unicodedata
import re
import json

# nltk: natural language toolkit -> tokenization, stopwords
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# pandas dataframe manipulation, acquire script, time formatting
import pandas as pd
import acquire
from time import strftime

# shh, down in front
import warnings
warnings.filterwarnings('ignore')

## Exercises

The end result of this exercise should be a file named `prepare.py` that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

### 1. Define a function named `basic_clean`. It should take in a string and apply some basic text cleaning to it:

* Lowercase everything
* Normalize unicode characters
* Remove (replace with an empty string) anything that is not a letter, number, whitespace, or a single quote.

In [2]:
# we will define a basic_clean function for a single document (one string)
def basic_clean(string):
    '''
    This function takes in a single document (one string) and
    returns the string normalized:
      - unicode is decomposed (NFKD) and anything without an ascii
        equivalent is dropped
      - everything is lowercased
      - every character that is not a letter, digit, whitespace or
        single quote is removed
    '''
    # NFKD splits accented letters into base letter + combining mark; encoding to
    # ascii with 'ignore' then drops the marks (and any other non-ascii character),
    # and we decode back into a regular string
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    # lowercase before filtering so the character class only has to list a-z
    string = string.lower()
    # spell out the allowed characters explicitly: \w would also let
    # underscores through, and an underscore is not a letter or a number
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    return string

### 2. Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.

In [3]:
def tokenize(string):
    '''
    Takes in one document as a string, runs it through nltk's
    ToktokTokenizer and returns the tokenized result as a string.
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    # return_str=True asks the tokenizer for a string back instead of a list of tokens
    return toktok.tokenize(string, return_str=True)

### 3. Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.

In [4]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.

    Each whitespace-separated word is passed through nltk's PorterStemmer
    and the stems are joined back together with single spaces.
    '''
    # create our stemming object; reach it through the documented nltk.stem
    # package rather than the nltk.porter alias, which only exists as a
    # side effect of nltk's star-imports
    ps = nltk.stem.PorterStemmer()
    # split() with no argument breaks on any run of whitespace, so every
    # word in the document gets stemmed exactly once
    stems = [ps.stem(word) for word in string.split()]
    # glue it back together with single spaces
    string = ' '.join(stems)

    return string

### 4. Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [5]:
def lemmatize(string):
    '''
    Takes in a string and returns a string in which every
    whitespace-separated word has been run through nltk's
    WordNetLemmatizer, re-joined with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # lemmatize token by token, then stitch the document back together
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

### 5. Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords. 

### This function should define two optional parameters, `extra_words` and `exclude_words`. These parameters should define any additional stop words to include, and any words that we don't want to remove.



In [6]:
# scratch cell: two lists with the same four values in a different order
list1 = [1, 2, 3, 4]
list2 = [2, 1, 3, 4]

# sets carry no ordering, so casting both lists to sets makes them compare equal
print(set(list1)==set(list2))

True


In [7]:
# lists compare element by element in order, so the same values in a different order are not equal
list1 == list2

False

In [8]:
# a list keeps duplicates: 'c' appears twice
mylist = ['a', 'b', 'c', 'c', 'd']

# casting to a set collapses the duplicates
myset = set(mylist)

# show both side by side
print(mylist, myset)

['a', 'b', 'c', 'c', 'd'] {'d', 'c', 'b', 'a'}


In [9]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    This function takes in a string and returns the string with
    stopwords removed.

    string: the document; it is split on whitespace and re-joined
            with single spaces
    extra_words: optional iterable of additional words to treat as
            stopwords (default None => no additions)
    exclude_words: optional iterable of words to take out of nltk's
            english stopword list so they are kept (default None => none)

    exclude_words is applied before extra_words, so a word passed in
    both ends up being removed from the document.
    '''
    # defaults are None rather than [] so no mutable object is shared between calls
    # start from nltk's english stopwords as a set for fast membership checks
    stopword_set = set(stopwords.words('english'))
    # drop any stopwords the caller wants to keep in the text
    if exclude_words is not None:
        stopword_set -= set(exclude_words)
    # add in any extra words the caller wants removed
    if extra_words is not None:
        stopword_set |= set(extra_words)
    # keep every word in our document that is not in our stopwords
    filtered_words = [word for word in string.split() if word not in stopword_set]
    # glue it back together with spaces and return the document
    return ' '.join(filtered_words)

### 6. Use your data from the acquire exercises to produce a dataframe of the news articles. Name the dataframe `news_df`.

In [10]:
# load the news articles through the project's acquire script
# (the displayed result has title, content and category columns)
news_df = acquire.get_news_articles_data()
# display the dataframe
news_df

Unnamed: 0,title,content,category
0,Cybersecurity firm Secureworks to lay off 15% ...,Secureworks said it will lay off 15% of its wo...,business
1,UK basic wage growth hits record high of 7.8%,Basic wages in UK hit a new record growth rate...,business
2,Nigeria inflation rises to 18-year high,Nigeria's annual inflation rose to its highest...,business
3,Gangwal to sell IndiGo stake worth $450 mn via...,Indigo airline promoter Gangwal family led by ...,business
4,No proposal to restrict more electronics' impo...,There's no proposal at present to impose impor...,business
5,"Infosys, Liberty Global ink $1.6-bn deal to sc...",Infosys and Liberty Global announced a five-ye...,business
6,Cello World files IPO papers with SEBI to rais...,Household products maker Cello World has filed...,business
7,Binance files for protective order against US SEC,Binance filed for a protective court order aga...,business
8,India taking part in global economy brought st...,Prime Minister Narendra Modi said India's part...,business
9,"Michael Burry's Scion exits Alibaba, JD.com st...",Michael Burry's Scion Asset Management exited ...,business


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.

In [11]:
# load the Codeup blog posts through the project's acquire script
# (the displayed result has title and content columns)
codeup_df = acquire.get_blog_articles_data()
# display the dataframe
codeup_df

Unnamed: 0,title,content
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...
5,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...


### 8. For each dataframe, produce the following columns:

* `title` to hold the title
* `original` to hold the original article/post content
* `clean` to hold the normalized and tokenized original with the stopwords removed.
* `stemmed` to hold the stemmed version of the cleaned data.
* `lemmatized` to hold the lemmatized version of the cleaned data.

In [12]:
# what we want:
#  -clean: normalized/tokenized, with stopwords removed
#         apply: basic_clean, tokenize, remove_stopwords
#  -stemmed: stemmed version of cleaned data
            # apply: stem function onto cleaned data
#  -lemmatized: lemmatized version of cleaned data
            # apply: lemmatize function onto cleaned data

In [13]:
# df['some_col'] = df['old_col'].apply(some_function)

In [14]:
# rename the text column in place so both dataframes hold it as 'original',
# the column name the exercise asks for
news_df.rename(columns={'content': 'original'}, inplace=True)
codeup_df.rename(columns={'content': 'original'}, inplace=True)

In [15]:
# display news_df to confirm 'content' is now 'original'
news_df

Unnamed: 0,title,original,category
0,Cybersecurity firm Secureworks to lay off 15% ...,Secureworks said it will lay off 15% of its wo...,business
1,UK basic wage growth hits record high of 7.8%,Basic wages in UK hit a new record growth rate...,business
2,Nigeria inflation rises to 18-year high,Nigeria's annual inflation rose to its highest...,business
3,Gangwal to sell IndiGo stake worth $450 mn via...,Indigo airline promoter Gangwal family led by ...,business
4,No proposal to restrict more electronics' impo...,There's no proposal at present to impose impor...,business
5,"Infosys, Liberty Global ink $1.6-bn deal to sc...",Infosys and Liberty Global announced a five-ye...,business
6,Cello World files IPO papers with SEBI to rais...,Household products maker Cello World has filed...,business
7,Binance files for protective order against US SEC,Binance filed for a protective court order aga...,business
8,India taking part in global economy brought st...,Prime Minister Narendra Modi said India's part...,business
9,"Michael Burry's Scion exits Alibaba, JD.com st...",Michael Burry's Scion Asset Management exited ...,business


In [16]:
# display codeup_df to confirm 'content' is now 'original'
codeup_df

Unnamed: 0,title,original
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...
5,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...


In [17]:
def prep_article_data(df, column, extra_words=None, exclude_words=None):
    '''
    This function takes in a df and the string name of a text column, with
    the option to pass lists for extra_words and exclude_words, and adds
    three columns built from that text column:
      - clean: basic_clean -> tokenize -> remove_stopwords
      - stemmed: stem applied to clean
      - lemmatized: lemmatize applied to clean

    The new columns are written onto the df that was passed in (it is
    modified in place). The return value is a df holding only
    'title', the given column, 'clean', 'stemmed' and 'lemmatized',
    so df must already have a 'title' column.
    '''
    # defaults are None rather than [] so no mutable object is shared between calls;
    # turn them into plain lists here so remove_stopwords always receives a list
    extra_words = [] if extra_words is None else list(extra_words)
    exclude_words = [] if exclude_words is None else list(exclude_words)

    # normalize, tokenize, then strip stopwords; the keyword arguments are
    # forwarded by Series.apply to remove_stopwords on every row
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords,
                                  extra_words=extra_words,
                                  exclude_words=exclude_words)

    # both derived columns start from the cleaned text, not the original
    df['stemmed'] = df['clean'].apply(stem)

    df['lemmatized'] = df['clean'].apply(lemmatize)

    return df[['title', column, 'clean', 'stemmed', 'lemmatized']]

In [18]:
# build the clean / stemmed / lemmatized columns for the news articles:
# 'ha' is added to the stopwords and 'no' is taken out of them so it stays in the text
prep_article_data(news_df, 'original', extra_words = ['ha'], exclude_words = ['no'])

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Cybersecurity firm Secureworks to lay off 15% ...,Secureworks said it will lay off 15% of its wo...,secureworks said lay 15 workforce cybersecurit...,securework said lay 15 workforc cybersecur fir...,secureworks said lay 15 workforce cybersecurit...
1,UK basic wage growth hits record high of 7.8%,Basic wages in UK hit a new record growth rate...,basic wages uk hit new record growth rate 78 f...,basic wage uk hit new record growth rate 78 fu...,basic wage uk hit new record growth rate 78 fu...
2,Nigeria inflation rises to 18-year high,Nigeria's annual inflation rose to its highest...,nigeria ' annual inflation rose highest level ...,nigeria ' annual inflat rose highest level 18 ...,nigeria ' annual inflation rose highest level ...
3,Gangwal to sell IndiGo stake worth $450 mn via...,Indigo airline promoter Gangwal family led by ...,indigo airline promoter gangwal family led rak...,indigo airlin promot gangwal famili led rakesh...,indigo airline promoter gangwal family led rak...
4,No proposal to restrict more electronics' impo...,There's no proposal at present to impose impor...,' no proposal present impose import restrictio...,' no propos present impos import restrict elec...,' no proposal present impose import restrictio...
5,"Infosys, Liberty Global ink $1.6-bn deal to sc...",Infosys and Liberty Global announced a five-ye...,infosys liberty global announced fiveyear agre...,infosi liberti global announc fiveyear agreeme...,infosys liberty global announced fiveyear agre...
6,Cello World files IPO papers with SEBI to rais...,Household products maker Cello World has filed...,household products maker cello world filed dra...,household product maker cello world file draft...,household product maker cello world filed draf...
7,Binance files for protective order against US SEC,Binance filed for a protective court order aga...,binance filed protective court order us securi...,binanc file protect court order us secur excha...,binance filed protective court order u securit...
8,India taking part in global economy brought st...,Prime Minister Narendra Modi said India's part...,prime minister narendra modi said india ' part...,prime minist narendra modi said india ' partic...,prime minister narendra modi said india ' part...
9,"Michael Burry's Scion exits Alibaba, JD.com st...",Michael Burry's Scion Asset Management exited ...,michael burry ' scion asset management exited ...,michael burri ' scion asset manag exit stake a...,michael burry ' scion asset management exited ...


In [19]:
# same preparation for the Codeup blog posts; .head() limits the display to the first five rows
prep_article_data(codeup_df, 'original', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...,may traditionally known asian american pacific...,may tradit known asian american pacif island a...,may traditionally known asian american pacific...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women tech panelist spotlight magdalena rahn c...,women tech panelist spotlight magdalena rahn c...,woman tech panelist spotlight magdalena rahn c...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...,women tech panelist spotlight rachel robbinsma...,women tech panelist spotlight rachel robbinsma...,woman tech panelist spotlight rachel robbinsma...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...,women tech panelist spotlight sarah mellor cod...,women tech panelist spotlight sarah mellor cod...,woman tech panelist spotlight sarah mellor cod...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...,women tech panelist spotlight madeleine capper...,women tech panelist spotlight madelein capper ...,woman tech panelist spotlight madeleine capper...


### 9. Ask yourself:

* If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
* If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
* If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?