# Data Preparation Exercises

## Imports

In [27]:
# unicode, regex, json for text digestion
import unicodedata
import re
import json

# nltk: natural language toolkit -> tokenization, stopwords
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
import acquire
from time import strftime

import warnings
warnings.filterwarnings('ignore')

## Exercises

The end result of this exercise should be a file named `prepare.py` that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the Codeup blog articles and the news articles that were previously acquired.

### 1. Define a function named `basic_clean`. It should take in a string and apply some basic text cleaning to it:

* Lowercase everything
* Normalize unicode characters
* Remove anything that is not a letter, number, whitespace or a single quote (i.e. replace it with an empty string).

In [28]:
def basic_clean(string):
    """Return ``string`` lowercased, reduced to ASCII and stripped of punctuation.

    Steps:
      1. NFKD-normalize the text so accented characters decompose into a
         base letter plus combining marks, then drop every character that
         cannot be encoded as ASCII.
      2. Lowercase the result.
      3. Remove every character that is not a letter, digit, whitespace or
         a single quote.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    ascii_text = (
        unicodedata.normalize('NFKD', string)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
    )
    # An explicit character class is used instead of \w because \w also
    # matches the underscore, which is neither a letter nor a number.
    return re.sub(r"[^a-z0-9'\s]", '', ascii_text.lower())

### 2. Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.

In [29]:
def tokenize(string):
    """Tokenize ``string`` with NLTK's ``ToktokTokenizer``.

    The tokenizer is called with ``return_str=True``, so the result is the
    tokenized text as one string rather than a list of tokens.
    """
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

### 3. Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.

In [30]:
def stem(string):
    """Return ``string`` with every word replaced by its Porter stem.

    The text is split on runs of whitespace (``str.split()`` with no
    argument), each piece is stemmed, and the stems are re-joined with
    single spaces.

    Parameters
    ----------
    string : str
        Text to stem; expected to be already cleaned and tokenized.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    # Reach the stemmer through the documented ``nltk.stem`` package rather
    # than the ``nltk.porter`` alias, which is not part of the documented API.
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in string.split())

### 4. Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [31]:
def lemmatize(string):
    """Return ``string`` with each whitespace-separated word lemmatized.

    Every word is passed to NLTK's ``WordNetLemmatizer`` and the resulting
    lemmas are joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # map() applies the lemmatizer to each token in document order
    lemmas = map(lemmatizer.lemmatize, string.split())
    return ' '.join(lemmas)

### 5. Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords. 

### This function should define two optional parameters, `extra_words` and `exclude_words`. These parameters should define any additional stop words to include, and any words that we don't want to remove.



In [32]:
# Two lists holding the same elements in a different order
list1 = [1, 2, 3, 4]
list2 = [2, 1, 3, 4]

# Sets are unordered, so converting both lists to sets makes them compare equal
print(set(list1)==set(list2))

True


In [33]:
# Lists compare element by element in order, so the differently ordered lists are not equal
list1 == list2

False

In [34]:
# A list may contain duplicates ('c' appears twice)
mylist = ['a', 'b', 'c', 'c', 'd']

# Casting to a set keeps only the unique elements
myset = set(mylist)

# Show the list and the set side by side
print(mylist, myset)

['a', 'b', 'c', 'c', 'd'] {'b', 'a', 'c', 'd'}


In [35]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """Return ``string`` with English stopwords removed.

    The stopword set is built as
    ``(NLTK English stopwords - exclude_words) | extra_words``, so a word
    listed in both optional arguments is treated as a stopword.

    Parameters
    ----------
    string : str
        Text to filter; it is split on whitespace and compared word by word.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords. Defaults to none.
    exclude_words : iterable of str, optional
        Stopwords that should be kept in the text. Defaults to none.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # None defaults avoid sharing one mutable list object across calls.
    extra = set() if extra_words is None else set(extra_words)
    excluded = set() if exclude_words is None else set(exclude_words)

    # Remove the exclusions first, then add the extras (order matters when a
    # word appears in both).
    stopword_set = (set(stopwords.words('english')) - excluded) | extra

    return ' '.join(word for word in string.split() if word not in stopword_set)

### 6. Use your data from the acquire exercise to produce a dataframe of the news articles. Name the dataframe `news_df`.

In [36]:
# Load the news articles through the project's acquire module (its implementation is not shown here)
news_df = acquire.get_news_articles_data()
# Display the dataframe as the cell output
news_df

Unnamed: 0,title,content,category
0,Some working overtime to harm us: Adani Group ...,Adani Group has reacted amid allegations of TM...,business
1,Oil prices steady above $90 as investors asses...,"Brent oil prices steadied above $90 (over ₹7,4...",business
2,SC rejects telcos' plea to see licence fee as ...,The Supreme Court on Monday rejected a request...,business
3,SpiceJet stock dip amid 'Gangwal not intereste...,SpiceJet's shares tanked 11% on Monday after a...,business
4,"HDFC Bank's Q2 profit jumps 50% to ₹15,976 crore",HDFC Bank on Monday reported a net profit of o...,business
5,What is the TCS bribes-for-jobs scandal?,The bribes-for-jobs scandal at Tata Consultanc...,business
6,Former Bank of China Chairman Liu arrested ove...,"Liu Liange, who resigned as the Chairman of Ba...",business
7,"BioNTech warns of write-off of up to ₹7,888 cr...",Germany's BioNTech flagged write-downs of up t...,business
8,Rupee hits 1-year low of 83.28 against US dollar,The Indian Rupee hit a one-year low of 83.28 a...,business
9,Activision Blizzard CEO to leave firm with $40...,Activision Blizzard CEO Bobby Kotick will leav...,business


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.

In [37]:
# Load the Codeup blog posts through the project's acquire module (its implementation is not shown here)
codeup_df = acquire.get_blog_articles_data()
# Display the dataframe as the cell output
codeup_df

Unnamed: 0,title,content
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...
5,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...


### 8. For each dataframe, produce the following columns:

* `title` to hold the title
* `original` to hold the original article/post content
* `clean` to hold the normalized and tokenized original with the stopwords removed.
* `stemmed` to hold the stemmed version of the cleaned data.
* `lemmatized` to hold the lemmatized version of the cleaned data.

In [38]:
# Work flow:
#  1. clean: normalize and tokenize the original text, then remove stopwords (apply basic_clean, tokenize, remove_stopwords)

#  2. stemmed: stemmed version of the cleaned data (apply the stem function to the cleaned data)

#  3. lemmatized: lemmatized version of the cleaned data (apply the lemmatize function to the cleaned data)

In [39]:
# Rename the 'content' column to 'original' in place on both dataframes,
# matching the column name requested in exercise 8
news_df.rename(columns={'content': 'original'}, inplace=True)
codeup_df.rename(columns={'content': 'original'}, inplace=True)

In [41]:
# Preview the first rows to confirm the column is now named 'original'
news_df.head()

Unnamed: 0,title,original,category
0,Some working overtime to harm us: Adani Group ...,Adani Group has reacted amid allegations of TM...,business
1,Oil prices steady above $90 as investors asses...,"Brent oil prices steadied above $90 (over ₹7,4...",business
2,SC rejects telcos' plea to see licence fee as ...,The Supreme Court on Monday rejected a request...,business
3,SpiceJet stock dip amid 'Gangwal not intereste...,SpiceJet's shares tanked 11% on Monday after a...,business
4,"HDFC Bank's Q2 profit jumps 50% to ₹15,976 crore",HDFC Bank on Monday reported a net profit of o...,business


In [42]:
# Preview the first rows to confirm the column is now named 'original'
codeup_df.head()

Unnamed: 0,title,original
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...


In [43]:
def prep_article_data(df, column, extra_words=None, exclude_words=None):
    """Add cleaned, stemmed and lemmatized versions of a text column to ``df``.

    Note that ``df`` is modified in place: the columns ``clean``,
    ``stemmed`` and ``lemmatized`` are assigned onto the dataframe that was
    passed in. The dataframe must also contain a ``title`` column, because
    it is selected in the returned result.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the articles.
    column : str
        Name of the column that contains the original text.
    extra_words : list of str, optional
        Additional stopwords, forwarded to ``remove_stopwords``.
    exclude_words : list of str, optional
        Stopwords to keep, forwarded to ``remove_stopwords``.

    Returns
    -------
    pandas.DataFrame
        The columns ``title``, ``column``, ``clean``, ``stemmed`` and
        ``lemmatized`` selected from ``df``.
    """
    # None defaults avoid shared mutable default lists; convert to lists so
    # remove_stopwords always receives an iterable.
    extra_words = [] if extra_words is None else extra_words
    exclude_words = [] if exclude_words is None else exclude_words

    # clean = normalized -> tokenized -> stopwords removed
    df['clean'] = (
        df[column]
        .apply(basic_clean)
        .apply(tokenize)
        .apply(
            remove_stopwords,
            extra_words=extra_words,
            exclude_words=exclude_words,
        )
    )

    # Both derived columns start from the cleaned text, not from each other
    df['stemmed'] = df['clean'].apply(stem)
    df['lemmatized'] = df['clean'].apply(lemmatize)

    return df[['title', column, 'clean', 'stemmed', 'lemmatized']]

In [45]:
# Prepare the news articles, treating 'ha' as an extra stopword and keeping 'no' in the text; preview the first rows
prep_article_data(news_df, 'original', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Some working overtime to harm us: Adani Group ...,Adani Group has reacted amid allegations of TM...,adani group reacted amid allegations tmc mp ma...,adani group react amid alleg tmc mp mahua moit...,adani group reacted amid allegation tmc mp mah...
1,Oil prices steady above $90 as investors asses...,"Brent oil prices steadied above $90 (over ₹7,4...",brent oil prices steadied 90 7492 per barrel m...,brent oil price steadi 90 7492 per barrel mond...,brent oil price steadied 90 7492 per barrel mo...
2,SC rejects telcos' plea to see licence fee as ...,The Supreme Court on Monday rejected a request...,supreme court monday rejected request telecomm...,suprem court monday reject request telecommun ...,supreme court monday rejected request telecomm...
3,SpiceJet stock dip amid 'Gangwal not intereste...,SpiceJet's shares tanked 11% on Monday after a...,spicejet ' shares tanked 11 monday report clai...,spicejet ' share tank 11 monday report claim i...,spicejet ' share tanked 11 monday report claim...
4,"HDFC Bank's Q2 profit jumps 50% to ₹15,976 crore",HDFC Bank on Monday reported a net profit of o...,hdfc bank monday reported net profit 15976 cro...,hdfc bank monday report net profit 15976 crore...,hdfc bank monday reported net profit 15976 cro...


In [46]:
# Prepare the Codeup blog posts with the same stopword settings; preview the first rows
prep_article_data(codeup_df, 'original', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...,may traditionally known asian american pacific...,may tradit known asian american pacif island a...,may traditionally known asian american pacific...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women tech panelist spotlight magdalena rahn c...,women tech panelist spotlight magdalena rahn c...,woman tech panelist spotlight magdalena rahn c...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...,women tech panelist spotlight rachel robbinsma...,women tech panelist spotlight rachel robbinsma...,woman tech panelist spotlight rachel robbinsma...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...,women tech panelist spotlight sarah mellor cod...,women tech panelist spotlight sarah mellor cod...,woman tech panelist spotlight sarah mellor cod...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...,women tech panelist spotlight madeleine capper...,women tech panelist spotlight madelein capper ...,woman tech panelist spotlight madeleine capper...


### 9. Ask yourself:

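* If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
     - Lemmatized: the corpus is small, so the slower but more accurate lemmatization is affordable.
* If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
     - Lemmatized: 25MB is still small enough for lemmatization to be practical.
* If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
     - Stemmed: at this scale and with this billing model, the cheaper stemming step is preferable.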