In [5]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

from acquire import get_blog_articles, get_news_articles

Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [11]:
def basic_clean(text): 
    '''This function takes in a string, lowercases it, normalized unicode characters, 
    and replaces anything that is not a letter, number, whitespace or a single quote.'''
    # changes text to all lowercase
    lower_case = text.lower()
    # remove special characters, encode to ascii and recode to utf-8
    recode = unicodedata.normalize('NFKD', low_case).encode('ascii', 'ignore').decode('utf-8', 'ignore')
     # Replace anything that is not a letter, number, whitespace or a single quote
    clean_text = re.sub(r"[^a-z0-9'\s]", '', recode)
    return clean_text

Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [12]:
def tokenize(text):
    '''
    This function uses NLTK TlktokTokenizer to separate/tokenize text.
    '''
    # create the NLTK tokenizer object
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(text, return_str=True)

Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [None]:
def stem(text):
    '''
    This function applies NLTK stemming to text to remove prefix and suffixes.
    '''
    # Create the nltk stemmer object, then use it
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in text.split()]
    article_stemmed = ' '.join(stems)
    return article_stemmed

Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [None]:
def lemmatize(text):
    '''
   This function applies NLTK lemming to text to remove prefix and suffixes.
    '''
    # Create the nltk lemmatize object, then use it
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in text.split()]
    article_lemmatized = ' '.join(lemmas)
    return article_lemmatized

Define a function named remove_stopwords. 
- It should accept some text and return the text after removing all the stopwords.
- This function should define two optional parameters, extra_words and exclude_words. 
    - These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [13]:
def remove_stopwords(text, extra_words=[], exclude_words=[]):
    '''
    This function takes in text, allows for additional word be or not be excluded and removes the stopword from the text.
    '''
    # define initial stopwords list
    stopword_list = stopwords.words('english')
    # add additional stopwords
    for word in extra_words:
        stopword_list.append(word)
    # remove stopwords to exclude from stopword list
    for word in exclude_words:
        stopword_list.remove(word)
    # split the string into words
    words = text.split()
    # filter the words
    filtered_words = [w for w in words if w not in stopword_list]
    # print number of stopwords removed
    # print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
    # produce string without stopwords
    article_without_stopwords = ' '.join(filtered_words)
    return article_without_stopwords

Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [20]:
news_df= get_news_articles()

In [21]:
news_df.head()

Unnamed: 0,topic,title,author,content
0,business,Air India pilots demand vaccination on priorit...,Kiran Khatri,Indian Commercial Pilots Association (ICPA) on...
1,business,India underestimated the coronavirus: Raghuram...,Kiran Khatri,"Speaking about India's second COVID-19 wave, f..."
2,business,South Korea's richest woman gets fortune worth...,Anmol Sharma,South Korea’s richest woman Hong Ra-hee added ...
3,business,World's biggest jeweller says it will no longe...,Kiran Khatri,"Pandora, the world's biggest jeweller, has sai..."
4,business,"Will supply 11 cr doses to states, pvt hospita...",Kiran Khatri,Serum Institute of India (SII) CEO Adar Poonaw...


Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [18]:
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
        'https://codeup.com/data-science-myths/',
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
        'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']
codeup_df= get_blog_articles(urls)

In [19]:
codeup_df.head()

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [23]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function takes in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)\
                            .apply(lemmatize)
    
    df['stemmed'] = df[column].apply(basic_clean).apply(stem)
    
    df['lemmatized'] = df[column].apply(basic_clean).apply(lemmatize)
    
    return df[['topic', 'title', column, 'stemmed', 'lemmatized', 'clean']]

In [26]:
news_df= prep_article_data(news_df, column, extra_words=[], exclude_words=[])

NameError: name 'column' is not defined

Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?