In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

from acquire import * 

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

> * Lowercase everything
> * Normalize unicode characters
> * Replace anything that is not a letter, number, whitespace or a single quote.


In [2]:
test = 'real words here'

In [3]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string

### 2. Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.

In [4]:
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str = True)
    
    return string

### 3. Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.

In [5]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

### 4. Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [6]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

### 5. Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.

### This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [7]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))

    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    return ' '.join(filtered_words)


### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe `news_df`.

In [8]:
# Fetch news article DataFrame
news_df = get_news_articles()
news_df.head()

Unnamed: 0_level_0,parsed,news_articles,author,date
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://inshorts.com/en/news/airstrike-hits-capital-of-ethiopias-tigray-3-killed-report-1635433206925,True,"An Ethiopian airstrike reportedly hit Mekelle,...",Dharini MudgalDharini Mudgal,28 Oct
https://inshorts.com/en/read/technology,True,Facebook on Thursday announced it's changing t...,Pragya SwastikPragya SwastikAnkush VermaAnkush...,29 Oct28 Oct29 Oct28 Oct28 Oct29 Oct28 Oct29 O...
https://inshorts.com/en/read/startup,True,The combined net worth of the world's two rich...,Kiran KhatriKiran KhatriAnkush VermaAnkush Ver...,28 Oct28 Oct27 Oct27 Oct27 Oct28 Oct28 Oct27 O...
https://inshorts.com/prev/en/news/airstrike-hits-capital-of-ethiopias-tigray-3-killed-report-1635433206925,True,After Aryan Khan was granted bail in the Mumba...,Kriti KambiriKriti Kambiri,28 Oct
https://inshorts.com/en/read/science,True,"'Star Trek' actor William Shatner, who became ...",Daisy MowkeDaisy MowkeAnkush VermaAnkush Verma...,16 Oct23 Oct28 Oct20 Oct21 Oct21 Oct29 Oct21 O...


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.

In [9]:
codeup_df = get_codeup_blogs()
codeup_df.head()

Unnamed: 0_level_0,parsed,original,title,label
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://codeup.com/blog/,True,"Oct 28, 2021 | IT Training, Tips for Prospecti...",Blog - Codeup,blog
https://codeup.com/codeup-news/codeup-acquires-rackspace-cloud-academy/,True,"Apr 16, 2021 | Codeup NewsWe are thrilled to o...",Codeup Acquires Rackspace Cloud Academy! - Codeup,not_blog
https://codeup.com/codeup-news/start-a-new-career-with-vet-tec/,True,"Apr 5, 2021 | Codeup NewsAre you a veteran loo...",Start a New Career with VET TEC! - Codeup,not_blog
https://codeup.com/blog/page/2/?et_blog,True,"Oct 3, 2021 | Behind the BillboardsSep 16, 202...",Blog - Codeup,blog
https://codeup.com/,True,Go from no experience to employed tech profess...,Best Tech Career Accelerator in Texas - Codeup,not_blog


### 8. For each dataframe, produce the following columns:

> * `title` to hold the title
> * `original` to hold the original article/post content
> * `clean` to hold the normalized and tokenized original with the stopwords removed.
> * `stemmed` to hold the stemmed version of the cleaned data.
> * `lemmatized` to hold the lemmatized version of the cleaned data.

In [10]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df


In [11]:
codeup_df_clean = prep_article_data(codeup_df, 'original')
codeup_df_clean.head()

Unnamed: 0_level_0,parsed,original,title,label,clean,stemmed,lemmatized
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
https://codeup.com/blog/,True,"Oct 28, 2021 | IT Training, Tips for Prospecti...",Blog - Codeup,blog,oct 28 2021 training tips prospective students...,oct 28 2021 train tip prospect studentswhat ca...,oct 28 2021 training tip prospective studentsw...
https://codeup.com/codeup-news/codeup-acquires-rackspace-cloud-academy/,True,"Apr 16, 2021 | Codeup NewsWe are thrilled to o...",Codeup Acquires Rackspace Cloud Academy! - Codeup,not_blog,apr 16 2021 codeup newswe thrilled officially ...,apr 16 2021 codeup newsw thrill offici announc...,apr 16 2021 codeup newswe thrilled officially ...
https://codeup.com/codeup-news/start-a-new-career-with-vet-tec/,True,"Apr 5, 2021 | Codeup NewsAre you a veteran loo...",Start a New Career with VET TEC! - Codeup,not_blog,apr 5 2021 codeup newsare veteran looking next...,apr 5 2021 codeup newsar veteran look next car...,apr 5 2021 codeup newsare veteran looking next...
https://codeup.com/blog/page/2/?et_blog,True,"Oct 3, 2021 | Behind the BillboardsSep 16, 202...",Blog - Codeup,blog,oct 3 2021 behind billboardssep 16 2021 codeup...,oct 3 2021 behind billboardssep 16 2021 codeup...,oct 3 2021 behind billboardssep 16 2021 codeup...
https://codeup.com/,True,Go from no experience to employed tech profess...,Best Tech Career Accelerator in Texas - Codeup,not_blog,go experience employed tech professional less ...,go experi employ tech profession less 6 month ...,go experience employed tech professional le 6 ...


In [12]:
news_df_clean = prep_article_data(news_df, 'news_articles')
news_df_clean.head()

Unnamed: 0_level_0,parsed,news_articles,author,date,clean,stemmed,lemmatized
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
https://inshorts.com/en/news/airstrike-hits-capital-of-ethiopias-tigray-3-killed-report-1635433206925,True,"An Ethiopian airstrike reportedly hit Mekelle,...",Dharini MudgalDharini Mudgal,28 Oct,ethiopian airstrike reportedly hit mekelle cap...,ethiopian airstrik reportedli hit mekel capit ...,ethiopian airstrike reportedly hit mekelle cap...
https://inshorts.com/en/read/technology,True,Facebook on Thursday announced it's changing t...,Pragya SwastikPragya SwastikAnkush VermaAnkush...,29 Oct28 Oct29 Oct28 Oct28 Oct29 Oct28 Oct29 O...,facebook thursday announced changing companys ...,facebook thursday announc chang compani name m...,facebook thursday announced changing company n...
https://inshorts.com/en/read/startup,True,The combined net worth of the world's two rich...,Kiran KhatriKiran KhatriAnkush VermaAnkush Ver...,28 Oct28 Oct27 Oct27 Oct27 Oct28 Oct28 Oct27 O...,combined net worth worlds two richest persons ...,combin net worth world two richest person elon...,combined net worth world two richest person el...
https://inshorts.com/prev/en/news/airstrike-hits-capital-of-ethiopias-tigray-3-killed-report-1635433206925,True,After Aryan Khan was granted bail in the Mumba...,Kriti KambiriKriti Kambiri,28 Oct,aryan khan granted bail mumbai cruise drugs ca...,aryan khan wa grant bail mumbai cruis drug cas...,aryan khan wa granted bail mumbai cruise drug ...
https://inshorts.com/en/read/science,True,"'Star Trek' actor William Shatner, who became ...",Daisy MowkeDaisy MowkeAnkush VermaAnkush Verma...,16 Oct23 Oct28 Oct20 Oct21 Oct21 Oct29 Oct21 O...,star trek actor william shatner became oldest ...,star trek actor william shatner becam oldest p...,star trek actor william shatner became oldest ...


In [14]:
url = 'https://codeup.com/podcast/'
new_urls = check_url(url)
print(new_urls)

STATUS CODE 200
