In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [2]:
# define categories
categories = ["business", "sports", "technology", "entertainment"]

# use get_all_new_article function from acquire.py file 

news_df = acquire.get_all_news_articles(categories)



  soup = BeautifulSoup(response.text)


In [3]:
# look at the head of dataframe
news_df.head()

Unnamed: 0,title,content,category
0,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,business
1,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",business
2,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",business
3,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,business
4,India announced triumph over COVID-19 early: U...,Confederation of Indian Industry (CII) Preside...,business


In [4]:
# lets use the content of first news item as 'article' to test my functions

article = news_df.content[0]
article

'Indian Commercial Pilots Association (ICPA) on Tuesday said, "If Air India fails to set up vaccination camps on a pan-India basis for...flying crew above the age of 18 years on priority, we\'ll stop work." In a letter to the airline\'s management, ICPA added, "With no healthcare support...no insurance...we\'re in no position to continue risking lives of our pilots without vaccination."'

#### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

Lowercase everything
Normalize unicode characters
Replace anything that is not a letter, number, whitespace or a single quote.

In [5]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKC', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string

In [6]:
# use the function defined above

basic_clean(article)

'indian commercial pilots association icpa on tuesday said if air india fails to set up vaccination camps on a panindia basis forflying crew above the age of 18 years on priority well stop work in a letter to the airlines management icpa added with no healthcare supportno insurancewere in no position to continue risking lives of our pilots without vaccination'

#### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [7]:
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str=True)
    
    return string

In [8]:
# use the function defined above

tokenize(article)

'Indian Commercial Pilots Association ( ICPA ) on Tuesday said , " If Air India fails to set up vaccination camps on a pan-India basis for ... flying crew above the age of 18 years on priority , we \' ll stop work. " In a letter to the airline \' s management , ICPA added , " With no healthcare support ... no insurance ... we \' re in no position to continue risking lives of our pilots without vaccination . "'

#### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [9]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

In [10]:
# use the function defined above

stem(article)

'indian commerci pilot associ (icpa) on tuesday said, "if air india fail to set up vaccin camp on a pan-india basi for...fli crew abov the age of 18 year on priority, we\'ll stop work." In a letter to the airline\' management, icpa added, "with no healthcar support...no insurance...we\'r in no posit to continu risk live of our pilot without vaccination."'

#### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [11]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

In [12]:
# use the function defined above

lemmatize(article)

'Indian Commercial Pilots Association (ICPA) on Tuesday said, "If Air India fails to set up vaccination camp on a pan-India basis for...flying crew above the age of 18 year on priority, we\'ll stop work." In a letter to the airline\'s management, ICPA added, "With no healthcare support...no insurance...we\'re in no position to continue risking life of our pilot without vaccination."'

#### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [13]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))
    
    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords
    
    

In [14]:
# use the function defined above

remove_stopwords(article)

'Indian Commercial Pilots Association (ICPA) Tuesday said, "If Air India fails set vaccination camps pan-India basis for...flying crew age 18 years priority, we\'ll stop work." In letter airline\'s management, ICPA added, "With healthcare support...no insurance...we\'re position continue risking lives pilots without vaccination."'

#### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [15]:
# check head of my news_df dataframe:

news_df.head()

Unnamed: 0,title,content,category
0,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,business
1,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",business
2,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",business
3,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,business
4,India announced triumph over COVID-19 early: U...,Confederation of Indian Industry (CII) Preside...,business


In [17]:
# use all the functions to see if they work on news_df's content column

news_df['content'].apply(basic_clean)\
.apply(tokenize)\
.apply(lemmatize)\
.apply(remove_stopwords)

0     indian commercial pilot association icpa tuesd...
1     speaking india second covid19 wave former rbi ...
2     pandora world biggest jeweller ha said itll st...
3     south korea richest woman hong rahee added ano...
4     confederation indian industry cii president ud...
                            ...                        
94    actress kangana ranauts twitter account wa per...
95    marvel video titled marvel studio celebrates m...
96    calling ongoing coronavirus pandemic india dar...
97    amid report director ranjit tewaris bell botto...
98    speaking india second covid19 wave actor vinee...
Name: content, Length: 99, dtype: object

#### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [18]:
codeup_df = acquire.acquire_codeup_blog()



  soup = BeautifulSoup(response.text)


In [19]:
codeup_df.head()

Unnamed: 0,title,published_date,blog_image,content
0,Codeup’s Data Science Career Accelerator is Here!,"September 30, 2018",https://codeup.com/wp-content/uploads/2018/10/...,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,"October 31, 2018",https://codeup.com/wp-content/uploads/2018/10/...,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"October 17, 2018",https://codeup.com/wp-content/uploads/2018/10/...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,"August 14, 2018",,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,"August 14, 2018",,Competitor Bootcamps Are Closing. Is the Model...


#### 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.


In [29]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

In [30]:
# use the function defined above for news_df's content column.

prep_article_data(news_df, 'content', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,indian commercial pilots association icpa tues...,indian commerci pilot associ icpa tuesday said...,indian commercial pilot association icpa tuesd...
1,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",speaking indias second covid19 wave former rbi...,speak india second covid19 wave former rbi gov...,speaking india second covid19 wave former rbi ...
2,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",pandora worlds biggest jeweller said itll stop...,pandora world biggest jewel said itll stop use...,pandora world biggest jeweller said itll stop ...
3,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,south koreas richest woman hong rahee added an...,south korea richest woman hong rahe ad anoth 7...,south korea richest woman hong rahee added ano...
4,India announced triumph over COVID-19 early: U...,Confederation of Indian Industry (CII) Preside...,confederation indian industry cii president ud...,confeder indian industri cii presid uday kotak...,confederation indian industry cii president ud...
