# Prepare Exercises

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/desireemcelroy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/desireemcelroy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
original = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

#### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [3]:
def basic_clean(string):
    '''
    This function takes in a string and normalizes it for nlp purposes
    '''
    # lowercase the string
    string = string.lower()
    # return normal form for the unicode string, encode/remove ascii
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8')
    # breaks down the string by keeping alphabet letters, numbers, apostraphes and spaces
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    
    return string

In [4]:
original = basic_clean(original)

original

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

#### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [5]:
def tokenize(string):
    '''
    This function takes in a string and tokenizes it
    '''
    # create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # use the tokenizer, return as a string
    string = tokenizer.tokenize(string, return_str = True)
    
    return string

In [6]:
original = tokenize(original)

original

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

#### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [7]:
def stem(text):
    
    # create a porter stemmer
    ps = nltk.porter.PorterStemmer()
    
    # loop through the text to stem the words
    stems = [ps.stem(word) for word in text.split()]
    
    stems = ' '.join(stems)
    
    return stems

In [8]:
stem(original)[:5]

'paul '

#### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [9]:
def lemmatize(text):
    
    # create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    
    # loop through the list to split and lemmatize
    lemmas = [wnl.lemmatize(word) for word in text.split()]
    
    lemmas =' '.join(lemmas)
    
    return lemmas

In [10]:
lemmatize(original)[:5]

'paul '

#### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

In [13]:
def remove_stopwords(text):
    
    # pull the english stopwords
    stopword_list = stopwords.words('english')
    
    # split the inputted text into a list to loop through
    words = text.split()
    
    # loop through to remove stop words from the list
    filtered_words = [word for word in words if word not in stopword_list]
    
    filtered_words = ' '.join(filtered_words)
    
    return filtered_words

In [14]:
remove_stopwords(original)[:5]

'paul '

#### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [15]:
categories = ["business", "sports", "technology", "entertainment", "science", "world"]

news = acquire.get_all_news_articles(categories)



  soup = BeautifulSoup(response.text)


In [16]:
news.head()

Unnamed: 0,title,content,category
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,business
1,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,business
2,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,business
3,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,business
4,Factually incorrect: INOX on report of Amazon ...,INOX Leisure denied a report that claimed Amaz...,business


#### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [17]:
url_list = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
           'https://codeup.com/data-science-myths/',
           'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
           'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
           'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']


# create an empty list
list_of_blogs=[]

# create a for loop for all the urls in the list to pull elements from and return a dict
for url in url_list:
    list_of_blogs.append(acquire.get_blog_articles(url))



  soup = BeautifulSoup(response.text)


In [18]:
blogs = pd.DataFrame(list_of_blogs)

blogs

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


#### 8. For each dataframe, produce the following columns:

- title - to hold the title
- original - to hold the original article/post content
- clean - to hold the normalized and tokenized original with the stopwords removed.
- stemmed - to hold the stemmed version of the cleaned data.
- lemmatized - to hold the lemmatized version of the cleaned data.

#### First apply to news dataframe

In [19]:
news.head()

Unnamed: 0,title,content,category
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,business
1,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,business
2,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,business
3,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,business
4,Factually incorrect: INOX on report of Amazon ...,INOX Leisure denied a report that claimed Amaz...,business


In [23]:
# first for the news df

# create a variable for the cleaned version
clean=[] 
for i in news.content:
  
    normalize = basic_clean(i)
    
    clean.append(tokenize(normalize))

# create column for cleaned content
news['clean'] = clean
    
stemmed=[] 
for i in news.clean:
  
    stemmed.append(stem(i))
    
    
lemmatized=[] 
for i in news.clean:
  
    lemmatized.append(lemmatize(i))

In [24]:
# create columns for stemmed and lemmatized content
news['stemmed'] = stemmed
news['lemmatized'] = lemmatized

In [25]:
news.head()

Unnamed: 0,title,content,category,clean,stemmed,lemmatized
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,business,reliance industries has said in a statement th...,relianc industri ha said in a statement that o...,reliance industry ha said in a statement that ...
1,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,business,tesla ceo and the world ' s secondrichest pers...,tesla ceo and the world ' s secondrichest pers...,tesla ceo and the world ' s secondrichest pers...
2,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,business,tesla ' s billionaire ceo elon musk criticised...,tesla ' s billionair ceo elon musk criticis ap...,tesla ' s billionaire ceo elon musk criticised...
3,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,business,amazon on monday denied speculations that it w...,amazon on monday deni specul that it wa look t...,amazon on monday denied speculation that it wa...
4,Factually incorrect: INOX on report of Amazon ...,INOX Leisure denied a report that claimed Amaz...,business,inox leisure denied a report that claimed amaz...,inox leisur deni a report that claim amazon in...,inox leisure denied a report that claimed amaz...


#### Now apply to blogs dataframe

In [29]:
# now for the blogs df

# create a variable for the cleaned version
clean=[] 
for i in blogs.content:
  
    normalize = basic_clean(i)
    
    clean.append(tokenize(normalize))

# create column for cleaned content
blogs['clean'] = clean
    
stemmed=[] 
for i in blogs.clean:
  
    stemmed.append(stem(i))
    
    
lemmatized=[] 
for i in blogs.clean:
  
    lemmatized.append(lemmatize(i))

In [30]:
# create column for stemmed and lemmatized versions
blogs['stemmed'] = stemmed
blogs['lemmatized'] = lemmatized

In [31]:
blogs

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,the rumors are true the time has arrived codeu...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...,by dimitri antoniou and maggie giust\ndata sci...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",by dimitri antoniou\na week ago codeup launche...,by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair\nthe third biannual san anton...,sa tech job fair the third biannual san antoni...,sa tech job fair the third biannual san antoni...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps are closing is the model ...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...


In [32]:
blogs.stemmed[0]

'the rumor are true the time ha arriv codeup ha offici open applic to our new data scienc career acceler with onli 25 seat avail thi immers program is one of a kind in san antonio and will help you land a job in glassdoor 1 best job in america data scienc is a method of provid action intellig from data the data revolut ha hit san antonio result in an explos in data scientist posit across compani like usaa accentur booz allen hamilton and heb weve even seen utsa invest 70 m for a cybersecur center and school of data scienc we built a program to specif meet the grow demand of thi industri our program will be 18 week long fulltim handson and projectbas our curriculum develop and instruct is led by senior data scientist maggi giust who ha work at heb capit group and rackspac along with input from dozen of practition and hire partner student will work with real data set realist problem and the entir data scienc pipelin from collect to deploy they will receiv profession develop train in resu

In [33]:
blogs.lemmatized[0]

'the rumor are true the time ha arrived codeup ha officially opened application to our new data science career accelerator with only 25 seat available this immersive program is one of a kind in san antonio and will help you land a job in glassdoors 1 best job in america data science is a method of providing actionable intelligence from data the data revolution ha hit san antonio resulting in an explosion in data scientist position across company like usaa accenture booz allen hamilton and heb weve even seen utsa invest 70 m for a cybersecurity center and school of data science we built a program to specifically meet the growing demand of this industry our program will be 18 week long fulltime handson and projectbased our curriculum development and instruction is led by senior data scientist maggie giust who ha worked at heb capital group and rackspace along with input from dozen of practitioner and hiring partner student will work with real data set realistic problem and the entire dat