In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

1. Define a function named `basic_clean`. It should take in a string and apply some basic text cleaning to it:
- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
string = "You know, house elves get a very raw deal! said Hermione indignantly. \
It's slavery, that's what it is! That Mr. Crouch made her go up to the top of the \
stadium, and she was terrified, and he's got her bewitched so she can't even run when \
they start trampling tents!\nWhy doesn't anyone do something about it?"

string

"You know, house elves get a very raw deal! said Hermione indignantly. It's slavery, that's what it is! That Mr. Crouch made her go up to the top of the stadium, and she was terrified, and he's got her bewitched so she can't even run when they start trampling tents!\nWhy doesn't anyone do something about it?"

In [3]:
# Make everything lowercase
string = string.lower()
string

"you know, house elves get a very raw deal! said hermione indignantly. it's slavery, that's what it is! that mr. crouch made her go up to the top of the stadium, and she was terrified, and he's got her bewitched so she can't even run when they start trampling tents!\nwhy doesn't anyone do something about it?"

In [4]:
# removes incosistencies in the text
string = unicodedata.normalize('NFKD', string)\
.encode('ascii', 'ignore')\
.decode('utf-8', 'ignore')
string

"you know, house elves get a very raw deal! said hermione indignantly. it's slavery, that's what it is! that mr. crouch made her go up to the top of the stadium, and she was terrified, and he's got her bewitched so she can't even run when they start trampling tents!\nwhy doesn't anyone do something about it?"

In [5]:
# set what to keep
string = re.sub(r"[^a-z0-9'\s]", '', string)
string

"you know house elves get a very raw deal said hermione indignantly it's slavery that's what it is that mr crouch made her go up to the top of the stadium and she was terrified and he's got her bewitched so she can't even run when they start trampling tents\nwhy doesn't anyone do something about it"

In [6]:
og_string = "You know, house elves get a very raw deal! said Hermione indignantly. \
It's slavery, that's what it is! That Mr. Crouch made her go up to the top of the \
stadium, and she was terrified, and he's got her bewitched so she can't even run when \
they start trampling tents! Why doesn't anyone do something about it?"

def basic_clean(og_string):
    '''Takes in string
    makes everything lowercase
    removes incosistent text
    only keeps anything a-z, 0-9, ' and white space'''
    # make everything lowercase
    string = og_string.lower()
    # removes incosistencies in the text
    string = unicodedata.normalize('NFKD', string)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    # set what to keep
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    # return new cleaned string
    return string

string = basic_clean(string)

print(f'The original string looks like: \n \n', og_string)
print(f'\nThe now clean string looks like: \n \n', string)

The original string looks like: 
 
 You know, house elves get a very raw deal! said Hermione indignantly. It's slavery, that's what it is! That Mr. Crouch made her go up to the top of the stadium, and she was terrified, and he's got her bewitched so she can't even run when they start trampling tents! Why doesn't anyone do something about it?

The now clean string looks like: 
 
 you know house elves get a very raw deal said hermione indignantly it's slavery that's what it is that mr crouch made her go up to the top of the stadium and she was terrified and he's got her bewitched so she can't even run when they start trampling tents
why doesn't anyone do something about it


2. Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.


In [7]:
# Create the tokenizer
tokenizer = nltk.tokenize.ToktokTokenizer()

In [8]:
# Use the tokenizer
string = tokenizer.tokenize(string, return_str = True)
string

"you know house elves get a very raw deal said hermione indignantly it ' s slavery that ' s what it is that mr crouch made her go up to the top of the stadium and she was terrified and he ' s got her bewitched so she can ' t even run when they start trampling tents\nwhy doesn ' t anyone do something about it"

In [9]:
string = basic_clean(string)

def tokenize(string):
    '''Takes in the string provided by basic_clean funciton
    creates a tokenizer
    uses the tokenizerr on the cleaned string'''
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # Use the tokenizer
    string = tokenizer.tokenize(string, return_str = True)
    # return tokenized string
    return string

string = tokenize(string)
print(f'After tokeniing our string looks like: \n \n', string)

After tokeniing our string looks like: 
 
 you know house elves get a very raw deal said hermione indignantly it ' s slavery that ' s what it is that mr crouch made her go up to the top of the stadium and she was terrified and he ' s got her bewitched so she can ' t even run when they start trampling tents
why doesn ' t anyone do something about it



3. Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.



In [10]:
string = basic_clean(string)
string = tokenize(string)

In [11]:
# Create porter stemmer.
ps = nltk.porter.PorterStemmer()

In [12]:
ps.stem('indignantly')

'indignantli'

In [13]:
# Apply the stemmer to each word in our string

stems = [ps.stem(word) for word in string.split()]
stems[:10]

['you', 'know', 'hous', 'elv', 'get', 'a', 'veri', 'raw', 'deal', 'said']

In [14]:
# Join the list of words into the string
string_stemmed = ' '.join(stems)
string_stemmed

"you know hous elv get a veri raw deal said hermion indignantli it ' s slaveri that ' s what it is that mr crouch made her go up to the top of the stadium and she wa terrifi and he ' s got her bewitch so she can ' t even run when they start trampl tent whi doesn ' t anyon do someth about it"

In [15]:
string = basic_clean(string)
string = tokenize(string)

def stem(string):
    '''In string from the basic_clean and tokenize fucntion
    creaters the porter stemmer
    applies the porter stemmer to every word in the string provided
    joing the list of words back into a string'''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    # Apply the stemmer to each word in our string
    stems = [ps.stem(word) for word in string.split()]
    # Join the list of words into the string
    string_stemmed = ' '.join(stems)
    # return string_stemmed
    return string_stemmed

string_stemmed = stem(string)
string_stemmed

"you know hous elv get a veri raw deal said hermion indignantli it ' s slaveri that ' s what it is that mr crouch made her go up to the top of the stadium and she wa terrifi and he ' s got her bewitch so she can ' t even run when they start trampl tent whi doesn ' t anyon do someth about it"

4. Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.



In [16]:
string = basic_clean(string)
string = tokenize(string)

In [17]:
# Create the Lemmatizer.
wnl = nltk.stem.WordNetLemmatizer()

In [18]:
# Check lemmatizer. It works.
print(wnl.lemmatize('trample'))
print(wnl.lemmatize('very'))

trample
very


In [19]:
# Use the lemmatizer on each word using split

lemmas = [wnl.lemmatize(word) for word in string.split()]
lemmas[:10]

['you', 'know', 'house', 'elf', 'get', 'a', 'very', 'raw', 'deal', 'said']

In [20]:
# Join the list into a string

string_lemmatized = ' '.join(lemmas)
string_lemmatized

"you know house elf get a very raw deal said hermione indignantly it ' s slavery that ' s what it is that mr crouch made her go up to the top of the stadium and she wa terrified and he ' s got her bewitched so she can ' t even run when they start trampling tent why doesn ' t anyone do something about it"

In [21]:
string = basic_clean(string)
string = tokenize(string)

def lemmatize(string):
    '''Takes in string from basic_clean and tokenize funcitons
    creates a lematizer
    uses the lematizer on each word in the string
    merges the list of words back into string format
    and returns the now lematized string'''
    # Create the Lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    # Use the lemmatizer on each word using split
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    # Join the list into a string
    string_lemmatized = ' '.join(lemmas)
    # return lemmatized string
    return string_lemmatized

string_lemmatized = lemmatize(string)
print(f'After lematizing the string looks like: \n \n', string_lemmatized)

After lematizing the string looks like: 
 
 you know house elf get a very raw deal said hermione indignantly it ' s slavery that ' s what it is that mr crouch made her go up to the top of the stadium and she wa terrified and he ' s got her bewitched so she can ' t even run when they start trampling tent why doesn ' t anyone do something about it


5. Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.
- This function should define two optional parameters, `extra_words` and `exclude_words`. These parameters should define any additional stop words to include, and any words that we don't want to remove.



In [22]:
#stopword_list = stopwords.words('english')

In [23]:
#len(stopword_list)

In [24]:
# Split the words in string
#string = string.split()

In [25]:
#filtered_words = [word for word in string if word not in stopword_list]
#filtered_words[:10]

In [26]:
# Join words in the list back into strings; assign to a variable to keep changes.
#string_without_stopwords = ' '.join(filtered_words)
#string_without_stopwords

In [27]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [28]:
string = basic_clean(string)
string = tokenize(string)

def remove_stopwords(string, exclude_words=[], extra_words=[]):
    '''takes in string from basic clean and tokenize fucntions
    takes in a list of words to exclude from the stopword list
    take sin a list of words to include in the stopword list
    makes the list of stopwords
    removes words listed from stopword list
    add words listed to stopword list
    remove words from stopword list from the string
    join words back to string format
    return new string'''
    # set stopword list 
    stopword_list = stopwords.words('english')
    # remove exclude_words list from stopword list
    stopword_list = set(stopword_list) - set(exclude_words)
    # add extra_wrods list to stopword list
    stopword_list = stopword_list.union(set(extra_words))
    # remove stopword list words from string
    string = string.split()
    # set filtered words value
    filtered_words = [word for word in string if word not in stopword_list]
    # join words back into string format 
    string = ' '.join(filtered_words)
    # return new string
    return string

string = remove_stopwords(string, exclude_words=[], extra_words=[])
string

"know house elves get raw deal said hermione indignantly ' slavery ' mr crouch made go top stadium terrified ' got bewitched ' even run start trampling tents ' anyone something"

### remove_these = ['or', 'no']
exlclude_words = stopword_list.remove(remove_these)

In [29]:
string = remove_stopwords(string, exclude_words=['or', 'no'], extra_words=[])
string

"know house elves get raw deal said hermione indignantly ' slavery ' mr crouch made go top stadium terrified ' got bewitched ' even run start trampling tents ' anyone something"

6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe `news_df`.



In [31]:
news_df = acquire.acquire_news_articles()
news_df.head()

Unnamed: 0,title,content,category
0,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",business
1,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,business
2,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,business
3,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",business
4,IndiGo commences company vaccination drive for...,IndiGo Airlines' Chief HR Officer Raj Raghavan...,business


7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.



In [32]:
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
        'https://codeup.com/data-science-myths/',
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
        'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']
codeup_df = acquire.get_website_info(urls)
codeup_df



  soup = BeautifulSoup(response.text)


Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair - ...,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


8. For each dataframe, produce the following columns:
- `title` to hold the title
- `original` to hold the original article/post content
- `clean` to hold the normalized and tokenized original with the stopwords removed.
- `stemmed` to hold the stemmed version of the cleaned data.
- `lemmatized` to hold the lemmatized version of the cleaned data.



In [33]:
codeup_df.head()

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair - ...,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


In [34]:
codeup_df = codeup_df.rename(columns={'content':'original'})
codeup_df

Unnamed: 0,title,original
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair - ...,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


In [35]:
codeup_df['clean'] = pd.Series([basic_clean(string) for string in codeup_df.original])
codeup_df

Unnamed: 0,title,original,clean
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...,the rumors are true the time has arrived codeu...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie Giust\nData Sci...,by dimitri antoniou and maggie giust\ndata sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",by dimitri antoniou\na week ago codeup launche...
3,10 Tips to Crush It at the SA Tech Job Fair - ...,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair\nthe third biannual san anton...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps are closing is the model ...


In [36]:
codeup_df['stemmed'] = pd.Series([stem(string) for string in codeup_df.original])
codeup_df

Unnamed: 0,title,original,clean,stemmed
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...,the rumors are true the time has arrived codeu...,the rumor are true! the time ha arrived. codeu...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie Giust\nData Sci...,by dimitri antoniou and maggie giust\ndata sci...,By dimitri antoni and maggi giust data science...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",by dimitri antoniou\na week ago codeup launche...,"By dimitri antoni A week ago, codeup launch ou..."
3,10 Tips to Crush It at the SA Tech Job Fair - ...,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair\nthe third biannual san anton...,SA tech job fair the third bi-annu san antonio...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps are closing is the model ...,competitor bootcamp are closing. Is the model ...


In [37]:
codeup_df['lemmatized'] = pd.Series([lemmatize(string) for string in codeup_df.original])
codeup_df

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...,the rumors are true the time has arrived codeu...,the rumor are true! the time ha arrived. codeu...,The rumor are true! The time ha arrived. Codeu...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie Giust\nData Sci...,by dimitri antoniou and maggie giust\ndata sci...,By dimitri antoni and maggi giust data science...,By Dimitri Antoniou and Maggie Giust Data Scie...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",by dimitri antoniou\na week ago codeup launche...,"By dimitri antoni A week ago, codeup launch ou...","By Dimitri Antoniou A week ago, Codeup launche..."
3,10 Tips to Crush It at the SA Tech Job Fair - ...,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair\nthe third biannual san anton...,SA tech job fair the third bi-annu san antonio...,SA Tech Job Fair The third bi-annual San Anton...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps are closing is the model ...,competitor bootcamp are closing. Is the model ...,Competitor Bootcamps Are Closing. Is the Model...


In [38]:
news_df.sample()

Unnamed: 0,title,content,category
11,India announced triumph over COVID-19 early: U...,Confederation of Indian Industry (CII) Preside...,business


In [41]:
def work_together(df):
    '''Takes in specified dataframe
    renames the content column to original
    creates clean column that take sin each series in original 
        and cleans based on basic_clean funciton
    create stemmed column that takes in each series in cleab
        and gives back a stemmed string based on stem funciton
    create lemmatized column that take sin each series in clean
        and gives back a lemma string based on lemma fucniton
    returns new df'''
    # rename content to original
    df = df.rename(columns={'content':'original'})
    # make the clean column using orignal
    df['clean'] = pd.Series([basic_clean(string) for string in df.original])
    # make stemmed column using clean
    df['stemmed'] = pd.Series([stem(string) for string in df.clean])
    # make lemma column using clean
    df['lemmatized'] = pd.Series([lemmatize(string) for string in df.clean])
    # return new dataframe
    return df

In [42]:
work_together(codeup_df)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...,the rumors are true the time has arrived codeu...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie Giust\nData Sci...,by dimitri antoniou and maggie giust\ndata sci...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",by dimitri antoniou\na week ago codeup launche...,by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...
3,10 Tips to Crush It at the SA Tech Job Fair - ...,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair\nthe third biannual san anton...,sa tech job fair the third biannual san antoni...,sa tech job fair the third biannual san antoni...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps are closing is the model ...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...


In [43]:
work_together(news_df)

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",business,speaking about india's second covid19 wave for...,speak about india' second covid19 wave former ...,speaking about india's second covid19 wave for...
1,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,business,indian commercial pilots association icpa on t...,indian commerci pilot associ icpa on tuesday s...,indian commercial pilot association icpa on tu...
2,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,business,south koreas richest woman hong rahee added an...,south korea richest woman hong rahe ad anoth 7...,south korea richest woman hong rahee added ano...
3,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",business,pandora the world's biggest jeweller has said ...,pandora the world' biggest jewel ha said that ...,pandora the world's biggest jeweller ha said t...
4,IndiGo commences company vaccination drive for...,IndiGo Airlines' Chief HR Officer Raj Raghavan...,business,indigo airlines' chief hr officer raj raghavan...,indigo airlines' chief hr offic raj raghavan s...,indigo airlines' chief hr officer raj raghavan...
...,...,...,...,...,...,...
142,Could abandon Myanmar project if found to viol...,Adani Ports and Special Economic Zone (APSEZ) ...,world,adani ports and special economic zone apsez sa...,adani port and special econom zone apsez said ...,adani port and special economic zone apsez sai...
143,Myanmar's military govt bans satellite TV citi...,Myanmar's military government has announced a ...,world,myanmar's military government has announced a ...,myanmar' militari govern ha announc a ban on s...,myanmar's military government ha announced a b...
144,"US, Japan agree to unite in response to China:...",The United States and Japan have agreed to ste...,world,the united states and japan have agreed to ste...,the unit state and japan have agre to step up ...,the united state and japan have agreed to step...
145,Chinese Navy to help Indonesia salvage sunken ...,Chinese Navy ships have arrived in Indonesia t...,world,chinese navy ships have arrived in indonesia t...,chines navi ship have arriv in indonesia to he...,chinese navy ship have arrived in indonesia to...


9. ask yourself:
- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    - lemma
-  If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - stem
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
    - stem