## Imports

In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire as a

## Acquire the data

In [2]:
urls = a.get_blog_urls()
articles = a.get_news_articles()

---

## Exercises
 Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [3]:
string = articles.content[0]
string

"The rupee hit a record low of 79.97 against the US dollar on Monday after opening at 79.76. The Finance Ministry, while speaking about the matter said that global factors such as the Russia-Ukraine war, soaring crude oil prices and tightening of global financial conditions are the major reasons for the rupee's weakening."

In [4]:
# creating the function
def basic_clean(string):
    
    # lowercase everything
    string = string.lower()
    
    # remove inconsistenceis
    # encode into ascii byte strings
    # decode back into UTF-8
    # (This process will normalize the unicode characters)
    
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('UTF-8')
    
    # replace anything that is not a letter, number, whitespace, etc
    # use regex to perform this operation
    string = re.sub(r"[^a-z0-9\s]", '', string)
    
    return string

In [5]:
cleaned_string = basic_clean(string)
cleaned_string

'the rupee hit a record low of 7997 against the us dollar on monday after opening at 7976 the finance ministry while speaking about the matter said that global factors such as the russiaukraine war soaring crude oil prices and tightening of global financial conditions are the major reasons for the rupees weakening'

---

Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [6]:
def tokenize(string):
    """
    This function will take in a string, tokenize the string and 
    return the tokenize string
    """
    
    #create the token
    token = nltk.tokenize.ToktokTokenizer()
    
    #Use the token
    string = token.tokenize(string,  return_str=True)
    
    return string


In [7]:
token = tokenize(cleaned_string)
token

'the rupee hit a record low of 7997 against the us dollar on monday after opening at 7976 the finance ministry while speaking about the matter said that global factors such as the russiaukraine war soaring crude oil prices and tightening of global financial conditions are the major reasons for the rupees weakening'

---

Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [8]:
def stem(string):
    """
    This function will accept some text(string) and return a stemmed 
    version of the text
    """
    
    #create the porter stem
    ps = nltk.porter.PorterStemmer()
    
    #Apply the stem to each work in the string and create a list
    # of steemed words
    
    stem = [ps.stem(word) for word in string.split()]
    
    # rejoin the string together
    stemmed_string = ' '.join(stem)
    
    return stemmed_string

In [9]:
#test
stemmed_string = stem(token)
stemmed_string

'the rupe hit a record low of 7997 against the us dollar on monday after open at 7976 the financ ministri while speak about the matter said that global factor such as the russiaukrain war soar crude oil price and tighten of global financi condit are the major reason for the rupe weaken'

---

Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [10]:
# download wornet lemmatized
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/deangelobowen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
def lemmatize(string):
    """This function takes in a string and returns a lmeeatized 
    version of the string"""
    
    # create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    string_lemmatize = ' '.join(lemmas)
    
    return string_lemmatize

In [12]:
lemmatized = lemmatize(token)
lemmatized

'the rupee hit a record low of 7997 against the u dollar on monday after opening at 7976 the finance ministry while speaking about the matter said that global factor such a the russiaukraine war soaring crude oil price and tightening of global financial condition are the major reason for the rupee weakening'

---

Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [13]:
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    """
    This function will take in a string, filter out stop words from the nltk standard english list 
    as well as any other extra words, and return a version of the text without these stopwords.
    It includes optional paramaters allowing the user to add extra words to remove 
    or to exclude words from the stopword list.
    """
    
    #get english stopwords from nltk
    stop_words = stopwords.words('english')
    
    #Add extra words to be removed to the stop word list
    for word in extra_words:
        stop_words.append(word)
    
    #Remove words to be excluded from the stop word list
    for word in exclude_words:
        stop_words.remove(word)
    
    #Create a list of words to be checked by splitting the string
    words = string.split()
    
    #Filter out all of the stop words
    filtered_words = [word for word in words if word not in stop_words]
    
    #Join the list of filtered words into a string
    filtered_string = ' '.join(filtered_words)
    
    return filtered_string


In [14]:
removed_stopwords = remove_stopwords(token)
removed_stopwords

'rupee hit record low 7997 us dollar monday opening 7976 finance ministry speaking matter said global factors russiaukraine war soaring crude oil prices tightening global financial conditions major reasons rupees weakening'

In [15]:
# Test using extra_words and exclude_words options
extra_words = ['russia', 'matter']
exclude_words = ['the']

filters = remove_stopwords(token, extra_words, exclude_words)
filters

'the rupee hit record low 7997 the us dollar monday opening 7976 the finance ministry speaking the said global factors the russiaukraine war soaring crude oil prices tightening global financial conditions the major reasons the rupees weakening'

---

Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [16]:
news_df = a.get_news_articles()
news_df

Unnamed: 0,category,title,author,date,content
0,business,Rupee hits record low of 79.97 against US dollar,Ridham Gambhir,2022-07-18,The rupee hit a record low of 79.97 against th...
1,business,Rupee closes at an all-time low of 79.98 again...,Ridham Gambhir,2022-07-18,The rupee on Monday hit a fresh record low as ...
2,business,"BCCI had ₹40 cr in bank when I joined & ₹47,68...",Ridham Gambhir,2022-07-17,"In an Instagram post, Lalit Modi asserted that..."
3,business,A fighter to the core: Mahindra praises PV Sin...,Ridham Gambhir,2022-07-17,Businessman Anand Mahindra took to Twitter to ...
4,business,It is not a sacrifice at all: Bill Gates on pl...,Hiral Goyal,2022-07-17,"Microsoft Co-founder Bill Gates, who plans to ..."
...,...,...,...,...,...
95,entertainment,Excited to announce my next film starring Kart...,Amartya Sharma,2022-07-18,Filmmaker Kabir Khan took to Instagram to anno...
96,entertainment,8-yr-old Aditya Patil wins Dance Deewane Junio...,Udit Gupta,2022-07-18,Aditya Patil has won the first season of the t...
97,entertainment,Grace & style of Dhanush is something to behol...,Amartya Sharma,2022-07-18,"Regé-Jean Page, speaking about Dhanush in 'The..."
98,entertainment,"Rajesh Khanna was very generous, he bought hou...",Udit Gupta,2022-07-18,"Veteran actress Sharmila Tagore, who appeared ..."


Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [17]:
codeup_df = a.get_blog_articles()
codeup_df

Using cached .json file


Unnamed: 0,title,date,category,content
0,In-Person Workshop: Learn to Code – Python on ...,2022-06-20,San Antonio,"According to LinkedIn, the “#1 Most Promising ..."
1,Free JavaScript Workshop at Codeup Dallas on 6/28,2022-06-19,Dallas,Event Info: Location – Codeup Dallas Time – 6...
2,Is Our Cloud Administration Program Right for ...,2022-06-08,Featured,Changing careers can be scary. The first thing...
3,PRIDE in Tech Panel,2022-06-05,Dallas,"In celebration of PRIDE month, join our Codeup..."
4,Inclusion at Codeup During Pride Month (and Al...,2022-06-01,Codeup News,Happy Pride Month! Pride Month is a dedicated ...
5,Mental Health First Aid Training,2022-05-31,Tips for Prospective Students,"As a student of Codeup, going through a massiv..."
6,Codeup Dallas: How to Succeed at a Coding Boot...,2022-05-23,Dallas,This event is the perfect opportunity for peop...
7,5 Reasons To Attend Our New Cloud Administrati...,2022-05-17,Codeup News,Come Work In The Cloud When your Monday rolls ...
8,Learn to Code: Python on 5/21,2022-05-16,Events,"According to LinkedIn, the “#1 Most Promising ..."
9,Codeup Dallas Joins Career Day,2022-05-16,Codeup News,Last week our Codeup Dallas team participated ...


---

For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [18]:
# Rename the content columns to original 
news_df.rename(columns = {'content':'original'}, inplace = True)
codeup_df.rename(columns = {'content':'original'}, inplace = True)

In [19]:
## Starting with the news_df

news_df['clean'] = news_df['original']

#apply the basic_clean, tokenize, and remove_stopwords functions
news_df['clean'] = news_df['clean'].apply(basic_clean).apply(tokenize).apply(remove_stopwords)

#create the stemmed column
news_df['stemmed'] = news_df['clean']

#apply the stem function
news_df['stemmed'] = news_df['stemmed'].apply(stem)

#create the lematize column
news_df['lemmatized'] = news_df['clean']

#apply the lemmatize function
news_df['lemmatized'] = news_df['lemmatized'].apply(lemmatize)

news_df

Unnamed: 0,category,title,author,date,original,clean,stemmed,lemmatized
0,business,Rupee hits record low of 79.97 against US dollar,Ridham Gambhir,2022-07-18,The rupee hit a record low of 79.97 against th...,rupee hit record low 7997 us dollar monday ope...,rupe hit record low 7997 us dollar monday open...,rupee hit record low 7997 u dollar monday open...
1,business,Rupee closes at an all-time low of 79.98 again...,Ridham Gambhir,2022-07-18,The rupee on Monday hit a fresh record low as ...,rupee monday hit fresh record low ended closer...,rupe monday hit fresh record low end closer 80...,rupee monday hit fresh record low ended closer...
2,business,"BCCI had ₹40 cr in bank when I joined & ₹47,68...",Ridham Gambhir,2022-07-17,"In an Instagram post, Lalit Modi asserted that...",instagram post lalit modi asserted neither too...,instagram post lalit modi assert neither took ...,instagram post lalit modi asserted neither too...
3,business,A fighter to the core: Mahindra praises PV Sin...,Ridham Gambhir,2022-07-17,Businessman Anand Mahindra took to Twitter to ...,businessman anand mahindra took twitter praise...,businessman anand mahindra took twitter prais ...,businessman anand mahindra took twitter praise...
4,business,It is not a sacrifice at all: Bill Gates on pl...,Hiral Goyal,2022-07-17,"Microsoft Co-founder Bill Gates, who plans to ...",microsoft cofounder bill gates plans give away...,microsoft cofound bill gate plan give away wea...,microsoft cofounder bill gate plan give away w...
...,...,...,...,...,...,...,...,...
95,entertainment,Excited to announce my next film starring Kart...,Amartya Sharma,2022-07-18,Filmmaker Kabir Khan took to Instagram to anno...,filmmaker kabir khan took instagram announce u...,filmmak kabir khan took instagram announc upco...,filmmaker kabir khan took instagram announce u...
96,entertainment,8-yr-old Aditya Patil wins Dance Deewane Junio...,Udit Gupta,2022-07-18,Aditya Patil has won the first season of the t...,aditya patil first season talent reality show ...,aditya patil first season talent realiti show ...,aditya patil first season talent reality show ...
97,entertainment,Grace & style of Dhanush is something to behol...,Amartya Sharma,2022-07-18,"Regé-Jean Page, speaking about Dhanush in 'The...",regejean page speaking dhanush gray man said g...,regejean page speak dhanush gray man said grac...,regejean page speaking dhanush gray man said g...
98,entertainment,"Rajesh Khanna was very generous, he bought hou...",Udit Gupta,2022-07-18,"Veteran actress Sharmila Tagore, who appeared ...",veteran actress sharmila tagore appeared rajes...,veteran actress sharmila tagor appear rajesh k...,veteran actress sharmila tagore appeared rajes...


In [20]:
## Now apply the same to the codeup_df

codeup_df['clean'] = codeup_df['original']

#apply the basic_clean, tokenize, and remove_stopwords functions
codeup_df['clean'] = codeup_df['clean'].apply(basic_clean).apply(tokenize).apply(remove_stopwords)

#create the stemmed column
codeup_df['stemmed'] = codeup_df['clean']

#apply the stem function
codeup_df['stemmed'] = codeup_df['stemmed'].apply(stem)

#create the lematize column
codeup_df['lemmatized'] = codeup_df['clean']

#apply the lemmatize function
codeup_df['lemmatized'] = codeup_df['lemmatized'].apply(lemmatize)

codeup_df

Unnamed: 0,title,date,category,original,clean,stemmed,lemmatized
0,In-Person Workshop: Learn to Code – Python on ...,2022-06-20,San Antonio,"According to LinkedIn, the “#1 Most Promising ...",according linkedin 1 promising job data scienc...,accord linkedin 1 promis job data scienc one m...,according linkedin 1 promising job data scienc...
1,Free JavaScript Workshop at Codeup Dallas on 6/28,2022-06-19,Dallas,Event Info: Location – Codeup Dallas Time – 6...,event info location codeup dallas time 6 pm co...,event info locat codeup dalla time 6 pm come l...,event info location codeup dallas time 6 pm co...
2,Is Our Cloud Administration Program Right for ...,2022-06-08,Featured,Changing careers can be scary. The first thing...,changing careers scary first thing may asking ...,chang career scari first thing may ask begin l...,changing career scary first thing may asking b...
3,PRIDE in Tech Panel,2022-06-05,Dallas,"In celebration of PRIDE month, join our Codeup...",celebration pride month join codeup alumni lgb...,celebr pride month join codeup alumni lgbtqia ...,celebration pride month join codeup alumnus lg...
4,Inclusion at Codeup During Pride Month (and Al...,2022-06-01,Codeup News,Happy Pride Month! Pride Month is a dedicated ...,happy pride month pride month dedicated time c...,happi pride month pride month dedic time celeb...,happy pride month pride month dedicated time c...
5,Mental Health First Aid Training,2022-05-31,Tips for Prospective Students,"As a student of Codeup, going through a massiv...",student codeup going massive career transition...,student codeup go massiv career transit mental...,student codeup going massive career transition...
6,Codeup Dallas: How to Succeed at a Coding Boot...,2022-05-23,Dallas,This event is the perfect opportunity for peop...,event perfect opportunity people wondering exp...,event perfect opportun peopl wonder expect cod...,event perfect opportunity people wondering exp...
7,5 Reasons To Attend Our New Cloud Administrati...,2022-05-17,Codeup News,Come Work In The Cloud When your Monday rolls ...,come work cloud monday rolls around start get ...,come work cloud monday roll around start get s...,come work cloud monday roll around start get s...
8,Learn to Code: Python on 5/21,2022-05-16,Events,"According to LinkedIn, the “#1 Most Promising ...",according linkedin 1 promising job data scienc...,accord linkedin 1 promis job data scienc one m...,according linkedin 1 promising job data scienc...
9,Codeup Dallas Joins Career Day,2022-05-16,Codeup News,Last week our Codeup Dallas team participated ...,last week codeup dallas team participated care...,last week codeup dalla team particip career da...,last week codeup dallas team participated care...
