In [2]:
#standard imports
import pandas as pd
import numpy as np

#parsing data
import re
import unicodedata
import nltk
from nltk.corpus import stopwords

#my acquire
import acquire

In [4]:
article = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

### Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [5]:
def basic_clean(string):
    
    string = string.lower() #lowercasing
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8') #normalizing
    string = re.sub(r'[^a-z0-9\'\s]', '', string) #replace extra things
    
    return string

In [7]:
article = basic_clean(article)
article

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [8]:
def tokenize(string):
    
    tokenize = nltk.tokenize.ToktokTokenizer() #creating the tokenize
    string = tokenize.tokenize(string, return_str=True) #using the tokenize
    
    return string

In [11]:
article = tokenize(article)
article

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [12]:
def stem(string):
    
    ps = nltk.porter.PorterStemmer() #creating my stemmer
    stems = [ps.stem(word) for word in string.split()] #splitting into each word and applying the stemmer
    string = ' '.join(stems) #joining all into one string
    
    return string

In [13]:
stem(article)

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [14]:
def lemmatize(string):
    
    wnl = nltk.stem.WordNetLemmatizer() #creating my lemmatizer
    lemmas = [wnl.lemmatize(word) for word in string.split()] #splitting my string into words and applying the lemma
    string = ' '.join(lemmas) #joining back into one string

    return string

In [15]:
lemmatize(article)

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

### Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords. This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [29]:
list1 = [1,2,3,4,5]
list2 = [5,9,10]

In [30]:
set(list1) - set(list2)

{1, 2, 3, 4}

In [31]:
set(list1).union(list2)

{1, 2, 3, 4, 5, 9, 10}

In [60]:
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    
    stopwords_ls = stopwords.words('english') #defining my stopwords
    
    stopwords_ls = set(stopwords_ls) - set(exclude_words) #removing any stopwords in my exclude list
    stopwords_ls = stopwords_ls.union(set(extra_words)) #adding any stopwards from my extra list
    
    words = string.split() #splitting up my string
    filtered_words = [word for word in words if word not in stopwords_ls] #use listcomp to remove words in stopwords_ls
    string = ' '.join(filtered_words) #joining back to a string
    
    return string

### Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [34]:
articles = acquire.get_news_articles()

pulling articles from https://www.inshorts.com/en/read/business
pulling articles from https://www.inshorts.com/en/read/sports
pulling articles from https://www.inshorts.com/en/read/technology
pulling articles from https://www.inshorts.com/en/read/entertainment


In [49]:
news_df = pd.DataFrame(articles)
news_df

Unnamed: 0,title,content,category
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,business
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...,business
2,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...,business
3,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...,business
4,"Loves India, is a fan of PM: Paytm Founder on ...",Paytm Founder Vijay Shekhar Sharma shared a vi...,business
...,...,...,...
95,"Aamir remembered my 'Laal...' audition, I was ...",Actress Sonam Bajwa recalled that when she had...,entertainment
96,"When Prateik smiles, he looks exactly like Smi...","Actress Shabana Azmi said it was ""deeply emoti...",entertainment
97,"Thought kids at school would laugh at me, said...",Jugal Hansraj said that he had initially rejec...,entertainment
98,"Parents said 'Even cats, dogs are on TV, when ...","Nawazuddin Siddiqui, while talking about his r...",entertainment


### Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [40]:
blogs = acquire.get_blob_articles()

pulling content from https://codeup.com/data-science/math-in-data-science/
pulling content from https://codeup.com/codeup-news/dei-report/
pulling content from https://codeup.com/tips-for-prospective-students/tips-for-women/
pulling content from https://codeup.com/data-science/jobs-after-a-coding-bootcamp-part-1-data-science/
pulling content from https://codeup.com/data-science/why-you-should-become-a-data-scientist/
pulling content from https://codeup.com/featured/financing-career-transition/


In [43]:
codeup_df = pd.DataFrame(blogs)
codeup_df

Unnamed: 0,title,date_published,content
0,What are the Math and Stats Principles You Nee...,"Oct 21, 2020","Coming into our Data Science program, you will..."
1,Diversity Equity and Inclusion Report,"Oct 7, 2022",Codeup is excited to launch our first Diversit...
2,Tips for Women Beginning a Career in Tech,"Sep 23, 2022","Codeup strongly values diversity, and inclusio..."
3,What Jobs Can You Get After a Coding Bootcamp?...,"Jul 7, 2022",If you are interested in embarking on a career...
4,Why You Should Become a Data Scientist,"Mar 11, 2021","What do you look for in a career? Chances are,..."
5,How Can I Finance My Career Transition?,"Sep 29, 2022",Deciding to transition into a tech career is a...


### For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [50]:
news_df = news_df.rename(columns={'content':'original'}).drop(columns='category')
news_df.head()

Unnamed: 0,title,original
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...
2,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...
3,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...
4,"Loves India, is a fan of PM: Paytm Founder on ...",Paytm Founder Vijay Shekhar Sharma shared a vi...


In [67]:
news_df['clean'] = news_df.original.apply(basic_clean).apply(tokenize).apply(remove_stopwords)

In [68]:
news_df['stem'] = news_df.clean.apply(stem)
news_df['lemma'] = news_df.clean.apply(lemmatize)

In [69]:
news_df

Unnamed: 0,title,original,clean,stem,lemma
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,benchmark indices sensex nifty ended record cl...,benchmark indic sensex nifti end record close ...,benchmark index sensex nifty ended record clos...
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...,us federal trade commission ftc sued amazon ac...,us feder trade commiss ftc su amazon accus tri...,u federal trade commission ftc sued amazon acc...
2,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...,time magazine released annual list world ' 100...,time magazin releas annual list world ' 100 in...,time magazine released annual list world ' 100...
3,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...,singapore airlines world ' best airline accord...,singapor airlin world ' best airlin accord sky...,singapore airline world ' best airline accordi...
4,"Loves India, is a fan of PM: Paytm Founder on ...",Paytm Founder Vijay Shekhar Sharma shared a vi...,paytm founder vijay shekhar sharma shared vide...,paytm founder vijay shekhar sharma share video...,paytm founder vijay shekhar sharma shared vide...
...,...,...,...,...,...
95,"Aamir remembered my 'Laal...' audition, I was ...",Actress Sonam Bajwa recalled that when she had...,actress sonam bajwa recalled met aamir khan tr...,actress sonam bajwa recal met aamir khan trail...,actress sonam bajwa recalled met aamir khan tr...
96,"When Prateik smiles, he looks exactly like Smi...","Actress Shabana Azmi said it was ""deeply emoti...",actress shabana azmi said deeply emotional sho...,actress shabana azmi said deepli emot shoot sm...,actress shabana azmi said deeply emotional sho...
97,"Thought kids at school would laugh at me, said...",Jugal Hansraj said that he had initially rejec...,jugal hansraj said initially rejected shekhar ...,jugal hansraj said initi reject shekhar kapur ...,jugal hansraj said initially rejected shekhar ...
98,"Parents said 'Even cats, dogs are on TV, when ...","Nawazuddin Siddiqui, while talking about his r...",nawazuddin siddiqui talking role junior artist...,nawazuddin siddiqui talk role junior artist ' ...,nawazuddin siddiqui talking role junior artist...


In [70]:
def clean_df(df, extra_words=[], exclude_words=[]):
    """
    Send in df with columns: title and original,
    returns df with original, clean, stemmed, and lemmatized data
    """
    df['clean'] = df.original\
                        .apply(basic_clean)\
                        .apply(tokenize)\
                        .apply(remove_stopwords, 
                                    extra_words=extra_words,
                                    exclude_words=exclude_words)
    df['stemmed'] = df.clean.apply(stem)
    df['lemmatized'] = df.clean.apply(lemmatize)
    
    return df

In [77]:
codeup_df = pd.DataFrame(blogs)
codeup_df

Unnamed: 0,title,date_published,content
0,What are the Math and Stats Principles You Nee...,"Oct 21, 2020","Coming into our Data Science program, you will..."
1,Diversity Equity and Inclusion Report,"Oct 7, 2022",Codeup is excited to launch our first Diversit...
2,Tips for Women Beginning a Career in Tech,"Sep 23, 2022","Codeup strongly values diversity, and inclusio..."
3,What Jobs Can You Get After a Coding Bootcamp?...,"Jul 7, 2022",If you are interested in embarking on a career...
4,Why You Should Become a Data Scientist,"Mar 11, 2021","What do you look for in a career? Chances are,..."
5,How Can I Finance My Career Transition?,"Sep 29, 2022",Deciding to transition into a tech career is a...


In [78]:
codeup_df = codeup_df.rename(columns={'content':'original'}).drop(columns='date_published')

In [79]:
codeup_df

Unnamed: 0,title,original
0,What are the Math and Stats Principles You Nee...,"Coming into our Data Science program, you will..."
1,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...
2,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio..."
3,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
4,Why You Should Become a Data Scientist,"What do you look for in a career? Chances are,..."
5,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...


In [80]:
clean_df(codeup_df)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,What are the Math and Stats Principles You Nee...,"Coming into our Data Science program, you will...",coming data science program need know math sta...,come data scienc program need know math stat h...,coming data science program need know math sta...
1,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...,codeup excited launch first diversity equity i...,codeup excit launch first divers equiti inclus...,codeup excited launch first diversity equity i...
2,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio...",codeup strongly values diversity inclusion hon...,codeup strongli valu divers inclus honor ameri...,codeup strongly value diversity inclusion hono...
3,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...,interested embarking career tech youre probabl...,interest embark career tech your probabl wonde...,interested embarking career tech youre probabl...
4,Why You Should Become a Data Scientist,"What do you look for in a career? Chances are,...",look career chances youre looking way make use...,look career chanc your look way make use parti...,look career chance youre looking way make use ...
5,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...,deciding transition tech career big step signi...,decid transit tech career big step signific co...,deciding transition tech career big step signi...


### Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?