In [1]:
import pandas as pd
import numpy as np
import unicodedata
import re
import nltk
from nltk.corpus import stopwords
import aquire

In [44]:
original = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

In [3]:
def basic_clean(the_string):
    the_string = the_string.lower()
    the_string = unicodedata.normalize('NFKD', the_string)\
    .encode('ascii', 'ignore')\
    .decode('utf-8')
    the_string = re.sub(r"[^a-z0-9\s]", '', the_string)
    return the_string

In [4]:
the_string = basic_clean(original)
print(the_string)

paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdoss name contains the hungarian letter o o with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity


Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [5]:
def tokenize(the_string):
    tokenize = nltk.tokenize.ToktokTokenizer()
    the_string = tokenize.tokenize(the_string, return_str = True)
    return the_string

In [6]:
tokenize(original)

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős ' s name contains the Hungarian letter ' ő ' ( ' o ' with double acute accent ) , but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [7]:
def stem(the_string):
    ps = nltk.porter.PorterStemmer()
    return ps.stem(the_string)

In [8]:
stem(original)

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necess"

Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [9]:
def lemmatize(the_string):
    wnl = nltk.stem.WordNetLemmatizer()
    return wnl.lemmatize(the_string)

In [10]:
lemmatize(original)

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.


In [61]:
def remove_stopwords(the_string, extra_words = [], exclude_words = []):
    stopwords_ls = stopwords.words('english')
    
    stopwords_ls = set(stopwords_ls) - set(exclude_words)
    stopwords_ls = list(stopwords_ls)
    
    if len(extra_words) > 1:
        stopwords_ls.extend(extra_words)
    elif len(extra_words) == 1:
        stopwords_ls.append(extra_words[0])
        
        
    words = the_string.split()
    filtered = [word for word in words if word not in stopwords_ls]
    return ' '.join(filtered)

Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [12]:
codeup_df = aquire.codeup_blog_posts()
codeup_df = codeup_df.rename(columns = {'content': 'original'})
codeup_df.head()

Unnamed: 0,original,title
0,['May is traditionally known as Asian American...,Spotlight on APIDA Voices: Celebrating Heritag...
1,['Codeup is hosting a Women in Tech Panel in h...,Women in tech: Panelist Spotlight – Magdalena ...
2,['Codeup is hosting a Women in Tech Panel in h...,Women in tech: Panelist Spotlight – Rachel Rob...
3,['Codeup is hosting a Women in Tech Panel in h...,Women in Tech: Panelist Spotlight – Sarah Mellor
4,['Codeup is hosting a Women in Tech Panel in h...,Women in Tech: Panelist Spotlight – Madeleine ...


In [13]:
cleaned_original = codeup_df['original'].apply(basic_clean)
codeup_df['clean'] = cleaned_original
tokenized_original = codeup_df['clean'].apply(tokenize)
codeup_df['clean'] = tokenizedb_original

In [14]:
codeup_df['stemmed'] = codeup_df['clean'].apply(stem)

In [15]:
codeup_df['lemmatized'] = codeup_df['clean'].apply(lemmatize)

In [16]:
codeup_df

Unnamed: 0,original,title,clean,stemmed,lemmatized
0,['May is traditionally known as Asian American...,Spotlight on APIDA Voices: Celebrating Heritag...,may is traditionally known as asian american a...,may is traditionally known as asian american a...,may is traditionally known as asian american a...
1,['Codeup is hosting a Women in Tech Panel in h...,Women in tech: Panelist Spotlight – Magdalena ...,codeup is hosting a women in tech panel in hon...,codeup is hosting a women in tech panel in hon...,codeup is hosting a women in tech panel in hon...
2,['Codeup is hosting a Women in Tech Panel in h...,Women in tech: Panelist Spotlight – Rachel Rob...,codeup is hosting a women in tech panel in hon...,codeup is hosting a women in tech panel in hon...,codeup is hosting a women in tech panel in hon...
3,['Codeup is hosting a Women in Tech Panel in h...,Women in Tech: Panelist Spotlight – Sarah Mellor,codeup is hosting a women in tech panel in hon...,codeup is hosting a women in tech panel in hon...,codeup is hosting a women in tech panel in hon...
4,['Codeup is hosting a Women in Tech Panel in h...,Women in Tech: Panelist Spotlight – Madeleine ...,codeup is hosting a women in tech panel in hon...,codeup is hosting a women in tech panel in hon...,codeup is hosting a women in tech panel in hon...
5,"['', 'Codeup is hosting a Black Excellence in ...",Black Excellence in Tech: Panelist Spotlight –...,codeup is hosting a black excellence in tech p...,codeup is hosting a black excellence in tech p...,codeup is hosting a black excellence in tech p...


In [17]:
news_df = aquire.get_news_articles()
news_df = news_df.rename(columns = {'content': 'original'})
news_df.head()

Unnamed: 0,title,original,catagory
0,Disrespect of Gods won't be tolerated: Union M...,"Amid a controversy over the movie 'Adipurush',...",Entertainment
1,'Jalegi tere baap ki' dialogue revised in 'Adi...,'Adipurush' makers have revised Lord Hanuman's...,Entertainment
2,Was told I'll crack auditions if I wear short ...,'Bigg Boss 16' fame Archana Gautam revisited h...,Entertainment
3,'Adipurush' makers reduce ticket prices for 3D...,'Adipurush' makers on Wednesday announced that...,Entertainment
4,"Asit Modi would pinch my cheeks, say inappropr...",The details of FIR filed against 'Taarak Mehta...,Entertainment


In [21]:
cleaned_original = news_df['original'].apply(basic_clean)
news_df['clean'] = cleaned_original
tokenized_original = news_df['clean'].apply(tokenize)
news_df['clean'] = tokenized_original
news_df['stemmed'] = news_df['clean'].apply(stem)
news_df['lemmatized'] = news_df['clean'].apply(lemmatize)
news_df.head()

Unnamed: 0,title,original,catagory,clean,stemmed,lemmatized
0,Disrespect of Gods won't be tolerated: Union M...,"Amid a controversy over the movie 'Adipurush',...",Entertainment,amid a controversy over the movie adipurush un...,amid a controversy over the movie adipurush un...,amid a controversy over the movie adipurush un...
1,'Jalegi tere baap ki' dialogue revised in 'Adi...,'Adipurush' makers have revised Lord Hanuman's...,Entertainment,adipurush makers have revised lord hanumans di...,adipurush makers have revised lord hanumans di...,adipurush makers have revised lord hanumans di...
2,Was told I'll crack auditions if I wear short ...,'Bigg Boss 16' fame Archana Gautam revisited h...,Entertainment,bigg boss 16 fame archana gautam revisited her...,bigg boss 16 fame archana gautam revisited her...,bigg boss 16 fame archana gautam revisited her...
3,'Adipurush' makers reduce ticket prices for 3D...,'Adipurush' makers on Wednesday announced that...,Entertainment,adipurush makers on wednesday announced that a...,adipurush makers on wednesday announced that a...,adipurush makers on wednesday announced that a...
4,"Asit Modi would pinch my cheeks, say inappropr...",The details of FIR filed against 'Taarak Mehta...,Entertainment,the details of fir filed against taarak mehta ...,the details of fir filed against taarak mehta ...,the details of fir filed against taarak mehta ...


Ask yourself:


If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?

lametized

If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?

lametized

If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

stemmed

In [55]:
original = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [57]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [56]:
remove_stopwords(original)

<class 'list'>
[]


"Paul Erdős George Pólya influential Hungarian mathematicians contributed lot field. Erdős's name contains Hungarian letter 'ő' ('o' double acute accent), often incorrectly written Erdos Erdös either mistake typographical necessity"

In [59]:
remove_stopwords(original,  exclude_words = ['out', 'to', 'were'])

<class 'list'>
['out', 'to', 'were']


"Paul Erdős George Pólya were influential Hungarian mathematicians contributed lot to field. Erdős's name contains Hungarian letter 'ő' ('o' double acute accent), often incorrectly written Erdos Erdös either mistake out typographical necessity"

In [62]:
remove_stopwords(original,  extra_words = ['George','Paul'])

<class 'list'>
[]


"Erdős Pólya influential Hungarian mathematicians contributed lot field. Erdős's name contains Hungarian letter 'ő' ('o' double acute accent), often incorrectly written Erdos Erdös either mistake typographical necessity"