In [1]:
import pandas as pd
import numpy as np
import unicodedata
import re
import nltk
from nltk.corpus import stopwords
import prepare
import acquire

from requests import get
from bs4 import BeautifulSoup
import os


# Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

Lowercase everything
Normalize unicode characters
Replace anything that is not a letter, number, whitespace, or a single quote with an empty string (i.e., remove it).
Define a function named tokenize. It should take in a string and tokenize all the words in the string.

Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define, respectively, any additional stop words to include and any words that we don't want to remove.

In [2]:
def basic_clean(text):
    """
    Apply basic normalization to a string.

    Steps, in order:
      1. lowercase the text,
      2. decompose unicode characters (NFKD) and drop anything that cannot be
         encoded as ASCII (e.g. combining accent marks),
      3. strip every character that is not a lowercase letter, digit,
         single quote, or whitespace.

    Returns the cleaned string.
    """
    lowered = text.lower()
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip with errors='ignore' then discards the marks.
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8')
    return re.sub(r'[^a-z0-9\'\s]', '', ascii_text)

In [3]:
def tokenize(text):
    """
    Tokenize a string with NLTK's ToktokTokenizer.

    The tokenizer is asked for a string result (return_str=True), so the
    return value is a single string rather than a list of tokens.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(text, return_str=True)

In [4]:
def stem(text):
    """
    Apply Porter stemming to every whitespace-separated word in text.

    Returns the stemmed words joined back together with single spaces.
    """
    porter = nltk.porter.PorterStemmer()
    return ' '.join(map(porter.stem, text.split()))

In [5]:
def lemmatize(text):
    """
    Apply WordNet lemmatization to every whitespace-separated word in text.

    Returns the lemmatized words joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    pieces = []
    for token in text.split():
        pieces.append(lemmatizer.lemmatize(token))
    return ' '.join(pieces)

In [16]:
def remove_stopwords(text, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from a whitespace-delimited string.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words that must NOT be treated as stopwords. Words that are not in
        the stopword list to begin with are ignored instead of raising
        ValueError. Exclusion wins if a word appears in both lists.

    Returns
    -------
    str
        The remaining words joined with single spaces.
    """
    # A set gives constant-time membership checks for every token and makes
    # adding/removing words insensitive to duplicates.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words or ())
    # difference_update silently skips words that are absent, unlike
    # list.remove which raises ValueError.
    stopword_set.difference_update(exclude_words or ())

    kept = [word for word in text.split() if word not in stopword_set]
    return ' '.join(kept)

In [7]:
# Sample sentence containing accented characters, punctuation and quotes,
# used below to exercise each text-preparation function.
original = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [9]:
# Normalize the sample sentence (lowercase, ASCII-fold, strip punctuation) and display it.
clean = basic_clean(original)
clean

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [11]:
# Tokenize the cleaned sentence; tokenize() returns a string, so it displays directly.
tokened = tokenize(clean)
tokened

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [12]:
# Stem every word of the tokenized sentence and display the result.
stemmed = stem(tokened)
stemmed

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

In [14]:
# Lemmatize every word of the tokenized sentence (alternative to stemming) and display it.
lemmatized = lemmatize(tokened)
lemmatized

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

In [18]:
# Default call: only the standard English stopword list is applied.
remove_stopwords(lemmatized)

"paul erdos george polya influential hungarian mathematician contributed lot field erdos ' name contains hungarian letter ' ' ' ' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

In [23]:
# Custom lists for the next call: 'double' is added as a stopword,
# while 'the' and 'o' are to be kept in the text.
extra_words=['double']
exclude_words=['the', 'o']

In [24]:
# Same text, this time passing the custom extra/exclude word lists.
remove_stopwords(lemmatized, extra_words, exclude_words)

"paul erdos george polya influential hungarian mathematician contributed lot the field erdos ' name contains the hungarian letter ' o ' ' o ' acute accent often incorrectly written erdos erdos either mistake typographical necessity"

In [2]:
# Load the news articles via the acquire module imported at the top of the
# notebook; its implementation is not shown here.
news_df = acquire.get_news_articles()

In [3]:
# Show the loaded news articles for a quick visual check.
news_df

Unnamed: 0,title,content,category
0,"British artist live-streams burning 1,000 of h...","British artist Damien Hirst burnt 1,000 painti...",technology
1,"HCL disapproves dual employment, says moonligh...","Speaking on the issue of moonlighting, HCL Tec...",technology
2,"Infosys' attrition drops to 27.1%, net employe...",Infosys on Thursday reported a 1.3% QoQ drop i...,technology
3,We do not support dual employment: Infosys on ...,Infosys CEO Salil Parekh spoke on the moonligh...,technology
4,"Facebook users complain of losing followers, Z...",Several Facebook users have complained about l...,technology
...,...,...,...
95,Love the kind of thriller stories Sriram Ragha...,Actress Katrina Kaif said she loves the kind o...,entertainment
96,Son Ram Charan & I returned 80% fee: Chiranjee...,Actor Chiranjeevi has revealed that he and his...,entertainment
97,Admired Emma in a way I could never explain: '...,"'Harry Potter' actor Tom Felton said he ""loved...",entertainment
98,Emily Ratajkowski says she's 'single' amid rel...,Supermodel Emily Ratajkowski has confirmed tha...,entertainment


In [4]:
# Load the blog articles via the acquire module imported at the top of the
# notebook; its implementation is not shown here.
codeup_df = acquire.get_blog_articles()

In [5]:
# Show the loaded blog articles for a quick visual check.
codeup_df

Unnamed: 0,title,content
0,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...
1,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...
2,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...
3,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio..."
4,What is Cloud Computing and AWS?,With many companies switching to cloud service...
5,2022 SABJ C-Suite Award Winner: Stephen Noteboom,"Codeup’s Chief Operating Officer, Stephen Note..."


# For each dataframe, produce the following columns:

    title to hold the title

    original to hold the original article/post content

    clean to hold the normalized and tokenized original with the stopwords removed.

    stemmed to hold the stemmed version of the cleaned data.

    lemmatized to hold the lemmatized version of the cleaned data.

In [38]:
# Build the prepared text columns for the news articles.
# NOTE(review): this uses the helpers from the imported `prepare` module,
# presumably the same functions defined earlier in this notebook - confirm.

# clean: normalize -> tokenize -> drop stopwords, one step per apply.
news_df['clean'] = (
    news_df['content']
    .apply(prepare.basic_clean)
    .apply(prepare.tokenize)
    .apply(prepare.remove_stopwords)
)
# stemmed and lemmatized are both derived from the clean column.
news_df['stemmed'] = news_df['clean'].apply(prepare.stem)
news_df['lemmatized'] = news_df['clean'].apply(prepare.lemmatize)
news_df

Unnamed: 0,title,content,category,clean,stemmed,lemmatized
0,"British artist live-streams burning 1,000 of h...","British artist Damien Hirst burnt 1,000 painti...",technology,british artist damien hirst burnt 1000 paintin...,british artist damien hirst burnt 1000 paint p...,british artist damien hirst burnt 1000 paintin...
1,"HCL disapproves dual employment, says moonligh...","Speaking on the issue of moonlighting, HCL Tec...",technology,speaking issue moonlighting hcl technologies s...,speak issu moonlight hcl technolog said doesnt...,speaking issue moonlighting hcl technology sai...
2,"Infosys' attrition drops to 27.1%, net employe...",Infosys on Thursday reported a 1.3% QoQ drop i...,technology,infosys thursday reported 13 qoq drop voluntar...,infosi thursday report 13 qoq drop voluntari a...,infosys thursday reported 13 qoq drop voluntar...
3,We do not support dual employment: Infosys on ...,Infosys CEO Salil Parekh spoke on the moonligh...,technology,infosys ceo salil parekh spoke moonlighting de...,infosi ceo salil parekh spoke moonlight debat ...,infosys ceo salil parekh spoke moonlighting de...
4,"Facebook users complain of losing followers, Z...",Several Facebook users have complained about l...,technology,several facebook users complained losing major...,sever facebook user complain lose major follow...,several facebook user complained losing majori...
...,...,...,...,...,...,...
95,Love the kind of thriller stories Sriram Ragha...,Actress Katrina Kaif said she loves the kind o...,entertainment,actress katrina kaif said loves kind thriller ...,actress katrina kaif said love kind thriller s...,actress katrina kaif said love kind thriller s...
96,Son Ram Charan & I returned 80% fee: Chiranjee...,Actor Chiranjeevi has revealed that he and his...,entertainment,actor chiranjeevi revealed son ram charan retu...,actor chiranjeevi reveal son ram charan return...,actor chiranjeevi revealed son ram charan retu...
97,Admired Emma in a way I could never explain: '...,"'Harry Potter' actor Tom Felton said he ""loved...",entertainment,' harry potter ' actor tom felton said loved a...,' harri potter ' actor tom felton said love ad...,' harry potter ' actor tom felton said loved a...
98,Emily Ratajkowski says she's 'single' amid rel...,Supermodel Emily Ratajkowski has confirmed tha...,entertainment,supermodel emily ratajkowski confirmed single ...,supermodel emili ratajkowski confirm singl ami...,supermodel emily ratajkowski confirmed single ...


In [36]:
# Build the prepared text columns for the blog articles.
# NOTE(review): this uses the helpers from the imported `prepare` module,
# presumably the same functions defined earlier in this notebook - confirm.

# clean: normalize -> tokenize -> drop stopwords, one step per apply.
codeup_df['clean'] = (
    codeup_df['content']
    .apply(prepare.basic_clean)
    .apply(prepare.tokenize)
    .apply(prepare.remove_stopwords)
)
# stemmed and lemmatized are both derived from the clean column.
codeup_df['stemmed'] = codeup_df['clean'].apply(prepare.stem)
codeup_df['lemmatized'] = codeup_df['clean'].apply(prepare.lemmatize)


In [37]:
# Show the blog frame with the newly added clean/stemmed/lemmatized columns.
codeup_df

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...,codeup excited launch first diversity equity i...,codeup excit launch first divers equiti inclus...,codeup excited launch first diversity equity i...
1,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...,codeup named 2022 diversity inclusion award wi...,codeup name 2022 divers inclus award winner sa...,codeup named 2022 diversity inclusion award wi...
2,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...,deciding transition tech career big step signi...,decid transit tech career big step signific co...,deciding transition tech career big step signi...
3,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio...",codeup strongly values diversity inclusion hon...,codeup strongli valu divers inclus honor ameri...,codeup strongly value diversity inclusion hono...
4,What is Cloud Computing and AWS?,With many companies switching to cloud service...,many companies switching cloud services implem...,mani compani switch cloud servic implement clo...,many company switching cloud service implement...
5,2022 SABJ C-Suite Award Winner: Stephen Noteboom,"Codeup’s Chief Operating Officer, Stephen Note...",codeups chief operating officer stephen notebo...,codeup chief oper offic stephen noteboom 2022 ...,codeups chief operating officer stephen notebo...
