In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire as aq

#### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [19]:
mess = "A lazy 7 DoG's that HOppEd 15 tImes! @ 10 O'Clock the cat t[ook a ;shot. but WHY???"

In [27]:
def basic_clean(text):
    text = text.lower()
    text = unicodedata.normalize('NFKD', text)\
        .encode('ascii', 'ignore')\
        .decode('utf-8', 'ignore')
    text = re.sub(r"[^a-z0-9\s]", '', text)
    return text
    
    
    

In [20]:
clean = basic_clean(mess)
clean

'a lazy 7 dogs that hopped 15 times  10 oclock the cat took a shot but why'

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [3]:
def tokenize(text):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(text, return_str=True)

In [21]:
tokened = tokenize(clean)
tokened

'a lazy 7 dogs that hopped 15 times 10 oclock the cat took a shot but why'

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [22]:
def stem(text):
    ps = nltk.porter.PorterStemmer()
    words = text.split()
    stems = [ps.stem(word) for word in words]
    return ' '.join(stems)

In [23]:
stemmed = stem(tokened)
stemmed

'a lazi 7 dog that hop 15 time 10 oclock the cat took a shot but whi'

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [5]:
def lemmatize(text):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in text.split()]
    return ' '.join(lemmas)

In [24]:
lemmed = lemmatize(tokened)
lemmed

'a lazy 7 dog that hopped 15 time 10 oclock the cat took a shot but why'

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [6]:
def remove_stopwords(text, stopword_list=stopwords.words('english')):
    words = text.split()
    filtered_words = [word for word in words if word not in stopword_list]
    return ' '.join(filtered_words)

In [25]:
stm = remove_stopwords(stemmed)
stm

'lazi 7 dog hop 15 time 10 oclock cat took shot whi'

In [26]:
lm = remove_stopwords(lemmed)
lm

'lazy 7 dog hopped 15 time 10 oclock cat took shot'

### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [7]:
news_df = aq.get_news_articles()

Reading from local CSV...


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [8]:
codeup_df = aq.get_blog_articles()

Reading from local CSV...


### 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [28]:
def nlp_prep(df):
    df = df.rename(columns={'content':'original'})
    df['clean'] = (df.original.apply(basic_clean)
                     .apply(tokenize)
                     .apply(remove_stopwords)
                  )
    df['stemmed'] = df.clean.apply(stem)
    df['lemmatized'] = df.clean.apply(lemmatize)
    return df

In [10]:
news_df.head(1)

Unnamed: 0,title,author,content,category
0,India's GDP grows at 13.5% in first quarter of...,Anmol Sharma,India's GDP grew at 13.5% in the first quarter...,business


In [29]:
news_df = nlp_prep(news_df)
news_df.head()

Unnamed: 0,title,author,original,category,clean,stemmed,lemmatized
0,India's GDP grows at 13.5% in first quarter of...,Anmol Sharma,India's GDP grew at 13.5% in the first quarter...,business,indias gdp grew 135 first quarter fy23 achievi...,india gdp grew 135 first quarter fy23 achiev f...,india gdp grew 135 first quarter fy23 achievin...
1,"Snap to lay off 20% of staff, cancel several p...",Ananya Goyal,Snap said on Wednesday it will lay off 20% of ...,business,snap said wednesday lay 20 staff shut original...,snap said wednesday lay 20 staff shut origin s...,snap said wednesday lay 20 staff shut original...
2,2 top executives at Snap quit hours after repo...,Ridham Gambhir,Two senior advertising executives at Snap quit...,business,two senior advertising executives snap quit ho...,two senior advertis execut snap quit hour repo...,two senior advertising executive snap quit hou...
3,Musk seeks to delay Twitter trial to Nov amid ...,Ridham Gambhir,Tesla CEO Elon Musk is seeking to delay the tr...,business,tesla ceo elon musk seeking delay trial twitte...,tesla ceo elon musk seek delay trial twitter n...,tesla ceo elon musk seeking delay trial twitte...
4,Viral video shows Amazon parcels thrown out of...,Apaar Sharma,A video from Guwahati railway station has gone...,business,video guwahati railway station gone viral show...,video guwahati railway station gone viral show...,video guwahati railway station gone viral show...


In [30]:
codeup_df = nlp_prep(codeup_df)
codeup_df.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Is a Career in Tech Recession-Proof?,"Given the current economic climate, many econo...",given current economic climate many economists...,given current econom climat mani economist con...,given current economic climate many economist ...
1,Codeup X Superhero Car Show & Comic Con,Codeup had a blast at the San Antonio Superher...,codeup blast san antonio superhero car show co...,codeup blast san antonio superhero car show co...,codeup blast san antonio superhero car show co...
2,What Jobs Can You Get After a Coding Bootcamp?...,If you’re considering a career in web developm...,youre considering career web development dont ...,your consid career web develop dont know expec...,youre considering career web development dont ...
3,Codeup’s New Dallas Campus,Codeup’s Dallas campus has a new location! For...,codeups dallas campus new location two years c...,codeup dalla campu new locat two year codeup o...,codeups dallas campus new location two year co...
4,Codeup TV Commercial,Codeup has officially made its TV debut! Our c...,codeup officially made tv debut community stud...,codeup offici made tv debut commun student sta...,codeup officially made tv debut community stud...
