In [1]:
import unicodedata
import re
import json
import os
from requests import get
from bs4 import BeautifulSoup

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
import numpy as np

import acquire as ac


## Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

### Lowercase everything
### Normalize unicode characters
### Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(text):
    text = text.lower()
    text = unicodedata.normalize('NFKD', text)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    text = re.sub(r"[^a-z0-9'\s]", '', text)
    return text

## Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [3]:
def tokenize(string):
    tokenizer = nltk.tokenize.ToktokTokenizer()

    string= tokenizer.tokenize(string, return_str=True)
    return string


## Define a function named stem. It should accept some text and return the text after applying stemming to all the words.



In [4]:
def stem(text):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in text.split()]
    text = ' '.join(stems)
    return text 

## Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.



In [5]:
def lemmatize(text):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in text.split()]
    text = ' '.join(lemmas)
    return text

## Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

## This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [6]:
def remove_stopwords(text, extra_words=None, exclude_words=None):
    stopword_list = stopwords.words('english')
    
    stopword_list.remove(exclude_words)
    words = text.split()
    filtered_words = [w for w in extra_words if w not in stopword_list]
    
    text = ' '.join(filtered_words)
    return text
    

## Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.



In [7]:
news_df=ac.get_news_articles(['business','sports','technology','entertainment'])

In [8]:
news_df=pd.DataFrame(news_df)
news_df

Unnamed: 0,title,text,category
0,All Adani stocks end higher for the first time...,All 10 Adani Group stocks closed higher on Wed...,business
1,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...,business
2,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...,business
3,Adani secures $3 bn credit from a sovereign we...,Adani Group has reportedly told creditors it h...,business
4,We can score a century for progress: Gates on ...,Microsoft Co-founder Bill Gates shared a messa...,business
...,...,...,...
95,Nandita said she won't make Zwigato with a sta...,Comedian-actor Kapil Sharma said that when Nan...,entertainment
96,His wife Aaliya has tolerated a lot: Shamas on...,Actor Nawazuddin Siddiqui's brother Shamas Naw...,entertainment
97,We'd end up matching clothes without planning:...,Speaking about the similarities that she found...,entertainment
98,"Kartik Aaryan announces 'Bhool Bhulaiyaa 3', t...",Actor Kartik Aaryan took to Instagram and anno...,entertainment


## Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.



In [9]:
codeup_df=ac.get_blog_articles()
codeup_df=pd.DataFrame(codeup_df)

In [10]:
codeup_df

Unnamed: 0,title,date_published,content
0,Coding Bootcamp or Self-Learning? Which is Bes...,"Jan 20, 2023",\nIf you’re interested in embarking on a caree...
1,Codeup Among Top 58 Best Coding Bootcamps of 2023,"Jan 12, 2023",\nCodeup is pleased to announce we have been r...
2,Black excellence in tech: Panelist Spotlight –...,"Feb 6, 2023",\nBlack excellence in tech: Panelist Spotlight...
3,Black excellence in tech: Panelist Spotlight –...,"Feb 10, 2023",\nBlack excellence in tech: Panelist Spotlight...
4,Black excellence in tech: Panelist Spotlight –...,"Feb 13, 2023",\nBlack excellence in tech: Panelist Spotlight...


## For each dataframe, produce the following columns:

* title to hold the title
* original to hold the original article/post content
* clean to hold the normalized and tokenized original with the stopwords removed.
* stemmed to hold the stemmed version of the cleaned data.
* lemmatized to hold the lemmatized version of the cleaned data.

### news

In [11]:
news_df['text']

0     All 10 Adani Group stocks closed higher on Wed...
1     Hours after the central government raised the ...
2     Indian-Americans Punit Renjen and Rajesh Subra...
3     Adani Group has reportedly told creditors it h...
4     Microsoft Co-founder Bill Gates shared a messa...
                            ...                        
95    Comedian-actor Kapil Sharma said that when Nan...
96    Actor Nawazuddin Siddiqui's brother Shamas Naw...
97    Speaking about the similarities that she found...
98    Actor Kartik Aaryan took to Instagram and anno...
99    Kapil Sharma, in a recent episode of 'The Kapi...
Name: text, Length: 100, dtype: object

In [15]:
news_df['tokenized']=news_df['text'].apply(basic_clean)

In [16]:
news_df['tokenized']

0     all 10 adani group stocks closed higher on wed...
1     hours after the central government raised the ...
2     indianamericans punit renjen and rajesh subram...
3     adani group has reportedly told creditors it h...
4     microsoft cofounder bill gates shared a messag...
                            ...                        
95    comedianactor kapil sharma said that when nand...
96    actor nawazuddin siddiqui's brother shamas naw...
97    speaking about the similarities that she found...
98    actor kartik aaryan took to instagram and anno...
99    kapil sharma in a recent episode of 'the kapil...
Name: tokenized, Length: 100, dtype: object

In [17]:
news_df['tokenized']=news_df['tokenized'].apply(tokenize)

In [18]:
news_df['tokenized']

0     all 10 adani group stocks closed higher on wed...
1     hours after the central government raised the ...
2     indianamericans punit renjen and rajesh subram...
3     adani group has reportedly told creditors it h...
4     microsoft cofounder bill gates shared a messag...
                            ...                        
95    comedianactor kapil sharma said that when nand...
96    actor nawazuddin siddiqui ' s brother shamas n...
97    speaking about the similarities that she found...
98    actor kartik aaryan took to instagram and anno...
99    kapil sharma in a recent episode of ' the kapi...
Name: tokenized, Length: 100, dtype: object

In [21]:
news_df['stemmed']=news_df['tokenized'].apply(stem)

In [22]:
news_df['stemmed']

0     all 10 adani group stock close higher on wedne...
1     hour after the central govern rais the price o...
2     indianamerican punit renjen and rajesh subrama...
3     adani group ha reportedli told creditor it ha ...
4     microsoft cofound bill gate share a messag on ...
                            ...                        
95    comedianactor kapil sharma said that when nand...
96    actor nawazuddin siddiqui ' s brother shama na...
97    speak about the similar that she found with he...
98    actor kartik aaryan took to instagram and anno...
99    kapil sharma in a recent episod of ' the kapil...
Name: stemmed, Length: 100, dtype: object

In [27]:
news_df['lemmatized']=news_df['tokenized'].apply(lemmatize)

In [28]:
news_df['lemmatized']

0     all 10 adani group stock closed higher on wedn...
1     hour after the central government raised the p...
2     indianamericans punit renjen and rajesh subram...
3     adani group ha reportedly told creditor it ha ...
4     microsoft cofounder bill gate shared a message...
                            ...                        
95    comedianactor kapil sharma said that when nand...
96    actor nawazuddin siddiqui ' s brother shamas n...
97    speaking about the similarity that she found w...
98    actor kartik aaryan took to instagram and anno...
99    kapil sharma in a recent episode of ' the kapi...
Name: lemmatized, Length: 100, dtype: object

In [29]:
news_df.head()

Unnamed: 0,title,text,category,tokenized,stemmed,lemmatized
0,All Adani stocks end higher for the first time...,All 10 Adani Group stocks closed higher on Wed...,business,all 10 adani group stocks closed higher on wed...,all 10 adani group stock close higher on wedne...,all 10 adani group stock closed higher on wedn...
1,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...,business,hours after the central government raised the ...,hour after the central govern rais the price o...,hour after the central government raised the p...
2,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...,business,indianamericans punit renjen and rajesh subram...,indianamerican punit renjen and rajesh subrama...,indianamericans punit renjen and rajesh subram...
3,Adani secures $3 bn credit from a sovereign we...,Adani Group has reportedly told creditors it h...,business,adani group has reportedly told creditors it h...,adani group ha reportedli told creditor it ha ...,adani group ha reportedly told creditor it ha ...
4,We can score a century for progress: Gates on ...,Microsoft Co-founder Bill Gates shared a messa...,business,microsoft cofounder bill gates shared a messag...,microsoft cofound bill gate share a messag on ...,microsoft cofounder bill gate shared a message...


### codeup_df

In [30]:
codeup_df['tokenized']=codeup_df['content'].apply(basic_clean)

In [31]:
codeup_df['tokenized']=codeup_df['content'].apply(tokenize)

In [32]:
codeup_df['stemmed']=codeup_df['content'].apply(stem)

In [33]:
codeup_df['lemmatized']=codeup_df['content'].apply(lemmatize)

In [34]:
codeup_df.head()

Unnamed: 0,title,date_published,content,tokenized,stemmed,lemmatized
0,Coding Bootcamp or Self-Learning? Which is Bes...,"Jan 20, 2023",\nIf you’re interested in embarking on a caree...,If you ’ re interested in embarking on a caree...,if you’r interest in embark on a career in tec...,If you’re interested in embarking on a career ...
1,Codeup Among Top 58 Best Coding Bootcamps of 2023,"Jan 12, 2023",\nCodeup is pleased to announce we have been r...,Codeup is pleased to announce we have been ran...,codeup is pleas to announc we have been rank a...,Codeup is pleased to announce we have been ran...
2,Black excellence in tech: Panelist Spotlight –...,"Feb 6, 2023",\nBlack excellence in tech: Panelist Spotlight...,Black excellence in tech : Panelist Spotlight ...,black excel in tech: panelist spotlight – jean...,Black excellence in tech: Panelist Spotlight –...
3,Black excellence in tech: Panelist Spotlight –...,"Feb 10, 2023",\nBlack excellence in tech: Panelist Spotlight...,Black excellence in tech : Panelist Spotlight ...,black excel in tech: panelist spotlight – jame...,Black excellence in tech: Panelist Spotlight –...
4,Black excellence in tech: Panelist Spotlight –...,"Feb 13, 2023",\nBlack excellence in tech: Panelist Spotlight...,Black excellence in tech : Panelist Spotlight ...,black excel in tech: panelist spotlight – step...,Black excellence in tech: Panelist Spotlight –...


## Ask yourself:

* If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    * lemmatized
* If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    * either one would be ok
* If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
    * stemmed data