In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

## Exercises
The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace, or a single quote (i.e. replace it with an empty string).


In [2]:
def basic_clean(string):
    """
    Apply basic normalization to a piece of text.

    Steps, in order:
      1. lowercase the text
      2. NFKD-normalize it and drop every character that cannot be encoded as ASCII
      3. strip everything except lowercase letters, digits, single quotes and whitespace

    Returns the cleaned string. Whitespace is left untouched, so runs of
    spaces in the input survive in the output.
    """
    lowered = string.lower()
    # NFKD decomposes characters first, so the ASCII round-trip below keeps
    # whatever ASCII parts the decomposition produced and drops the rest.
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return re.sub(r'[^a-z0-9\'\s]', '', ascii_only)

In [3]:
# Sample sentence used by the demo cells below: it mixes capital letters,
# punctuation, digits, quoted words, a double space and non-ASCII text.
sample_text = "He said they're wanting us to go to room *323, type  #$_sdfkllk_s into the 'keypad' and say سيببسس"
# Show the raw text first, then the cleaned version as the cell's output.
print(sample_text)
basic_clean(sample_text)

He said they're wanting us to go to room *323, type  #$_sdfkllk_s into the 'keypad' and say سيببسس


"he said they're wanting us to go to room 323 type  sdfkllks into the 'keypad' and say "

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [4]:
def tokenize(string):
    """
    Tokenize a string with NLTK's ToktokTokenizer.

    The tokenizer is asked for a string result (return_str=True), so the
    return value is a single string rather than a list of tokens.
    """
    toktok = ToktokTokenizer()
    tokenized = toktok.tokenize(string, return_str=True)
    return tokenized

In [5]:
# Clean the sample sentence, then tokenize the cleaned text.
tokenize(basic_clean(sample_text))

"he said they ' re wanting us to go to room 323 type sdfkllks into the ' keypad ' and say"

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [6]:
def stem(string):
    """
    Stem every whitespace-separated word of the text with NLTK's PorterStemmer.

    Returns the stems joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(map(stemmer.stem, string.split()))

In [7]:
# Clean, tokenize, then stem the sample sentence.
stem(tokenize(basic_clean(sample_text)))

"he said they ' re want us to go to room 323 type sdfkllk into the ' keypad ' and say"

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [8]:
def lemmatize(string):
    """
    Lemmatize every whitespace-separated word of the text with NLTK's
    WordNetLemmatizer.

    No part-of-speech tag is passed, so the lemmatizer's default is used for
    every word. Returns the lemmas joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

In [9]:
# Clean, tokenize, then lemmatize the sample sentence.
lemmatize(tokenize(basic_clean(sample_text)))

"he said they ' re wanting u to go to room 323 type sdfkllks into the ' keypad ' and say"

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [10]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from a piece of text.

    The text is split on whitespace and each token is compared
    case-sensitively against the stopword collection; tokens that match are
    dropped.

    Parameters
    ----------
    string : str
        Text to filter.
    extra_words : str or iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : str or iterable of str, optional
        Words to keep even if they are in the stopword list. Words that are
        not in the list are ignored.

    Returns
    -------
    str
        The remaining tokens joined with single spaces.
    """
    # A lone string means "this one word", not "each of these characters".
    if isinstance(extra_words, str):
        extra_words = [extra_words]
    if isinstance(exclude_words, str):
        exclude_words = [exclude_words]

    # Work on a set: list.append()/list.remove() return None (so their result
    # must never be assigned back), and set updates accept whole collections
    # without raising when an excluded word is absent.
    stopword_set = set(stopwords.words('english'))
    if extra_words is not None:
        stopword_set.update(extra_words)
    if exclude_words is not None:
        stopword_set.difference_update(exclude_words)

    kept_words = [word for word in string.split() if word not in stopword_set]
    return ' '.join(kept_words)

In [11]:
# Note: this runs on the raw sample (no basic_clean / tokenize first), so the
# original capitalization and punctuation reach the stopword filter unchanged.
remove_stopwords(sample_text)

"He said they're wanting us go room *323, type #$_sdfkllk_s 'keypad' say سيببسس"

### 6. Use your data from the acquire module to produce a dataframe of the news articles. Name the dataframe news_df.

In [12]:
# Load the news articles through the project-local acquire module
# (its implementation is not part of this notebook).
news_df = acquire.get_all_inshorts_articles()

In [13]:
# Preview the first rows of the news articles as loaded.
news_df.head()

Unnamed: 0,title,content,category
0,"Rupee edges closer to 80 per US dollar, opens ...",The rupee on Friday opened at a record low of ...,business
1,Rupee drops 9 paise to close at all-time low o...,The rupee declined by 9 paise to close at a ne...,business
2,Musk accused lawyers of causing trouble by see...,"In its lawsuit against Tesla CEO Elon Musk, Tw...",business
3,No plans for company-wide layoffs: Twitter ami...,"In a filing, Twitter has said that it isn't lo...",business
4,Elon has a real record of success: LinkedIn Co...,LinkedIn's billionaire Co-founder Reid Hoffman...,business


In [23]:
# Build a 'combined' column by joining each row's category and title with a space.
# NOTE(review): the execution counter jumps from 13 to 23 at this cell --
# confirm the notebook still runs top-to-bottom on a fresh kernel.
news_df['combined'] = news_df[['category','title']].apply(' '.join, axis =1)
# Displays the whole frame rather than a preview.
news_df

Unnamed: 0,title,content,category,combined
0,"Rupee edges closer to 80 per US dollar, opens ...",The rupee on Friday opened at a record low of ...,business,business Rupee edges closer to 80 per US dolla...
1,Rupee drops 9 paise to close at all-time low o...,The rupee declined by 9 paise to close at a ne...,business,business Rupee drops 9 paise to close at all-t...
2,Musk accused lawyers of causing trouble by see...,"In its lawsuit against Tesla CEO Elon Musk, Tw...",business,business Musk accused lawyers of causing troub...
3,No plans for company-wide layoffs: Twitter ami...,"In a filing, Twitter has said that it isn't lo...",business,business No plans for company-wide layoffs: Tw...
4,Elon has a real record of success: LinkedIn Co...,LinkedIn's billionaire Co-founder Reid Hoffman...,business,business Elon has a real record of success: Li...
...,...,...,...,...
95,My dad likes my films when I'm not remaking hi...,Actress Sara Ali Khan recently appeared on cel...,entertainment,entertainment My dad likes my films when I'm n...
96,Bagwati in background: Abhay on 11 yrs of 'Zin...,Actor Abhay Deol shared a series of selfies on...,entertainment,entertainment Bagwati in background: Abhay on ...
97,"'Chup' is a commercial thriller, will release ...",Director R Balki has said his upcoming film 'C...,entertainment,"entertainment 'Chup' is a commercial thriller,..."
98,"Playing quintessential B'wood hero is hardest,...",Actor Ranbir Kapoor said playing a quintessent...,entertainment,entertainment Playing quintessential B'wood he...


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [24]:
# Load the Codeup blog posts through the project-local acquire module
# (its implementation is not part of this notebook).
codeup_df = acquire.get_all_codeup_blogs()

In [25]:
# Preview the first rows of the blog posts as loaded.
codeup_df.head()

Unnamed: 0,title,date,content,link
0,In-Person Workshop: Learn to Code - Python on ...,"Jun 20, 2022",In-Person Workshop: Learn to Code – Python on ...,https://codeup.com/workshops/in-person-worksho...
1,Free JavaScript Workshop at Codeup Dallas on 6...,"Jun 19, 2022",Free JavaScript Workshop at Codeup Dallas on 6...,https://codeup.com/workshops/dallas/free-javas...
2,Is Our Cloud Administration Program Right for ...,"Jun 8, 2022",Is Our Cloud Administration Program Right for ...,https://codeup.com/tips-for-prospective-studen...
3,PRIDE in Tech Panel - Codeup,"Jun 5, 2022","PRIDE in Tech Panel\nJun 5, 2022 | Dallas, San...",https://codeup.com/workshops/pride-in-tech-panel/
4,Inclusion at Codeup During Pride Month (and Al...,"Jun 1, 2022",Inclusion at Codeup During Pride Month (and Al...,https://codeup.com/codeup-news/inclusion-at-co...


### 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

#### Changes made to news_df

In [26]:
# Drop 'category' in place; it is not one of the columns requested in step 8.
# NOTE(review): in-place drop is not re-runnable -- running this cell a second
# time should raise KeyError because the column is already gone.
news_df.drop(columns = 'category', inplace=True)

In [27]:
# Rename 'content' to 'original' (in place) to match the column names requested in step 8.
news_df.rename({'content':'original'}, axis=1, inplace=True)

In [28]:
# Start the 'clean' column from the original text run through basic_clean.
news_df['clean'] = news_df.original.apply(basic_clean)

In [29]:
# Overwrite 'clean' with its tokenized form.
news_df['clean'] = news_df.clean.apply(tokenize)

In [30]:
# Overwrite 'clean' once more with stopwords removed
# (no extra_words / exclude_words are passed, so the defaults apply).
news_df['clean'] = news_df.clean.apply(remove_stopwords)

In [31]:
# Stem the cleaned text into its own column.
news_df['stemmed'] = news_df.clean.apply(stem)

In [32]:
# Lemmatize the cleaned text (not the stemmed text) into its own column.
news_df['lemmatized'] = news_df.clean.apply(lemmatize)

In [33]:
# Preview the prepared news frame with the new clean / stemmed / lemmatized columns.
news_df.head()

Unnamed: 0,title,original,combined,clean,stemmed,lemmatized
0,"Rupee edges closer to 80 per US dollar, opens ...",The rupee on Friday opened at a record low of ...,business Rupee edges closer to 80 per US dolla...,rupee friday opened record low 7994 us dollar ...,rupe friday open record low 7994 us dollar far...,rupee friday opened record low 7994 u dollar f...
1,Rupee drops 9 paise to close at all-time low o...,The rupee declined by 9 paise to close at a ne...,business Rupee drops 9 paise to close at all-t...,rupee declined 9 paise close new record low 79...,rupe declin 9 pais close new record low 7990 u...,rupee declined 9 paisa close new record low 79...
2,Musk accused lawyers of causing trouble by see...,"In its lawsuit against Tesla CEO Elon Musk, Tw...",business Musk accused lawyers of causing troub...,lawsuit tesla ceo elon musk twitter shared pur...,lawsuit tesla ceo elon musk twitter share purp...,lawsuit tesla ceo elon musk twitter shared pur...
3,No plans for company-wide layoffs: Twitter ami...,"In a filing, Twitter has said that it isn't lo...",business No plans for company-wide layoffs: Tw...,filing twitter said ' looking companywide layo...,file twitter said ' look companywid layoff cou...,filing twitter said ' looking companywide layo...
4,Elon has a real record of success: LinkedIn Co...,LinkedIn's billionaire Co-founder Reid Hoffman...,business Elon has a real record of success: Li...,linkedin ' billionaire cofounder reid hoffman ...,linkedin ' billionair cofound reid hoffman def...,linkedin ' billionaire cofounder reid hoffman ...


#### Changes made to codeup_df

In [34]:
# Drop 'date' and 'link' in place; they are not among the columns requested in step 8.
# NOTE(review): in-place drop is not re-runnable -- running this cell a second
# time should raise KeyError because the columns are already gone.
codeup_df.drop(columns= ['date','link'], inplace=True)

In [35]:
# Rename 'content' to 'original' (in place) to match the column names requested in step 8.
codeup_df.rename({'content':'original'}, axis=1, inplace=True)

In [36]:
# Build 'clean' in a single pass: normalize, tokenize, then remove stopwords
# (default stopword list) for each post's original text.
codeup_df['clean'] = codeup_df['original'].apply(
    lambda text: remove_stopwords(tokenize(basic_clean(text)))
)
# Both derived columns start from the cleaned text, not from each other.
codeup_df['stemmed'] = codeup_df['clean'].apply(stem)
codeup_df['lemmatized'] = codeup_df['clean'].apply(lemmatize)
codeup_df.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,In-Person Workshop: Learn to Code - Python on ...,In-Person Workshop: Learn to Code – Python on ...,inperson workshop learn code python 719 jun 20...,inperson workshop learn code python 719 jun 20...,inperson workshop learn code python 719 jun 20...
1,Free JavaScript Workshop at Codeup Dallas on 6...,Free JavaScript Workshop at Codeup Dallas on 6...,free javascript workshop codeup dallas 628 jun...,free javascript workshop codeup dalla 628 jun ...,free javascript workshop codeup dallas 628 jun...
2,Is Our Cloud Administration Program Right for ...,Is Our Cloud Administration Program Right for ...,cloud administration program right jun 8 2022 ...,cloud administr program right jun 8 2022 featu...,cloud administration program right jun 8 2022 ...
3,PRIDE in Tech Panel - Codeup,"PRIDE in Tech Panel\nJun 5, 2022 | Dallas, San...",pride tech panel jun 5 2022 dallas san antonio...,pride tech panel jun 5 2022 dalla san antonio ...,pride tech panel jun 5 2022 dallas san antonio...
4,Inclusion at Codeup During Pride Month (and Al...,Inclusion at Codeup During Pride Month (and Al...,inclusion codeup pride month always jun 1 2022...,inclus codeup pride month alway jun 1 2022 cod...,inclusion codeup pride month always jun 1 2022...
