In [1]:
import unicodedata
import re
import json
import bs4

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire_ryan

### Exercises
The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

#### 1) Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(string):
    
    features="lxml"
        
    original = string
    article = original.lower()
    
    # Remove inconsistencies in unicode character encoding.
    # encode the strings into ASCII bytestrings (ignore non-ASCII characters)
    # decode the bytestring into (Unicode) string

    article = unicodedata.normalize('NFKD', article)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    
    article = re.sub(r"[^a-z0-9'\s]", '', article)
    #article = article.replace('\n', ' ')
    
    
    return article

In [3]:
string = 'Writers write descriptive paragraphs because their purpose is to describe something. Their point is that something is beautiful or disgusting or strangely intriguing. Writers write persuasive and argument paragraphs because their purpose is to persuade or convince someone. Their point is that their reader should see things a particular way and possibly take action on that new way of seeing things. Writers write paragraphs of comparison because the comparison will make their point clear to their readers.'

In [4]:
article = basic_clean(string)

In [5]:
article

'writers write descriptive paragraphs because their purpose is to describe something their point is that something is beautiful or disgusting or strangely intriguing writers write persuasive and argument paragraphs because their purpose is to persuade or convince someone their point is that their reader should see things a particular way and possibly take action on that new way of seeing things writers write paragraphs of comparison because the comparison will make their point clear to their readers'

def basic_clean():
    
    features="lxml"
    url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
    
    output = acquire_ryan.get_codeup_blog_output(url)
    article = output.lower()
    
    # Remove inconsistencies in unicode character encoding.
    # encode the strings into ASCII bytestrings (ignore non-ASCII characters)
    # decode the bytestring into (Unicode) string

    article = unicodedata.normalize('NFKD', article)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    
    article = re.sub(r"[^a-z0-9'\s]", '', article)
    #article = article.replace('\n', ' ')
    
    
    return article

------

### Define a function named tokenize. It should take in a string and tokenize all the words in the string.

----

In [6]:
# Create the tokenizer
tokenizer = nltk.tokenize.ToktokTokenizer()

# Use the tokenizer
#article = tokenizer.tokenize(article, return_str = True)

In [7]:
def tokenize(string):
    
    ''' This function utilizes tokenizer tool and returns a transformed string '''
    
    article = basic_clean(string)
    
    article = tokenizer.tokenize(article, return_str = True)
    
    return article

In [8]:
article = tokenize(string)

In [9]:
article

'writers write descriptive paragraphs because their purpose is to describe something their point is that something is beautiful or disgusting or strangely intriguing writers write persuasive and argument paragraphs because their purpose is to persuade or convince someone their point is that their reader should see things a particular way and possibly take action on that new way of seeing things writers write paragraphs of comparison because the comparison will make their point clear to their readers'

-----------------

## Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

-----------

In [10]:
# Create porter stemmer.

ps = nltk.porter.PorterStemmer()

In [11]:
def stem(string):
    
    '''this function utilized the Porter Stemmer and returns a string '''
    
    article = tokenize(string)
    
    stems = [ps.stem(word) for word in article.split()]
    print(stems[:10])
    
    article_stemmed = ' '.join(stems)
    
    return article_stemmed

In [12]:
article_stemmed = stem(string)

['writer', 'write', 'descript', 'paragraph', 'becaus', 'their', 'purpos', 'is', 'to', 'describ']


In [13]:
article_stemmed

'writer write descript paragraph becaus their purpos is to describ someth their point is that someth is beauti or disgust or strang intrigu writer write persuas and argument paragraph becaus their purpos is to persuad or convinc someon their point is that their reader should see thing a particular way and possibl take action on that new way of see thing writer write paragraph of comparison becaus the comparison will make their point clear to their reader'

--------

### Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

________

In [14]:
# Create the Lemmatizer.

wnl = nltk.stem.WordNetLemmatizer()

In [15]:
# Check lemmatizer. It works.

wnl.lemmatize('influenced')

'influenced'

In [16]:
# make a function

def lemmatize(string):
    
    ''' This function takes in tokenized content and the applicted lemmatization. It returns content that has been
    transformed by lemmatize '''
    
    article = tokenize(string)
    
    lemmas = [wnl.lemmatize(word) for word in article.split()]
    print(lemmas[:10])
    
    article_lemmatized = ' '.join(lemmas)
    
    return article_lemmatized

In [17]:
article_lemmatized = lemmatize(string)

['writer', 'write', 'descriptive', 'paragraph', 'because', 'their', 'purpose', 'is', 'to', 'describe']


In [18]:
article_lemmatized

'writer write descriptive paragraph because their purpose is to describe something their point is that something is beautiful or disgusting or strangely intriguing writer write persuasive and argument paragraph because their purpose is to persuade or convince someone their point is that their reader should see thing a particular way and possibly take action on that new way of seeing thing writer write paragraph of comparison because the comparison will make their point clear to their reader'

------------

### Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

-----------

In [19]:
# standard English language stopwords list from nltk
from nltk.corpus import stopwords

In [33]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))
    
    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords
    

In [34]:
article_without_stopwords = remove_stopwords(string, extra_words = ['no'], exclude_words=['\n', 'o', "'", 'ha'])

In [35]:
article_without_stopwords

'Writers write descriptive paragraphs purpose describe something. Their point something beautiful disgusting strangely intriguing. Writers write persuasive argument paragraphs purpose persuade convince someone. Their point reader see things particular way possibly take action new way seeing things. Writers write paragraphs comparison comparison make point clear readers.'

---------

### Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

----------

In [23]:
codeup_df = acquire_ryan.acquire_codeup_blog()



  soup = BeautifulSoup(response.text)


---------

### Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

--------

In [24]:
news_df = acquire_ryan.acquire_news_articles()



  soup = BeautifulSoup(response.text)


In [25]:
news_df.head()

Unnamed: 0,title,content,category
0,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",business
1,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,business
2,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,business
3,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",business
4,M&M advances annual maintenance plant shutdown...,Mahindra & Mahindra (M&M) said that it has adv...,business


-------------

### For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.


-------------

In [26]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df[['title', column, 'clean', 'stemmed', 'lemmatized']]