In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

from acquire import * 

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

> * Lowercase everything
> * Normalize unicode characters
> * Remove anything that is not a letter, number, whitespace or a single quote.


In [2]:
# Sample sentence reused by the demo calls in the cells below.
test = 'real words here'

In [3]:
def basic_clean(s: str):
    '''Apply basic text normalisation to a string.

    Steps, in order:
      1. Lowercase the text.
      2. Decompose unicode characters (NFKD) and drop every code point
         that has no ASCII representation (accents, curly quotes, ...).
      3. Remove every character that is not a lowercase letter, a digit,
         whitespace or a single quote (').

    Returns the cleaned string.
    '''
    lowered = s.lower()
    decomposed = unicodedata.normalize('NFKD', lowered)
    # Round-trip through ASCII bytes to discard the non-ASCII code points.
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return re.sub(r"[^a-z0-9'\s]", '', ascii_only)

    

### 2. Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.

In [4]:
def tokenize(s: str):
    '''Split a string into word tokens.

    Builds an NLTK ``ToktokTokenizer`` and returns whatever its
    ``tokenize`` method produces for ``s`` (a list of tokens).
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(s)

# Quick check: clean the sample sentence, then tokenize it.
tokenize(basic_clean(test))

['real', 'words', 'here']

### 3. Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.

In [5]:
def stem(s: str) -> str:
    '''Stem every word of a string with NLTK's Porter stemmer.

    Parameters
    ----------
    s : str
        Text to stem. It is split on whitespace; each piece is stemmed
        independently.

    Returns
    -------
    str
        The stemmed words joined back together with single spaces
        (an empty string when ``s`` contains no words).
    '''
    # Use the documented ``nltk.stem`` path (the same package ``lemmatize``
    # uses) instead of ``nltk.porter``, which is only reachable through
    # star-imports inside NLTK and is not a documented alias.
    stemmer = nltk.stem.PorterStemmer()

    return ' '.join(stemmer.stem(word) for word in s.split())

# Run the full clean -> tokenize -> stem chain on the sample sentence.
# NOTE: this rebinds `test` to the stemmed text, so the later demo calls
# (lemmatize, remove_stopwords) operate on the stemmed sentence, and
# re-running this cell stems the already-stemmed text again.
test = stem(' '.join(tokenize(basic_clean(test))))
test

'real word here'

### 4. Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [6]:
def lemmatize(s: str):
    '''Lemmatize every whitespace-separated word of a string.

    Each word is passed to NLTK's ``WordNetLemmatizer``.

    Note: the result is a *list* of lemmas (one per word, in order),
    not a re-joined string.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()

    lemmas = []
    for token in s.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas
    
# Demo on `test`, which the stem cell above rebound to the stemmed sentence.
lemmatize(test)

['real', 'word', 'here']

### 5. Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.

### This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [7]:
def remove_stopwords(s: str, extra_words=None, exclude_words=None) -> list:
    '''Drop stopwords from a string.

    Parameters
    ----------
    s : str
        Text to filter. It is split on whitespace and compared word by
        word (case-sensitively) against the stopword set.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words that must NOT be treated as stopwords. Entries that are not
        stopwords in the first place are ignored. Exclusion is applied
        after ``extra_words``, so it wins when a word appears in both.

    Returns
    -------
    list
        The remaining words, in their original order (a list, not a
        re-joined string).
    '''
    # Start from NLTK's English stopwords; a set gives O(1) membership tests.
    stopword_set = set(stopwords.words('english'))

    # ``None`` defaults avoid shared mutable default arguments.
    stopword_set.update(extra_words or ())

    # difference_update tolerates words that are absent from the set,
    # whereas list.remove() raised ValueError for them.
    stopword_set.difference_update(exclude_words or ())

    return [word for word in s.split() if word not in stopword_set]
    
# Demo on `test`, which the stem cell above rebound to the stemmed sentence.
remove_stopwords(test)

['real', 'word']

### 6. Use the data returned by your `acquire` module to produce a dataframe of the news articles. Name the dataframe `news_df`.

In [8]:
# Fetch the news articles as a DataFrame and preview the first rows.
# NOTE(review): get_news_articles is not defined in this notebook —
# presumably it is pulled in by `from acquire import *`; confirm.
news_df = get_news_articles()
news_df.head()

Unnamed: 0_level_0,parsed,news_articles,author,date
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://inshorts.com/en/news/airstrike-hits-capital-of-ethiopias-tigray-3-killed-report-1635433206925,True,"An Ethiopian airstrike reportedly hit Mekelle,...",Dharini MudgalDharini Mudgal,28 Oct
https://inshorts.com/en/read/technology,True,Facebook on Thursday announced it's changing t...,Pragya SwastikPragya SwastikAnkush VermaAnkush...,29 Oct28 Oct29 Oct28 Oct28 Oct29 Oct28 Oct29 O...
https://inshorts.com/en/read/startup,True,The combined net worth of the world's two rich...,Kiran KhatriKiran KhatriAnkush VermaAnkush Ver...,28 Oct28 Oct27 Oct27 Oct27 Oct28 Oct28 Oct27 O...
https://inshorts.com/prev/en/news/airstrike-hits-capital-of-ethiopias-tigray-3-killed-report-1635433206925,True,After Aryan Khan was granted bail in the Mumba...,Kriti KambiriKriti Kambiri,28 Oct
https://inshorts.com/en/read/science,True,"'Star Trek' actor William Shatner, who became ...",Daisy MowkeDaisy MowkeAnkush VermaAnkush Verma...,16 Oct23 Oct28 Oct20 Oct21 Oct21 Oct29 Oct21 O...


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.

In [13]:
# Fetch the Codeup blog posts as a DataFrame and preview the first rows.
# NOTE(review): get_codeup_blogs is not defined in this notebook —
# presumably it is pulled in by `from acquire import *`; confirm.
codeup_df = get_codeup_blogs()
codeup_df.head()

Unnamed: 0_level_0,parsed,original,title
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://codeup.com/blog/,True,"Oct 28, 2021 | IT Training, Tips for Prospecti...",Blog - Codeup
https://codeup.com/codeup-news/codeup-acquires-rackspace-cloud-academy/,True,"Apr 16, 2021 | Codeup NewsWe are thrilled to o...",Codeup Acquires Rackspace Cloud Academy! - Codeup
https://codeup.com/codeup-news/start-a-new-career-with-vet-tec/,True,"Apr 5, 2021 | Codeup NewsAre you a veteran loo...",Start a New Career with VET TEC! - Codeup
https://codeup.com/blog/page/2/?et_blog,True,"Oct 3, 2021 | Behind the BillboardsSep 16, 202...",Blog - Codeup
https://codeup.com/,True,Go from no experience to employed tech profess...,Best Tech Career Accelerator in Texas - Codeup
