In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
import acquire

In [3]:
def basic_clean(string):
    """
    Normalize raw text so it is ready for tokenizing, stemming and lemmatizing.

    Steps, in order:
      1. Lowercase the string.
      2. Normalize unicode characters (NFKD) and drop whatever cannot be
         represented in ASCII.
      3. Remove anything that is not a letter, number, whitespace or a
         single quote.
      4. Collapse every run of whitespace (spaces, tabs, newlines) into a
         single space.
      5. Strip leading and trailing whitespace.

    Parameters
    ----------
    string : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, single-spaced, lowercase ASCII text.
    """
    string = string.lower()
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # remove anything not a space character, an apostrophe, letter, or number
    string = re.sub(r"[^a-z0-9'\s]", '', string)

    # Collapse whitespace runs into one space. The previous pattern,
    # [\r|\n|\r\n]+, was a character class (so "|" was a literal member, not
    # alternation) and it never matched tabs, leaving them in the output.
    string = re.sub(r'\s+', ' ', string)

    return string.strip()

In [7]:
# Uppercase input is lowercased and the apostrophe is preserved.
basic_clean("HOWDY's")

"howdy's"

In [8]:
# Accented letters are reduced to their ASCII base letter; the "ł" disappears
# from the result entirely (six characters in, five out).
basic_clean("ïñłińê")

'inine'

In [9]:
# Contractions keep their apostrophe.
basic_clean("I'm")

"i'm"

In [12]:
def tokenize(string):
    """
    Tokenize text with NLTK's ToktokTokenizer.

    Parameters
    ----------
    string : str
        The text to tokenize.

    Returns
    -------
    str
        The tokens joined back into one space-separated string (not a list),
        because the tokenizer is called with return_str=True.
    """
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [13]:
# Punctuation is split off into its own tokens; case is left untouched.
tokenize("Hello, World!")

'Hello , World !'

In [14]:
# An email address comes back as a single token.
tokenize("email@whatever.com")

'email@whatever.com'

In [15]:
# A hyphenated year range comes back as a single token.
tokenize("2018-19")

'2018-19'

In [16]:
def stem(string):
    """
    Apply the Porter stemmer to every whitespace-separated word.

    Parameters
    ----------
    string : str
        Text whose words are separated by whitespace.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(map(stemmer.stem, string.split()))

In [21]:
# Stems need not be real words: "running" -> "run", but "house" -> "hous".
stem("running into a house")

'run into a hous'

In [18]:
# Gotcha: "was" loses its trailing "s" and becomes the non-word "wa".
stem("was")

'wa'

In [30]:
# "were" passes through the stemmer unchanged.
stem("were")

'were'

In [24]:
def lemmatize(string):
    """
    Apply the WordNet lemmatizer to every whitespace-separated word.

    Parameters
    ----------
    string : str
        Text whose words are separated by whitespace.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

In [25]:
# Unlike stem(), lemmatize() returns this sentence unchanged.
# NOTE(review): presumably because no part-of-speech is passed to the
# lemmatizer, so "running" is not treated as a verb — confirm.
lemmatize("running into a house")

'running into a house'

In [26]:
# Gotcha: "was" comes back as the non-word "wa", same as with stem().
lemmatize("was")

'wa'

In [31]:
# "were" passes through the lemmatizer unchanged.
lemmatize("were")

'were'

In [49]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from a string.

    The string is tokenized first, then every token found in the stopword
    set is dropped and the remaining tokens are joined with single spaces.

    Parameters
    ----------
    string : str
        The text to filter.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to take out of the standard stopword list, i.e. words to keep.

    Returns
    -------
    str
        The text with stopwords removed.

    Notes
    -----
    Exclusions are applied before additions, so a word present in both
    ``exclude_words`` and ``extra_words`` is still removed from the text.
    """
    # None sentinels replace the former mutable list defaults, which would be
    # shared across calls; passing lists explicitly still works as before.
    words_to_drop = set(exclude_words or ())
    words_to_add = set(extra_words or ())

    # Tokenize so punctuation is separated from words before matching.
    tokens = tokenize(string).split()

    stopword_set = (set(stopwords.words('english')) - words_to_drop) | words_to_add

    return " ".join(token for token in tokens if token not in stopword_set)

In [42]:
# Toy stopword list used to walk through the set logic in remove_stopwords().
# Note the name: this is `stopwords_list`; the cells below build a differently
# spelled `stopword_list` from it.
stopwords_list = ["a", "an", "the", "bob", "jane"]

In [43]:
# Words to take OUT of the stopword list (i.e. words that should be kept).
words_to_exclude = ["an", "bob"]

In [53]:
# Set difference drops the excluded words from the toy stopword list.
stopword_list = set(stopwords_list) - set(words_to_exclude)
stopword_list

{'a', 'jane', 'the'}

In [55]:
# Union adds the user-specified extra words to the remaining stopwords.
# This cell reads `stopword_list` from the previous cell and rebinds it.
extra_words = ["codeup"]
stopword_list = stopword_list.union(set(extra_words))
stopword_list

{'a', 'codeup', 'jane', 'the'}

In [56]:
# Scratch check: union keeps every element from both sets.
{1, 2, 3}.union({5, 6, 7})

{1, 2, 3, 5, 6, 7}

In [57]:
# Scratch check: difference keeps only elements of the left set that are
# absent from the right set.
{1, 2, 3} - {3, 4, 5}

{1, 2}

In [58]:
# NOTE(review): `a` is not referenced by any later cell shown here; this looks
# like leftover scratch work — confirm and delete.
a = {1, 2}

In [59]:
def prep_articles(df):
    """
    Add prepared-text columns to a dataframe of articles.

    The ``body`` column is replaced by four columns, appended in this order:

    - ``original``: the untouched body text
    - ``stemmed``: basic_clean -> stem
    - ``lemmatized``: basic_clean -> lemmatize
    - ``clean``: basic_clean -> remove_stopwords

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``body`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object that was passed in. It is modified in
        place: the new columns are added and ``body`` is dropped.
    """
    # Clean once and reuse the result; previously basic_clean was re-applied
    # to the whole column for each of the three derived columns.
    cleaned = df["body"].apply(basic_clean)

    df["original"] = df["body"]
    df["stemmed"] = cleaned.apply(stem)
    df["lemmatized"] = cleaned.apply(lemmatize)
    df["clean"] = cleaned.apply(remove_stopwords)

    # Kept in place so callers holding a reference to `df` see the same
    # result as before.
    df.drop(columns=["body"], inplace=True)
    return df

In [60]:
def prep_blog_posts():
    """
    Fetch the blog posts via acquire.get_blog_posts() and return them
    after running them through prep_articles().
    """
    return prep_articles(acquire.get_blog_posts())

In [61]:
def prep_news_articles():
    """
    Fetch the news articles via acquire.get_news_articles() and return them
    after running them through prep_articles().
    """
    return prep_articles(acquire.get_news_articles())

In [62]:
def prep_corpus():
    """
    Prepare both corpora and label each with its source.

    Returns
    -------
    tuple
        ``(blog_df, news_df)``: the prepared blog posts with a ``source``
        column of "Codeup Blog", and the prepared news articles with a
        ``source`` column of "InShorts News".
    """
    # Blog posts are prepared first, then news articles.
    blog_df, news_df = prep_blog_posts(), prep_news_articles()

    blog_df["source"] = "Codeup Blog"
    news_df["source"] = "InShorts News"

    return blog_df, news_df

In [63]:
# Run the full pipeline. prep_corpus() returns (blog frame, news frame), and
# both are loaded through the local `acquire` module.
codeup_df, news_df = prep_corpus()

In [65]:
# Preview the prepared blog posts: original text next to the stemmed,
# lemmatized and stopword-free versions.
codeup_df.head()

Unnamed: 0.1,Unnamed: 0,title,original,stemmed,lemmatized,clean,source
0,0,Codeup’s Data Science Career Accelerator is He...,\nThe rumors are true! The time has arrived. C...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...,rumors true time arrived codeup officially ope...,Codeup Blog
1,1,Data Science Myths - Codeup,\nBy Dimitri Antoniou and Maggie Giust\nData S...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...,dimitri antoniou maggie giust data science big...,Codeup Blog
2,2,Data Science VS Data Analytics: What’s The Dif...,"\nBy Dimitri Antoniou\nA week ago, Codeup laun...",by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...,dimitri antoniou week ago codeup launched imme...,Codeup Blog
3,3,10 Tips to Crush It at the SA Tech Job Fair - ...,\n10 Tips to Crush It at the SA Tech Job Fair\...,10 tip to crush it at the sa tech job fair sa ...,10 tip to crush it at the sa tech job fair sa ...,10 tips crush sa tech job fair sa tech job fai...,Codeup Blog
4,4,Competitor Bootcamps Are Closing. Is the Model...,\nCompetitor Bootcamps Are Closing. Is the Mod...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...,competitor bootcamps closing model danger prog...,Codeup Blog


In [67]:
# Preview the prepared news articles. Note the stray "'" token in the `clean`
# column where a possessive ("Google's") was tokenized before stopword removal.
news_df.head()

Unnamed: 0.1,Unnamed: 0,title,category,author,published_date,original,stemmed,lemmatized,clean,source
0,0,"8, 7, 6.6, 5.8, 5 & 4.5 is the state of econom...",business,Pragya Swastik,2019-12-05T10:41:04.000Z,Former Finance Minister P Chidambaram on Thurs...,former financ minist p chidambaram on thursday...,former finance minister p chidambaram on thurs...,former finance minister p chidambaram thursday...,InShorts News
1,1,Sundar Pichai rejected Google shares worth mil...,business,Kanishka Pandey,2019-12-06T03:32:57.000Z,Google's 47-year-old India-born CEO Sundar Pic...,google' 47yearold indiaborn ceo sundar pichai ...,google's 47yearold indiaborn ceo sundar pichai...,google ' 47yearold indiaborn ceo sundar pichai...,InShorts News
2,2,"We are the same animal, we are both a little c...",business,Krishna Veera Vanamali,2019-12-06T16:27:31.000Z,SoftBank CEO Masayoshi Son has said the decisi...,softbank ceo masayoshi son ha said the decis t...,softbank ceo masayoshi son ha said the decisio...,softbank ceo masayoshi son said decision inves...,InShorts News
3,3,Gut feeling drove me to invest $20M in Alibaba...,business,Kanishka Pandey,2019-12-06T11:27:06.000Z,"SoftBank Founder and CEO Masayoshi Son, in a d...",softbank founder and ceo masayoshi son in a di...,softbank founder and ceo masayoshi son in a di...,softbank founder ceo masayoshi son discussion ...,InShorts News
4,4,Maharashtra govt suggests merger of PMC Bank w...,business,Krishna Veera Vanamali,2019-12-05T12:42:14.000Z,In a bid to provide relief to depositors of sc...,in a bid to provid relief to depositor of scam...,in a bid to provide relief to depositor of sca...,bid provide relief depositors scamhit punjab m...,InShorts News
