In [1]:
import pandas as pd
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

import acquire

Get Data

In [None]:
# Load the blog articles through the project-local `acquire` module and
# display the resulting frame as this cell's output.
blog = acquire.get_blog_articles()
blog

Remove newline characters

In [3]:
# Replace embedded newlines with spaces so each article reads as one line.
# The vectorized .str accessor (regex=False -> literal match) performs the same
# substitution as a per-row lambda, and it passes missing values through
# instead of raising AttributeError on them.
for col in blog:
    blog[col] = blog[col].str.replace('\n', ' ', regex=False)
blog

Unnamed: 0,content,title
0,The rumors are true! The time has arrived. Co...,Codeup’s Data Science Career Accelerator is He...
1,By Dimitri Antoniou and Maggie Giust Data Sci...,Data Science Myths - Codeup
2,"By Dimitri Antoniou A week ago, Codeup launch...",Data Science VS Data Analytics: What’s The Dif...
3,10 Tips to Crush It at the SA Tech Job Fair S...,10 Tips to Crush It at the SA Tech Job Fair - ...
4,Competitor Bootcamps Are Closing. Is the Mode...,Competitor Bootcamps Are Closing. Is the Model...


1) Convert all text to lowercase for normalization.
 

In [4]:
# Lowercase the text of every column, writing the result back into `blog`.
for column_name in blog.columns:
    lowered = blog[column_name].str.lower()
    blog[column_name] = lowered
blog

Unnamed: 0,content,title
0,the rumors are true! the time has arrived. co...,codeup’s data science career accelerator is he...
1,by dimitri antoniou and maggie giust data sci...,data science myths - codeup
2,"by dimitri antoniou a week ago, codeup launch...",data science vs data analytics: what’s the dif...
3,10 tips to crush it at the sa tech job fair s...,10 tips to crush it at the sa tech job fair - ...
4,competitor bootcamps are closing. is the mode...,competitor bootcamps are closing. is the model...


2) Strip accents from characters and drop any remaining non-ASCII characters.

3) Remove special characters.

In [5]:
def _strip_to_ascii(text):
    """Decompose `text` with NFKD, then drop every byte that is not ASCII."""
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')


def _drop_special_characters(text):
    """Keep only lowercase letters, digits, apostrophes and whitespace."""
    return re.sub(r"[^a-z0-9'\s]", '', text)


# Apply both steps, in order, to every column of `blog`.
for column_name in blog.columns:
    ascii_text = blog[column_name].apply(_strip_to_ascii)
    blog[column_name] = ascii_text.apply(_drop_special_characters)
blog

Unnamed: 0,content,title
0,the rumors are true the time has arrived code...,codeups data science career accelerator is her...
1,by dimitri antoniou and maggie giust data sci...,data science myths codeup
2,by dimitri antoniou a week ago codeup launche...,data science vs data analytics whats the diffe...
3,10 tips to crush it at the sa tech job fair s...,10 tips to crush it at the sa tech job fair c...
4,competitor bootcamps are closing is the model...,competitor bootcamps are closing is the model ...


4) Tokenize words

In [6]:
# ToktokTokenizer is already imported by name at the top of the notebook.
tokenizer = ToktokTokenizer()


def _tokenize_to_string(text):
    """Tokenize `text` and return the tokens re-joined as a single string."""
    return tokenizer.tokenize(text, return_str=True)


# Tokenize every column of `blog`, keeping each cell a plain string.
for column_name in blog.columns:
    blog[column_name] = blog[column_name].apply(_tokenize_to_string)
blog

Unnamed: 0,content,title
0,the rumors are true the time has arrived codeu...,codeups data science career accelerator is her...
1,by dimitri antoniou and maggie giust data scie...,data science myths codeup
2,by dimitri antoniou a week ago codeup launched...,data science vs data analytics whats the diffe...
3,10 tips to crush it at the sa tech job fair sa...,10 tips to crush it at the sa tech job fair co...
4,competitor bootcamps are closing is the model ...,competitor bootcamps are closing is the model ...


5) Lemmatize the words.


In [7]:
wn1 = nltk.stem.WordNetLemmatizer()


def _lemmatize_text(text):
    """Lemmatize each whitespace-separated word and re-join with spaces."""
    lemmas = (wn1.lemmatize(token) for token in text.split())
    return ' '.join(lemmas)


# Lemmatize every column of `blog`.
for column_name in blog.columns:
    blog[column_name] = blog[column_name].apply(_lemmatize_text)
blog

Unnamed: 0,content,title
0,the rumor are true the time ha arrived codeup ...,codeups data science career accelerator is her...
1,by dimitri antoniou and maggie giust data scie...,data science myth codeup
2,by dimitri antoniou a week ago codeup launched...,data science v data analytics whats the differ...
3,10 tip to crush it at the sa tech job fair sa ...,10 tip to crush it at the sa tech job fair codeup
4,competitor bootcamps are closing is the model ...,competitor bootcamps are closing is the model ...


6) Stem words.

In [8]:
# Stem a single article (the first row of the content column) as an example.
one_post = blog['content'][0]
ps = nltk.porter.PorterStemmer()
stemmed_words = map(ps.stem, one_post.split())
stems = ' '.join(stemmed_words)
stems


'the rumor are true the time ha arriv codeup ha offici open applic to our new data scienc career acceler with onli 25 seat avail thi immers program is one of a kind in san antonio and will help you land a job in glassdoor 1 best job in america data scienc is a method of provid action intellig from data the data revolut ha hit san antonio result in an explos in data scientist posit across compani like usaa accentur booz allen hamilton and heb weve even seen utsa invest 70 m for a cybersecur center and school of data scienc we built a program to specif meet the grow demand of thi industri our program will be 18 week long fulltim handson and projectbas our curriculum develop and instruct is led by senior data scientist maggi giust who ha work at heb capit group and rackspac along with input from dozen of practition and hire partner student will work with real data set realist problem and the entir data scienc pipelin from collect to deploy they will receiv profession develop train in resu

7) Remove stopwords

In [9]:
# 'ha' is what the lemmatizer produced from "has" in the frame above; check
# whether NLTK's English stopword list would catch it (it does not: False).
'ha' in stopwords.words('english')


False

In [10]:
# Drop every word of the example article that appears in NLTK's English
# stopword list, then re-join the survivors with single spaces.
sw_list = stopwords.words('english')
words = one_post.split()
kept_words = [token for token in words if token not in sw_list]
' '.join(kept_words)

'rumor true time ha arrived codeup ha officially opened application new data science career accelerator 25 seat available immersive program one kind san antonio help land job glassdoors 1 best job america data science method providing actionable intelligence data data revolution ha hit san antonio resulting explosion data scientist position across company like usaa accenture booz allen hamilton heb weve even seen utsa invest 70 cybersecurity center school data science built program specifically meet growing demand industry program 18 week long fulltime handson projectbased curriculum development instruction led senior data scientist maggie giust ha worked heb capital group rackspace along input dozen practitioner hiring partner student work real data set realistic problem entire data science pipeline collection deployment receive professional development training resume writing interviewing continuing education prepare smooth transition workforce focus applied data science immediate im

8) Prepare the data with the helper functions from `prepare`

In [11]:
from prepare import remove_non_ascii, remove_special_characters, tokenize, lemmatize, stem, remove_stopwords

In [12]:
# The helpers imported from `prepare`, in the order they are applied to
# every cell of every column.
cleaning_steps = (
    remove_non_ascii,
    remove_special_characters,
    tokenize,
    lemmatize,
    remove_stopwords,
)

for column_name in blog.columns:
    cleaned = blog[column_name]
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)
    blog[column_name] = cleaned

    

In [13]:
from prepare import basic_clean
import acquire

In [None]:
# Rebind `blog` to a fresh result of acquire.get_blog_articles(), replacing the
# frame whose columns were overwritten step by step in the cells above.
blog = acquire.get_blog_articles()


In [None]:
# `clean_dataframe` is never defined or imported in this notebook, so the
# original call raised NameError on a fresh kernel. `basic_clean` (imported
# from `prepare` in the cell above) is the cleaning entry point used instead.
# NOTE(review): assumes prepare.basic_clean accepts a DataFrame, like the
# notebook-local basic_clean defined later in this notebook — confirm.
basic_clean(blog)

In [None]:
def basic_clean(df, stem_or_lem = 'lemmatize'):
    """
    Run the text-cleaning pipeline over every column of `df`.

    Each column is overwritten in place on `df`, and the same DataFrame
    object is returned. Steps, in order, for each column:

    1. Collapse each run of carriage returns / line feeds into one space.
    2. Lowercase the text.
    3. remove_non_ascii
    4. remove_special_characters
    5. tokenize
    6. lemmatize or stem, chosen by `stem_or_lem`
    7. remove_stopwords

    Steps 3-7 are helpers imported from the project's `prepare` module; their
    exact behaviour is defined there, not in this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all hold strings (every column is processed).
    stem_or_lem : str, default 'lemmatize'
        'lemmatize' applies `lemmatize`; 'stem' applies `stem`. Any other
        value skips this step entirely and no error is raised.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with its columns replaced by the cleaned text.
    """
    for col in df:
        # '|' inside a character class is a literal pipe, not alternation, so
        # the previous pattern r'[\r|\n|\r\n]+' also turned pipes into spaces.
        # Only CR and LF belong in the class.
        df[col] = df[col].apply(lambda x: re.sub(r'[\r\n]+', ' ', x))
        # Step 1 of the notebook's pipeline. Without it the text reaches the
        # stopword filter with capitalised words such as "The" intact.
        df[col] = df[col].str.lower()
        df[col] = df[col].apply(remove_non_ascii)
        df[col] = df[col].apply(remove_special_characters)
        df[col] = df[col].apply(tokenize)
        if stem_or_lem == 'lemmatize':
            df[col] = df[col].apply(lemmatize)
        elif stem_or_lem == 'stem':
            df[col] = df[col].apply(stem)
        df[col] = df[col].apply(remove_stopwords)
    return df

In [None]:
# Run the cleaning pipeline with the default stem_or_lem='lemmatize'.
# basic_clean overwrites the columns of `blog` in place and returns that same
# frame, which is displayed as this cell's output.
basic_clean(blog)

In [None]:
# Standalone check of the tokenizer on a short literal. This call omits
# return_str=True, unlike the column-wise tokenization earlier in the notebook.
tokenizer = ToktokTokenizer()
tokenizer.tokenize('hello world')