In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [2]:
# Sample blog-post text (mixed case, curly quotes, en dashes, digits, punctuation) used to try out the text-prep helpers defined below.
original = '''The rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in Glassdoor’s #1 Best Job in America. Data Science is a method of providing actionable intelligence from data. The data revolution has hit San Antonio, resulting in an explosion in Data Scientist positions across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen UTSA invest $70 M for a Cybersecurity Center and School of Data Science. We built a program to specifically meet the growing demands of this industry. Our program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, who has worked at HEB, Capital Group, and Rackspace, along with input from dozens of practitioners and hiring partners. Students will work with real data sets, realistic problems, and the entire data science pipeline from collection to deployment. They will receive professional development training in resume writing, interviewing, and continuing education to prepare for a smooth transition to the workforce. We focus on applied data science for immediate impact and ROI in a business, which is how we can back it all up with a 6 month tuition refund guarantee – just like our existing Web Dev program. We’re focusing on Data Science with Python, SQL, and ML, covered in 14 modules: 1) Fundamentals; 2) Applied statistics; 3) SQL; 4) Python; 5) Supervised machine learning – regression; 6) Supervised machine learning – classification; 7) Unsupervised machine learning – clustering; 8) Time series analysis; 9) Anomaly detection; 10) Natural language processing; 11) Distributed machine learning; 12) Advanced topics (deep learning, NoSQL, cloud deployment, etc.); 13) Storytelling with data; and 14) Domain expertise development. 
Applications are now open for Codeup’s first Data Science cohort, which will start class on February 4, 2019. Hurry – there are only 25 seats available! To further our mission of cultivating inclusive growth, scholarships will be available to women, minorities, LGBTQIA+ individuals, veterans, first responders, and people relocating to San Antonio. If you want to learn about joining our program or hiring our graduates, email datascience@codeup.com!'''


In [3]:
def basic_clean(string):
    """Lowercase ``string`` and reduce it to ASCII letters, digits, apostrophes and whitespace.

    Steps, in order:
      1. lowercase the text;
      2. NFKD-normalize it so accented characters split into a base letter
         plus combining marks;
      3. round-trip through ASCII, silently dropping every non-ASCII code
         point (combining marks, curly quotes, en dashes, ...);
      4. delete every remaining character that is not ``a-z``, ``0-9``,
         a straight apostrophe or whitespace.

    Whitespace is never collapsed, so removing a character that sat between
    two spaces leaves both spaces in place.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    decomposed = unicodedata.normalize('NFKD', string.lower())
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return re.sub(r"[^a-z0-9'\s]", '', ascii_only)

In [4]:
# basic_clean(original)

In [5]:
def tokenize(string):
    """Clean ``string`` with ``basic_clean`` and tokenize it with NLTK's Toktok tokenizer.

    Parameters
    ----------
    string : str
        Raw text; it is passed through ``basic_clean`` before tokenizing.

    Returns
    -------
    str
        The tokenizer is called with ``return_str=True``, so the result is a
        single string rather than a list of tokens.
    """
    cleaned = basic_clean(string)
    return nltk.tokenize.ToktokTokenizer().tokenize(cleaned, return_str=True)

In [6]:
# tokenize(original)

In [7]:
def stem(string):
    """Return ``string`` with every token replaced by its Porter stem.

    The text is cleaned and tokenized by ``tokenize`` (which calls
    ``basic_clean`` itself, so no separate cleaning pass is needed here),
    split on whitespace, stemmed token by token and re-joined with single
    spaces.

    Parameters
    ----------
    string : str
        Raw text to stem.

    Returns
    -------
    str
        Space-separated stems; an input with no tokens yields ``''``.
    """
    tokens = tokenize(string).split()
    # Use the documented export location instead of ``nltk.porter``, which is
    # only reachable as a by-product of the package's star-imports.
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in tokens)

In [8]:
# stem(original)

In [9]:
def lemmatize(string):
    """Return ``string`` with every token replaced by its WordNet lemma.

    The text is cleaned and tokenized by ``tokenize`` (which calls
    ``basic_clean`` itself, so no separate cleaning pass is needed here),
    split on whitespace, lemmatized token by token and re-joined with single
    spaces.

    Parameters
    ----------
    string : str
        Raw text to lemmatize.

    Returns
    -------
    str
        Space-separated lemmas; an input with no tokens yields ``''``.

    Notes
    -----
    ``WordNetLemmatizer.lemmatize`` is called without a part-of-speech
    argument, so every token is handled with the lemmatizer's default POS.
    This is why the notebook output shows "has" turned into "ha".
    NOTE(review): presumably requires the NLTK ``wordnet`` corpus to be
    downloaded beforehand - confirm in the environment setup.
    """
    tokens = tokenize(string).split()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [10]:
# lemmatize(original)

In [11]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """Remove English stopwords from ``string``.

    The text is cleaned and tokenized by ``tokenize`` (which calls
    ``basic_clean`` itself), split on whitespace, filtered against the
    stopword set and re-joined with single spaces.

    Parameters
    ----------
    string : str
        Raw or already-cleaned text.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords. ``None`` (the default) adds
        nothing.
    exclude_words : iterable of str, optional
        Words to keep even if they are stopwords. Exclusions are applied
        after ``extra_words``, so a word present in both is kept. ``None``
        (the default) excludes nothing.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.

    Notes
    -----
    Tokens are lowercased by ``basic_clean`` before comparison, so entries
    in ``extra_words`` / ``exclude_words`` only match when given in lowercase.
    """
    # ``None`` defaults avoid sharing one mutable list across calls.
    words = tokenize(string).split()

    # Keep the stopwords in a set so each membership test is O(1) rather than
    # a linear scan over a list.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words or ())
    stopword_set.difference_update(exclude_words or ())

    return ' '.join(word for word in words if word not in stopword_set)

In [12]:
# Keep a cleaned copy of the sample text; the next cell feeds it to remove_stopwords.
clean = basic_clean(original)

In [13]:
# Demo: treat 'true' as an extra stopword while keeping 'the' and 'this'.
remove_stopwords(
    clean,
    extra_words=['true'],
    exclude_words=['the', 'this'],
)

'the rumors the time arrived codeup officially opened applications new data science career accelerator 25 seats available this immersive program one kind san antonio help land job glassdoors 1 best job america data science method providing actionable intelligence data the data revolution hit san antonio resulting explosion data scientist positions across companies like usaa accenture booz allen hamilton heb weve even seen utsa invest 70 cybersecurity center school data science built program specifically meet the growing demands this industry program 18 weeks long fulltime handson projectbased curriculum development instruction led senior data scientist maggie giust worked heb capital group rackspace along input dozens practitioners hiring partners students work real data sets realistic problems the entire data science pipeline collection deployment receive professional development training resume writing interviewing continuing education prepare smooth transition the workforce focus ap

In [14]:
# Build the article frame through the project-local `acquire` module.
# NOTE(review): `acquire` is not visible from this notebook; the displayed
# frame shows `title` and `content` columns - confirm the schema in acquire.py.
urls = acquire.get_codeup_url_list()
df = acquire.get_blog_articles(urls)
df

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie GiustData Scien...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri AntoniouA week ago, Codeup launched..."
3,10 Tips to Crush It at the SA Tech Job Fair - ...,10 Tips to Crush It at the SA Tech Job FairSA ...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


In [15]:
def prepare_article_data(df):
    """Add ``stemmed``, ``lemmatized`` and ``clean`` text columns to ``df``.

    * ``stemmed``    - ``stem`` applied to ``content``
    * ``lemmatized`` - ``lemmatize`` applied to ``content``
    * ``clean``      - ``remove_stopwords`` applied to ``lemmatized``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column holding the article text.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in; the columns are assigned
        in place, not on a copy.
    """
    # (new column, source column, transform) - order matters because
    # ``clean`` is derived from ``lemmatized``.
    derived_columns = (
        ('stemmed', 'content', stem),
        ('lemmatized', 'content', lemmatize),
        ('clean', 'lemmatized', remove_stopwords),
    )
    for target, source, transform in derived_columns:
        df[target] = df[source].apply(transform)
    return df

In [18]:
# Add the stemmed / lemmatized / clean columns. prepare_article_data assigns
# them on the frame it receives and returns that same frame.
df = prepare_article_data(df)
df

Unnamed: 0,title,content,stemmed,lemmatized,clean
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...,rumor true time ha arrived codeup ha officiall...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie GiustData Scien...,by dimitri antoni and maggi giustdata scienc b...,by dimitri antoniou and maggie giustdata scien...,dimitri antoniou maggie giustdata science big ...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri AntoniouA week ago, Codeup launched...",by dimitri antonioua week ago codeup launch ou...,by dimitri antonioua week ago codeup launched ...,dimitri antonioua week ago codeup launched imm...
3,10 Tips to Crush It at the SA Tech Job Fair - ...,10 Tips to Crush It at the SA Tech Job FairSA ...,10 tip to crush it at the sa tech job fairsa t...,10 tip to crush it at the sa tech job fairsa t...,10 tip crush sa tech job fairsa tech job fairt...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...,competitor bootcamps closing model danger prog...


In [17]:
# Show the raw text of the article stored under index label 2.
df['content'][2]

'By Dimitri AntoniouA week ago, Codeup launched our immersive Data Science career accelerator! With our first class kicking off in February and only 25 seats available, we’ve been answering a lot of questions from prospective students. One in particular has come up so many times we decided to dedicate a blog post to it. What is the difference between data science and data analytics?First, let’s define some of our terms! Take a look at this blog to understand what Data Science is. In short, it is a method of turning raw data into action, leading to a desired outcome. Big Data refers to data sets that are large and complex, usually exceeding the capacity of computers and normal processing power to deal with. Machine Learning is the process of ‘learning’ underlying patterns of data in order to automate the extraction of intelligence from that data.Now, let’s look at the data pipeline that data scientists work through to reach the actionable insights and outcomes we mentioned:We start by c