In [36]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
import acquire
from time import strftime

In [37]:
# Load the posts through the acquire module imported above and preview the first rows.
df_posts = acquire.get_all_posts()
df_posts.head()

Using cached version of posts


Unnamed: 0,title,date,body
0,Learn to Code: Python Workshop on 4/23,"Mar 31, 2022","According to LinkedIn, the “#1 Most Promising ..."
1,Coming Soon: Cloud Administration,"Mar 17, 2022",We’re launching a new program out of San Anton...
2,5 Books Every Woman In Tech Should Read,"Mar 8, 2022",On this International Women’s Day 2022 we want...
3,Codeup Start Dates for March 2022,"Jan 26, 2022",As we approach the end of January we wanted to...
4,VET TEC Funding Now Available For Dallas Veterans,"Jan 7, 2022",We are so happy to announce that VET TEC benef...


In [38]:
def basic_clean(text):
    """
    Normalize a string for text processing.

    The text is NFKD-normalized, characters that cannot be encoded as ASCII
    are dropped, every character that is neither a word character nor
    whitespace is replaced by a single space, and the result is lowercased.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # Round-trip through ASCII bytes to discard anything non-ASCII.
    ascii_only = decomposed.encode("ascii", "ignore").decode("utf-8", "ignore")
    # Punctuation becomes a space (not an empty string), so runs of spaces can appear.
    spaced = re.sub(r'[^\w\s]', " ", ascii_only)
    return spaced.lower()

In [39]:
# Cast every column to string before applying the text helpers
# (basic_clean calls unicodedata.normalize, which requires str input).
df_posts = df_posts.astype(str)


In [40]:
# Add a clean_body column by running basic_clean over each post body, then preview.
df_posts["clean_body"] = df_posts["body"].apply(basic_clean)
df_posts.head()

Unnamed: 0,title,date,body,clean_body
0,Learn to Code: Python Workshop on 4/23,"Mar 31, 2022","According to LinkedIn, the “#1 Most Promising ...",according to linkedin the 1 most promising j...
1,Coming Soon: Cloud Administration,"Mar 17, 2022",We’re launching a new program out of San Anton...,were launching a new program out of san antoni...
2,5 Books Every Woman In Tech Should Read,"Mar 8, 2022",On this International Women’s Day 2022 we want...,on this international womens day 2022 we wante...
3,Codeup Start Dates for March 2022,"Jan 26, 2022",As we approach the end of January we wanted to...,as we approach the end of january we wanted to...
4,VET TEC Funding Now Available For Dallas Veterans,"Jan 7, 2022",We are so happy to announce that VET TEC benef...,we are so happy to announce that vet tec benef...


In [41]:
def tokenize(text):
    """
    Split text into a list of tokens with NLTK's ToktokTokenizer.
    """
    return ToktokTokenizer().tokenize(text)
    

In [42]:
# Tokenize each cleaned post body; every row becomes a list of tokens.
tokens = df_posts["clean_body"].apply(tokenize)
tokens.head()

0    [according, to, linkedin, the, 1, most, promis...
1    [were, launching, a, new, program, out, of, sa...
2    [on, this, international, womens, day, 2022, w...
3    [as, we, approach, the, end, of, january, we, ...
4    [we, are, so, happy, to, announce, that, vet, ...
Name: clean_body, dtype: object

In [43]:
def stem(tokens, use_tokens=False):
    """
    Stem every token with NLTK's Porter stemmer and return a single string.

    Parameters
    ----------
    tokens : str or iterable of str
        Either a whitespace-delimited string or an already tokenized
        sequence of words.
    use_tokens : bool, default False
        Retained for backward compatibility. The type of `tokens` now decides
        how it is read, so a flag that does not match the input no longer
        raises AttributeError (list with use_tokens=False) or stems the text
        one character at a time (string with use_tokens=True).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    stemmer = nltk.PorterStemmer()
    # Dispatch on the actual input type rather than on the caller-supplied flag.
    words = tokens.split() if isinstance(tokens, str) else tokens
    return " ".join(stemmer.stem(word) for word in words)

In [44]:
# Stem each row's token list; stem() joins the stems back into a single string.
stems = tokens.apply(stem, use_tokens=True)
stems.head()

0    accord to linkedin the 1 most promis job is da...
1    were launch a new program out of san antonio w...
2    on thi intern women day 2022 we want to tell s...
3    as we approach the end of januari we want to l...
4    we are so happi to announc that vet tec benefi...
Name: clean_body, dtype: object

In [45]:
def lemmatize(tokens, use_tokens=False):
    """
    Lemmatize every token with NLTK's WordNetLemmatizer and return a single string.

    Parameters
    ----------
    tokens : str or iterable of str
        Either a whitespace-delimited string or an already tokenized
        sequence of words.
    use_tokens : bool, default False
        Retained for backward compatibility. The type of `tokens` now decides
        how it is read, so a flag that does not match the input no longer
        raises AttributeError (list with use_tokens=False) or lemmatizes the
        text one character at a time (string with use_tokens=True).

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # Dispatch on the actual input type rather than on the caller-supplied flag.
    words = tokens.split() if isinstance(tokens, str) else tokens
    return " ".join(lemmatizer.lemmatize(word) for word in words)


In [47]:
# Lemmatize each row's token list; lemmatize() joins the lemmas back into a single string.
lemmas = tokens.apply(lemmatize, use_tokens=True)
lemmas.head()

0    according to linkedin the 1 most promising job...
1    were launching a new program out of san antoni...
2    on this international woman day 2022 we wanted...
3    a we approach the end of january we wanted to ...
4    we are so happy to announce that vet tec benef...
Name: clean_body, dtype: object

In [48]:
def remove_stopwords(tokens, extra_stopwords=None, exclude_stopwords=None, use_tokens=False):
    """
    Remove English stopwords from a string or a sequence of tokens.

    Parameters
    ----------
    tokens : str or iterable of str
        Either a whitespace-delimited string or an already tokenized
        sequence of words. The input type decides how it is read, so a
        string is never filtered character by character and a list never
        hits `.split()`.
    extra_stopwords : iterable of str, optional
        Additional words to treat as stopwords. None (the default) adds nothing.
    exclude_stopwords : iterable of str, optional
        Words to keep even though they are stopwords. Applied after
        `extra_stopwords`, so a word listed in both is kept.
    use_tokens : bool, default False
        Selects the return type: True returns the remaining tokens as a list,
        False returns them joined by single spaces.

    Returns
    -------
    list of str or str
        The tokens that are not stopwords, as a list when `use_tokens` is
        true and as a space-joined string otherwise.
    """
    # None defaults replace the shared mutable list defaults.
    extra = () if extra_stopwords is None else extra_stopwords
    exclude = () if exclude_stopwords is None else exclude_stopwords

    stop_words = set(stopwords.words('english'))
    stop_words |= set(extra)
    stop_words -= set(exclude)

    words = tokens.split() if isinstance(tokens, str) else tokens
    kept = [word for word in words if word not in stop_words]
    return kept if use_tokens else " ".join(kept)


In [49]:
# Drop English stopwords from the lemmatized strings; with use_tokens=False each row stays a single string.
without_stopwords = lemmas.apply(remove_stopwords, use_tokens=False)
without_stopwords.head()

0    according linkedin 1 promising job data scienc...
1    launching new program san antonio acquisition ...
2    international woman day 2022 wanted tell story...
3    approach end january wanted look forward next ...
4    happy announce vet tec benefit available used ...
Name: clean_body, dtype: object

In [57]:
# Load the Inshorts news articles through acquire and cast every column to string.
news_df = acquire.get_inshorts_articles().astype(str)
news_df.head()

Using cached version of inshorts articles


Unnamed: 0,title,author,content,date,category
0,Rupee hits all-time low of 77.42 against US do...,Apaar Sharma,The Indian rupee fell to an all-time low of 77...,"09 May 2022,Monday",business
1,Bitcoin falls to the lowest level since Januar...,Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in...","09 May 2022,Monday",business
2,Rupee closes at all-time low of 77.50 against ...,Pragya Swastik,The Indian rupee weakened further on Monday to...,"09 May 2022,Monday",business
3,Made best possible decision: IndiGo on barring...,Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...,"09 May 2022,Monday",business
4,India's biggest IPO of LIC subscribed nearly 3...,Pragya Swastik,"LIC's IPO, India's biggest IPO which opened on...","09 May 2022,Monday",business


In [54]:
# Load the posts again into a separate frame, with every column cast to string.
codeup_df = acquire.get_all_posts().astype(str)
codeup_df.head()

Using cached version of posts


Unnamed: 0,title,date,body
0,Learn to Code: Python Workshop on 4/23,"Mar 31, 2022","According to LinkedIn, the “#1 Most Promising ..."
1,Coming Soon: Cloud Administration,"Mar 17, 2022",We’re launching a new program out of San Anton...
2,5 Books Every Woman In Tech Should Read,"Mar 8, 2022",On this International Women’s Day 2022 we want...
3,Codeup Start Dates for March 2022,"Jan 26, 2022",As we approach the end of January we wanted to...
4,VET TEC Funding Now Available For Dallas Veterans,"Jan 7, 2022",We are so happy to announce that VET TEC benef...


In [55]:
# Rename the raw text column to "original" and derive the clean, stemmed and
# lemmatized columns in one chain. The chain builds a new frame instead of
# mutating the existing one with inplace=True; assign() evaluates its keyword
# arguments in order, so "stemmed" and "lemmatized" can read the new "clean" column.
codeup_df = (
    codeup_df
    .rename(columns={"body": "original"})
    .assign(
        clean=lambda frame: frame["original"].apply(basic_clean),
        stemmed=lambda frame: frame["clean"].apply(stem),
        lemmatized=lambda frame: frame["clean"].apply(lemmatize),
    )
)
codeup_df.head()

Unnamed: 0,title,date,original,clean,stemmed,lemmatized
0,Learn to Code: Python Workshop on 4/23,"Mar 31, 2022","According to LinkedIn, the “#1 Most Promising ...",according to linkedin the 1 most promising j...,accord to linkedin the 1 most promis job is da...,according to linkedin the 1 most promising job...
1,Coming Soon: Cloud Administration,"Mar 17, 2022",We’re launching a new program out of San Anton...,were launching a new program out of san antoni...,were launch a new program out of san antonio w...,were launching a new program out of san antoni...
2,5 Books Every Woman In Tech Should Read,"Mar 8, 2022",On this International Women’s Day 2022 we want...,on this international womens day 2022 we wante...,on thi intern women day 2022 we want to tell s...,on this international woman day 2022 we wanted...
3,Codeup Start Dates for March 2022,"Jan 26, 2022",As we approach the end of January we wanted to...,as we approach the end of january we wanted to...,as we approach the end of januari we want to l...,a we approach the end of january we wanted to ...
4,VET TEC Funding Now Available For Dallas Veterans,"Jan 7, 2022",We are so happy to announce that VET TEC benef...,we are so happy to announce that vet tec benef...,we are so happi to announc that vet tec benefi...,we are so happy to announce that vet tec benef...


In [59]:
# Rename the raw text column to "original" and derive the clean, stemmed and
# lemmatized columns in one chain. The chain builds a new frame instead of
# mutating the existing one with inplace=True; assign() evaluates its keyword
# arguments in order, so "stemmed" and "lemmatized" can read the new "clean" column.
news_df = (
    news_df
    .rename(columns={"content": "original"})
    .assign(
        clean=lambda frame: frame["original"].apply(basic_clean),
        stemmed=lambda frame: frame["clean"].apply(stem),
        lemmatized=lambda frame: frame["clean"].apply(lemmatize),
    )
)
news_df.head()

Unnamed: 0,title,author,original,date,category,clean,stemmed,lemmatized
0,Rupee hits all-time low of 77.42 against US do...,Apaar Sharma,The Indian rupee fell to an all-time low of 77...,"09 May 2022,Monday",business,the indian rupee fell to an all time low of 77...,the indian rupe fell to an all time low of 77 ...,the indian rupee fell to an all time low of 77...
1,Bitcoin falls to the lowest level since Januar...,Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in...","09 May 2022,Monday",business,bitcoin fell on monday to as low as 33 266 in...,bitcoin fell on monday to as low as 33 266 in ...,bitcoin fell on monday to a low a 33 266 in mo...
2,Rupee closes at all-time low of 77.50 against ...,Pragya Swastik,The Indian rupee weakened further on Monday to...,"09 May 2022,Monday",business,the indian rupee weakened further on monday to...,the indian rupe weaken further on monday to cl...,the indian rupee weakened further on monday to...
3,Made best possible decision: IndiGo on barring...,Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...,"09 May 2022,Monday",business,indigo s ceo ronojoy dutta said the airline ma...,indigo s ceo ronojoy dutta said the airlin mad...,indigo s ceo ronojoy dutta said the airline ma...
4,India's biggest IPO of LIC subscribed nearly 3...,Pragya Swastik,"LIC's IPO, India's biggest IPO which opened on...","09 May 2022,Monday",business,lic s ipo india s biggest ipo which opened on...,lic s ipo india s biggest ipo which open on ma...,lic s ipo india s biggest ipo which opened on ...


In [60]:
# 9. Lemmatization is slower than stemming, so the choice for any individual case comes down to whether you have the time and the compute to lemmatize every word in the dataset: the larger the dataset, the stronger the case for stemming.
# In general, lemmatize if you have the time and the compute.