In [20]:
import pandas as pd
import numpy as np
import unicodedata
import re
import nltk
import os
import json

Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

Lowercase everything
Normalize unicode characters
Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
 def basic_clean(string):
    string = string.lower()
    string = unicodedata.normalize('NFKD', 
                          string).encode('ascii', 'ignore').decode('utf-8')
    string = re.sub(r'[^a-z0-9\s]', '', string)
    
    return string

In [3]:
test_string = "I once looked to the hill and saw upon it a GREAT FIRE. Life was lain low before it and made to be death in its wake."

output_test = basic_clean(test_string)
output_test

'i once looked to the hill and saw upon it a great fire life was lain low before it and made to be death in its wake'

Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [4]:
def tokenizer(string):
    tokenize = nltk.tokenize.ToktokTokenizer()
    string = tokenize.tokenize(string)
    return string

In [5]:
output_test = tokenizer(output_test)
output_test

['i',
 'once',
 'looked',
 'to',
 'the',
 'hill',
 'and',
 'saw',
 'upon',
 'it',
 'a',
 'great',
 'fire',
 'life',
 'was',
 'lain',
 'low',
 'before',
 'it',
 'and',
 'made',
 'to',
 'be',
 'death',
 'in',
 'its',
 'wake']

Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [6]:
def stem(string):
    ps = nltk.porter.PorterStemmer()
    string = [ps.stem(word) for word in string]
    return string

In [7]:
output_test = stem(output_test)
output_test

['i',
 'onc',
 'look',
 'to',
 'the',
 'hill',
 'and',
 'saw',
 'upon',
 'it',
 'a',
 'great',
 'fire',
 'life',
 'wa',
 'lain',
 'low',
 'befor',
 'it',
 'and',
 'made',
 'to',
 'be',
 'death',
 'in',
 'it',
 'wake']

Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [8]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fullspectrum/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fullspectrum/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/fullspectrum/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
def lemmatizer(string):
    wnl = nltk.stem.WordNetLemmatizer()
    lems = [wnl.lemmatize(word) for word in string]
    return lems

In [10]:
output_test = lemmatizer(output_test)
output_test

['i',
 'onc',
 'look',
 'to',
 'the',
 'hill',
 'and',
 'saw',
 'upon',
 'it',
 'a',
 'great',
 'fire',
 'life',
 'wa',
 'lain',
 'low',
 'befor',
 'it',
 'and',
 'made',
 'to',
 'be',
 'death',
 'in',
 'it',
 'wake']

Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [11]:
from nltk.corpus import stopwords

In [12]:
stopwords_english = stopwords.words('english')
stopwords_english[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [13]:

no_stop = [word for word in output_test if word not in stopwords_english]

' '.join(no_stop)


'onc look hill saw upon great fire life wa lain low befor made death wake'

In [14]:
def remove_stopwords(string, extra_words = [], exclue_words = []):
    stopwords_english = stopwords.words('english')
    stopwords_english = set(stopwords_english) - set(exclude_words)
    string = [word for word in string if word not in stopwords_english]
    string = ' '.join(string)
    return string



In [15]:
output_test = remove_stopwords(output_test)
output_test

'onc look hill saw upon great fire life wa lain low befor made death wake'

Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [21]:
def get_news_articles(topic_list):
    file = 'news_articles.json'
    if os.path.exists(file):
        with open(file) as f:
            return json.load(f)
    final_list = []
    for topic in topic_list:
        final_list.extend(scrap_one_page(topic))
    with open(file, 'w') as f:
        json.dump(final_list, f)
    return final_list

In [22]:
list_topics = ['business', 'sports', 'technology', 'entertainment']

final_list = get_news_articles(list_topics)

In [23]:
news_df = pd.DataFrame(final_list)
news_df

Unnamed: 0,category,title,content
0,business,Sachin Tendulkar and his wife meet Bill Gates;...,Former cricketer Sachin Tendulkar and his wife...
1,business,"'Best wishes to my classmate,' writes Gates in...",Businessman Anand Mahindra on Tuesday met Micr...
2,business,Ambanis should get Z+ security cover across In...,The Supreme Court on Tuesday stated that the Z...
3,business,Apple supplier Foxlink's fire safety systems f...,Most of the fire safety equipment at Apple sup...
4,business,People consuming 30 GB and paying almost nothi...,Bharti Airtel is looking to raise mobile phone...
...,...,...,...
95,entertainment,"Courteney gets star on H'wood Walk of Fame, An...",Actress Courteney Cox was honoured with a star...
96,entertainment,Reactions to trailer are too special: Rani on ...,Actress Rani Mukerji said that the reactions t...
97,entertainment,It's been pretty psychotic: Shahid Kapoor on h...,Shahid Kapoor spoke about his journey as an ac...
98,entertainment,Was nervous dancing with Sridevi & Urmila: Ani...,Marking 26 years since the release of his film...


In [24]:
news_df['content'].apply(basic_clean)\
.apply(tokenizer)\
.apply(lemmatizer)\
.apply(remove_stopwords)

0     former cricketer sachin tendulkar wife anjali ...
1     businessman anand mahindra tuesday met microso...
2     supreme court tuesday stated z security cover ...
3     fire safety equipment apple supplier foxlinks ...
4     bharti airtel looking raise mobile phone call ...
                            ...                        
95    actress courteney cox wa honoured star hollywo...
96    actress rani mukerji said reaction trailer upc...
97    shahid kapoor spoke journey actor calling pret...
98    marking 26 year since release film judaai anil...
99    singer prateek kuhad revealed bollywood direct...
Name: content, Length: 100, dtype: object

a. lem
b. stem
c. stem if you're on time and money