In [30]:
# Importing modules
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os


In [31]:
# Scraping News Articles for Data retrieval
# Inshorts category pages to scrape; build_dataset() uses the last path
# segment of each URL as the category label.
seed_urls = [
    'https://inshorts.com/en/read/technology',
    'https://inshorts.com/en/read/sports',
    'https://inshorts.com/en/read/world',
]

def build_dataset(seed_urls):
    """Scrape headline/article pairs from every seed URL into one DataFrame.

    Parameters
    ----------
    seed_urls : iterable of str
        Pages to download. The last '/'-separated segment of each URL is
        stored as that page's news category.

    Returns
    -------
    pandas.DataFrame
        One row per scraped article with the columns 'news_headline',
        'news_article' and 'news_category' (in that order). The frame is
        empty, but still has these three columns, when nothing was scraped.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        # The trailing path segment doubles as the category label.
        news_category = url.split('/')[-1]
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')

        headline_cards = soup.find_all('div',
                                       class_=["news-card-title news-right-box"])
        article_cards = soup.find_all('div',
                                      class_=["news-card-content news-right-box"])

        # zip() pairs the cards positionally and stops at the shorter list.
        for headline, article in zip(headline_cards, article_cards):
            # NOTE(review): BeautifulSoup's `.string` is None when the tag has
            # more than one child, so such cards would yield missing values.
            news_data.append({
                'news_headline': headline.find('span',
                                               attrs={"itemprop": "headline"}).string,
                'news_article': article.find('div',
                                             attrs={"itemprop": "articleBody"}).string,
                'news_category': news_category,
            })

    # Passing `columns` fixes the column order and, unlike selecting the
    # columns afterwards, does not raise KeyError when `news_data` is empty.
    return pd.DataFrame(news_data, columns=columns)


In [32]:
# Scrape every seed URL (this performs live HTTP requests) and preview the first rows.
news_df = build_dataset(seed_urls)
news_df.head(8)

Unnamed: 0,news_headline,news_article,news_category
0,World's smallest computer smaller than a grain...,University of Michigan has regained the title ...,technology
1,Uber driver was watching TV when self-driving ...,The safety driver of a self-driving Uber car t...,technology
2,My enemies tried to kill me: Anti-virus pionee...,"John McAfee, Founder of anti-virus software co...",technology
3,How does the world's smallest computer work?,The world's smallest computer created by the U...,technology
4,Intel's Brian Krzanich to give up $45 mn by re...,By resigning as Intel's CEO after violating a ...,technology
5,Facebook accidentally leaks sensitive data to ...,Data scandal-hit Facebook has accidentally lea...,technology
6,Who were the CEOs fired for having an affair w...,Intel CEO Brian Krzanich resigned on Thursday ...,technology
7,IIT Roorkee making drones to monitor rail tracks,IIT Roorkee is developing drones for the India...,technology


In [33]:
# Number of scraped articles per news category
news_df.news_category.value_counts()

sports        25
technology    25
world         25
Name: news_category, dtype: int64

In [42]:
# Text Wrangling and Pre-Processing
import nltk
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
import re
import contractions #import CONTRACION_MAP
import unicodedata
from __future__ import unicode_literals


In [35]:
# Shared resources for the text-cleaning helpers: `nlp` is used by
# lemmatize_text(); `tokenizer` and `stopword_list` by remove_stopwords().
# NOTE(review): the model name 'en_core' and the parse/tag/entity keyword
# arguments depend on the installed spaCy version and model link - confirm
# before re-running on a fresh environment.
nlp = spacy.load('en_core', parse = True, tag = True, entity = True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# Take the negations 'no' and 'not' out of the stopword list so that
# remove_stopwords() keeps them in the text.
stopword_list.remove('no')
stopword_list.remove('not')

In [36]:
# Removing HTML tags
def strip_html_tags(text):
    """Return the text content of `text` with the HTML markup removed.

    The input is parsed with BeautifulSoup's 'html.parser' backend and the
    result of `get_text()` on the parsed document is returned.
    """
    return BeautifulSoup(text,"html.parser").get_text()

# Demo: only the text inside the markup should remain.
strip_html_tags('<html><h2>Some important text</h2></html>')

u'Some important text'

In [37]:
# Removing accented characters
def remove_accented_chars(text):
    """Replace accented characters in `text` with their plain ASCII base letters.

    The text is NFKD-normalized, which splits an accented character into its
    base character plus combining marks; every non-ASCII code point is then
    dropped by the ASCII encode step.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
# remove_accented_chars('Some Accéntedízed text')

In [38]:
# expanding contractions
# Contractions are shortened versions of words or syllables, e.g. "don't" for "do not".

def expand_contractions(text, contraction_mapping = contractions.contractions_dict):
    """Replace every contraction found in `text` with its expanded form.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict, optional
        Maps a contraction to its expansion. Defaults to
        `contractions.contractions_dict`.

    Returns
    -------
    str
        `text` with contractions expanded (matched case-insensitively, the
        first character of each match is preserved) and every remaining
        apostrophe removed.
    """
    # Lower-cased view of the mapping so that a case-insensitive regex match
    # can still be resolved when the mapping key itself contains capitals.
    lowered_mapping = {key.lower(): value
                       for key, value in contraction_mapping.items()}

    # `re` tries alternatives left to right and accepts the first that
    # matches, so longer keys must come first or a shorter key that is a
    # prefix of a longer one would shadow it. Keys are escaped because they
    # are literal text, not regex syntax.
    ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE|re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowered_mapping.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion: leave the matched text untouched instead
            # of failing while slicing None.
            return match
        # Keep the original first character so the capitalization of the
        # matched contraction is preserved.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    # Drop any apostrophes that did not belong to a known contraction.
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# Demo: each contraction should be replaced by its expanded form.
expand_contractions("Y'all can't expand contractions I'd think")

'You all cannot expand contractions I would think'

In [39]:
# removing special characters
def remove_special_characters(text, remove_digits = False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, optional
        When True, digits are removed as well. Defaults to False.

    Returns
    -------
    str
        The cleaned text; whitespace is left as is.
    """
    # The upper-case range must end at 'Z'. A range written as 'A-z' also
    # spans the ASCII punctuation that sits between 'Z' and 'a'
    # (square brackets, backslash, caret, underscore, backtick), so those
    # characters would never be removed.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Demo: punctuation and, because remove_digits is True, digits are dropped; whitespace is kept.
remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits= True)

'Well this was fun What do you think '

In [46]:
# Stemming reduces a word to its word stem, e.g. stemming JUMPING gives JUMP.
# New affixes can be added to a stem to create new words, e.g. JUMP + 'ED' = JUMPED.

def simple_stemmer(text):
    """Stem every whitespace-separated word of `text` with NLTK's Porter stemmer.

    Returns the stems joined by single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Demo: stems are not necessarily dictionary words (see the output below).
simple_stemmer("My system keeps crashing! his crashed yesterday, ours crashes daily")

u'My system keep crashing! hi crash yesterday, our crash daili'

In [47]:
# Lemmatization: like stemming, it reduces a word to a root form, but the root (the lemma) is always a word present in the dictionary.
def lemmatize_text(text):
    """Replace each token of `text` with its lemma as produced by `nlp`.

    Tokens whose lemma is the placeholder '-PRON-' keep their original text.
    The resulting tokens are joined by single spaces.
    """
    doc = nlp(text)
    lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            # Keep the surface form when the lemma is the '-PRON-' placeholder.
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)
# Demo: same sentence as the stemming example, for comparison of the two outputs.
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

u'My system keep crash ! his crash yesterday , ours crash daily'

In [50]:
# Removing stopwords, i.e. words that have little or no significance, like a, an, the, and.
def remove_stopwords(text, is_lower_case = False):
    """Drop every token of `text` that appears in `stopword_list`.

    Parameters
    ----------
    text : str
        Text to filter; it is split with the module-level `tokenizer`.
    is_lower_case : bool, optional
        When True, tokens are compared with the stopword list as they are;
        otherwise each token is lower-cased for the comparison only.

    Returns
    -------
    str
        The surviving tokens, in their original casing, joined by spaces.
    """
    stripped_tokens = [tok.strip() for tok in tokenizer.tokenize(text)]

    kept_tokens = []
    for tok in stripped_tokens:
        # Lower-case only the comparison key; the token itself is kept as is.
        candidate = tok if is_lower_case else tok.lower()
        if candidate not in stopword_list:
            kept_tokens.append(tok)

    return ' '.join(kept_tokens)
# Demo: 'not' survives because it was removed from stopword_list; punctuation tokens are not stopwords.
remove_stopwords("The, and, if are stopwords, computer is not")

u', , stopwords , computer not'