<a href="https://colab.research.google.com/github/AgrawalHimanshi/ML-NLP/blob/master/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [0]:
# Inshorts category pages to scrape. build_dataset() uses the last path
# segment of each URL (e.g. 'technology') as the news_category label.
seed_urls = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

In [0]:

def build_dataset(seed_urls, timeout=10):
    """Scrape news category pages into a DataFrame of headline/article pairs.

    Parameters
    ----------
    seed_urls : iterable of str
        Category page URLs. The last non-empty path segment of each URL is
        used as the ``news_category`` label for every item found on that page.
    timeout : float or None, optional
        Seconds to wait for each HTTP response before ``requests`` raises a
        timeout error. Pass ``None`` to wait indefinitely (the previous
        behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``news_headline``, ``news_article`` and ``news_category``, in
        that order, one row per headline/article pair. When nothing was
        scraped the frame is empty but still carries these three columns.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        # rstrip so a trailing slash does not produce an empty category name.
        news_category = url.rstrip('/').split('/')[-1]
        data = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(data.content, 'html.parser')

        headline_cards = soup.find_all(
            'div', class_=["news-card-title news-right-box"])
        article_cards = soup.find_all(
            'div', class_=["news-card-content news-right-box"])

        # Cards are paired by position; zip stops at the shorter list.
        for headline, article in zip(headline_cards, article_cards):
            news_data.append({
                'news_headline': headline.find(
                    'span', attrs={"itemprop": "headline"}).string,
                'news_article': article.find(
                    'div', attrs={"itemprop": "articleBody"}).string,
                'news_category': news_category,
            })

    # Passing columns= fixes the column order and keeps the columns present
    # even when news_data is empty (selecting them afterwards raised KeyError).
    return pd.DataFrame(news_data, columns=columns)

In [9]:
news_df=build_dataset(seed_urls)
news_df.head(10)

Unnamed: 0,news_headline,news_article,news_category
0,Jio GigaFiber to launch on Sept 5; plans to st...,Mukesh Ambani has announced that Jio GigaFiber...,technology
1,Jio to set up data centres across India in par...,At the annual general meeting of Reliance Indu...,technology
2,Knives sold on FB without age check despite ba...,Knives are being sold on Facebook's e-commerce...,technology
3,YouTube moderators claim it bends rules to fav...,YouTube allegedly favours some popular channel...,technology
4,Chennai firm builds attendance system with bre...,Chennai-based Ramco Systems has designed a fac...,technology
5,Google marks ISRO Founder's 100th birth annive...,Google on Monday celebrated the 100th birth an...,technology
6,TikTok owner ByteDance launches censored searc...,Lip-syncing video app TikTok's owner ByteDance...,technology
7,Researchers demo way to fool Apple's Face ID f...,Tencent Security researchers at the 2019 Black...,technology
8,Do not advertise 'illegal' events: Russia to G...,Russian state communications watchdog Roskomna...,technology
9,"Group sex app allegedly exposes White House, U...",A flaw in group sex app '3Fun' found by securi...,technology


In [10]:
news_df.news_category.value_counts()

technology    25
sports        25
world         25
Name: news_category, dtype: int64

**Text Wrangling and Pre-processing**

In [0]:
import spacy
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup

import unicodedata

In [12]:
 from google.colab import files

# Colab-only step: upload local files into the runtime. The helper module
# contractions_py.py (imported in the next cell) is expected to be uploaded here.
uploaded = files.upload()

# Report the name and size of every uploaded file.
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving contractions_py.py to contractions_py (1).py
User uploaded file "contractions_py.py" with length 3250 bytes


In [0]:
from contractions_py import CONTRACTION_MAP

In [14]:
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [15]:
!python -m spacy download en_core_web_md

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [0]:
# Shared spaCy pipeline used by lemmatize_text().
# NOTE(review): the cells above download 'en_core_web_lg' and 'en_core_web_md',
# but this loads the 'en' shortcut instead — confirm which model is intended.
nlp = spacy.load('en', parse = True, tag=True, entity=True)

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# Tokenizer used by remove_stopwords() to split text into tokens.
tokenizer = ToktokTokenizer()
# Start from NLTK's English stopword list ...
stopword_list = nltk.corpus.stopwords.words('english')
# ... but take 'no' and 'not' out of it so remove_stopwords() keeps them
# (presumably because negations matter for sentiment — confirm).
stopword_list.remove('no')
stopword_list.remove('not')

**Remove HTML tags**

In [23]:
def strip_html_tags(text):
  """Return the text content of `text` with HTML markup removed.

  The input is parsed with BeautifulSoup's built-in 'html.parser' and only
  the text nodes are returned.
  """
  return BeautifulSoup(text, "html.parser").get_text()

strip_html_tags('<html><h2>someone</h2></html>')


'someone'

**Remove accented characters**

In [24]:
def remove_accented_chars(text):
  """Replace accented characters in `text` with their plain ASCII base letters.

  The text is NFKD-normalized, which splits an accented letter into its base
  letter plus combining marks; encoding to ASCII with errors ignored then
  drops the marks. Any other character with no ASCII form is dropped too.
  """
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')
remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

**Expand contractions**

In [0]:
def expand_contractions(text, contraction_mapping=None):
  """Expand contractions in `text` (e.g. "can't" -> "cannot").

  Parameters
  ----------
  text : str
      Text to process.
  contraction_mapping : dict of str -> str, optional
      Maps each contraction to its expanded form. Defaults to the
      CONTRACTION_MAP imported from contractions_py; the default is resolved
      at call time so this function can be defined before that import runs.

  Returns
  -------
  str
      `text` with every contraction found in the mapping replaced. Matching
      is case-insensitive; when the matched contraction starts with an
      upper-case letter, the expansion is capitalized as well.
  """
  if contraction_mapping is None:
    contraction_mapping = CONTRACTION_MAP
  if not contraction_mapping:
    # An empty alternation would match the empty string everywhere.
    return text

  lowered = {key.lower(): value for key, value in contraction_mapping.items()}
  # Longest keys first so a longer contraction wins over a shorter prefix of it.
  alternatives = sorted(lowered, key=len, reverse=True)
  pattern = re.compile('|'.join(re.escape(key) for key in alternatives),
                       flags=re.IGNORECASE)

  def _expand(match):
    contraction = match.group(0)
    expanded = lowered[contraction.lower()]
    if contraction[0].isupper() and expanded:
      expanded = expanded[0].upper() + expanded[1:]
    return expanded

  return pattern.sub(_expand, text)

In [28]:
def remove_special_characters(text, remove_digits=False):
  """Strip every character that is not a letter, digit or whitespace.

  Parameters
  ----------
  text : str
      Text to clean.
  remove_digits : bool, optional
      When True, digits are removed as well, leaving only ASCII letters and
      whitespace.

  Returns
  -------
  str
      The cleaned text.
  """
  # Both patterns are negated classes: they match what must be REMOVED.
  # (The default pattern previously lacked '^' and deleted the letters and
  # digits themselves; 'A-z' also let through the punctuation between 'Z'
  # and 'a' in ASCII.)
  pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
  return re.sub(pattern, '', text)
remove_special_characters("Well this was fun! What do you think? 123#@!", 
                          remove_digits=True)

'Well this was fun What do you think '

In [30]:
def lemmatize_text(text):
  """Return `text` with each token replaced by its lemma, space-joined.

  The text is run through the module-level spaCy pipeline `nlp`. Tokens whose
  lemma is the placeholder '-PRON-' keep their original surface text.
  """
  pieces = []
  for token in nlp(text):
    if token.lemma_ == '-PRON-':
      pieces.append(token.text)
    else:
      pieces.append(token.lemma_)
  return ' '.join(pieces)

lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

'My system keep crash ! his crashed yesterday , ours crash daily'

**text stemming**

In [31]:
def simple_stemmer(text):
  """Porter-stem every whitespace-separated word of `text`.

  Words are split with str.split(), so punctuation attached to a word is
  stemmed together with it. The stems are re-joined with single spaces.
  """
  stemmer = nltk.porter.PorterStemmer()
  stems = map(stemmer.stem, text.split())
  return ' '.join(stems)

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

**Remove stopwords**

In [32]:
def remove_stopwords(text, is_lower_case=False):
  """Drop stopwords from `text` and return the remaining tokens space-joined.

  The text is split with the module-level `tokenizer` and each token is
  checked against the module-level `stopword_list`.

  Parameters
  ----------
  text : str
      Text to filter.
  is_lower_case : bool, optional
      If True, tokens are compared to the stopword list as they are; if
      False, each token is lower-cased for the comparison only (the kept
      tokens retain their original casing).
  """
  kept = []
  for raw_token in tokenizer.tokenize(text):
    token = raw_token.strip()
    candidate = token if is_lower_case else token.lower()
    if candidate not in stopword_list:
      kept.append(token)
  return ' '.join(kept)

remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

In [0]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
  normalized_corpus=[]
  for doc in corpus:
    if html_stripping:
      doc=strip_html_tags(doc)
    # remove accented characters
    if accented_char_removal:
      doc=remove_accented_chars(doc)
     # expand contractions    
    if contraction_expansion:
      doc = expand_contractions(doc)
    #text lowercase
    if text_lower_case:
      doc=doc.lower()
    # remove extra newlines
    doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
     # lemmatize text
    if text_lemmatization:
      doc = lemmatize_text(doc)