# Topics
- Web scraping
- Text summarization
- Topic Modeling
- Named Entity Recognition
- Word2vec

In [2]:
import pandas as pd
import nltk
import numpy as np
from bs4 import BeautifulSoup
import requests

In [14]:
url = 'https://www.monsterindia.com/data-science-jobs.html'
page = requests.get(url).text
type(page)

str

In [17]:
bs = BeautifulSoup(page, 'html.parser')

In [22]:
bs.title.text

'Data Science Jobs - 118 Data Science Job Vacancies - Monster India'

In [34]:
jobs = bs.findAll('div', {'class': 'jobwrap', 'type': 'tuple'})

In [48]:
jobs_df = pd.DataFrame(columns=['title', 'company', 'location',
                                'exp', 'posted_at', 'skills'])
for job in jobs:
    title = job.find('span', {'class': 'title_in'}).text
    company = job.find('a', {'class': 'jtxt orange'}).get('title')
    location = job.find('div', {'class': 'jtxt jico ico1'}).text
    exp = job.find('div', {'class': 'jtxt jico ico2'}).text
    posted_at = job.find('div', {'class': 'job_optitem ico7',
                                 'itemprop':'datePosted'}).text
    skills = job.find('div', {'class': 'joblnk serachjoblnk'}).findAll('div')[4].get('title')
    #print(skills)
    curr_job = {'title': title,
                'company': company,
                'location': location,
                'experience': exp,
                'posted_at': posted_at,
                'skills': skills
               }
    jobs_df = jobs_df.append(curr_job, ignore_index=True)
jobs_df.to_csv('monster.csv', index=False)

In [67]:
def parse_url(url):
    page = requests.get(url).text
    bs = BeautifulSoup(page, 'html.parser')
    return (bs)

def get_parents(bs):
    return (bs.findAll('div', {'class': 'jobwrap', 'type': 'tuple'}))

def get_attributes(job):
    title = job.find('span', {'class': 'title_in'}).text
    company = job.find('a', {'class': 'jtxt orange'}).get('title')
    location = job.find('div', {'class': 'jtxt jico ico1'}).text
    exp = job.find('div', {'class': 'jtxt jico ico2'}).text
    posted_at = job.find('div', {'class': 'job_optitem ico7',
                                 'itemprop':'datePosted'}).text
    skills = job.find('div', {'class': 'joblnk serachjoblnk'}).findAll('div')[4].get('title')
    curr_job = {'title': title,
                'company': company,
                'location': location,
                'experience': exp,
                'posted_at': posted_at,
                'skills': skills
               }
    return curr_job

jobs_df = pd.DataFrame(columns=['title', 'company', 'location',
                                'exp', 'posted_at', 'skills'])

for i in range(1,6):
    url = 'https://www.monsterindia.com/data-science-jobs-%d.html' % i
    print(url)
    bs = parse_url(url)
    parents = get_parents(bs)
    for job in parents:
        try:
            jobs_df = jobs_df.append(get_attributes(job), ignore_index=True)
        except AttributeError:
            pass
jobs_df.shape

https://www.monsterindia.com/data-science-jobs-1.html
https://www.monsterindia.com/data-science-jobs-2.html
https://www.monsterindia.com/data-science-jobs-3.html
https://www.monsterindia.com/data-science-jobs-4.html
https://www.monsterindia.com/data-science-jobs-5.html


(82, 7)

## Text Summarization

In [None]:
!pip install sumy

In [70]:
amazon = pd.read_csv('https://raw.githubusercontent.com/skathirmani/datasets/master/amazon_reviews.csv')

In [81]:
#nltk.download('punkt')

In [82]:
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

In [87]:
parser = PlaintextParser.from_string(amazon['reviewText'][0], Tokenizer("english"))
summarizer = LexRankSummarizer()
sentences = summarizer(parser.document, 3)
#sentences

In [85]:
url = 'https://www.yahoo.com/movies/rss'
page = requests.get(url).text
bs = BeautifulSoup(page, 'lxml')

In [99]:
parents = bs.findAll('item')
# len(parents) #output: 50

In [None]:
## Step 1: Read the url and convert that to BS object
    bs = parse_url(article_link)
    
    ## Step 2: Get all the paragraph tags in a list using findAll('p')
    paras = bs.findAll('p')
    
    ## Step 3: Loop through each paragraph and join the strings
    
    ## Step 4: Using lexrank summarizer to summarize the article in 3 sentences
    
    ## Step 5: Print the sentences

In [110]:
summarizer = LexRankSummarizer()
for item in parents:
    title = item.find('title').text
    desc = item.find('description').text
    bs_desc = BeautifulSoup(desc, 'html.parser')
    article_link = bs_desc.find('a').get('href')
    
    bs = parse_url(article_link)
    ptags = bs.findAll('p')
    article_content = '. '.join([para.text for para in ptags])
    
    parser = PlaintextParser.from_string(article_content,
                                         Tokenizer("english"))
    sentences = summarizer(parser.document, 3)
    #print ('------------TITLE---------------')
    #print(title)
    #print('------SUMMARY------')
    #print(sentences)
    #print('------------------END--------------------------')

## Word2vec
- DTM
    - Sparse matrix
    - High dimensional

In [1]:
import gensim



In [2]:
# Extract GoogleNews....bin.gz
path = 'GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(path,
                                                        binary=True)
model

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x1122f145da0>

In [116]:
len(model.get_vector('computer'))

300

In [None]:
model.most_similar('computer', topn=5)