In [49]:
import re
import requests
import time
from collections import defaultdict

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [50]:
# Base name shared by the input CSV and the Excel export written further down.
name_file = 'conn_bkm'
# Load the connections table from "<name_file>.csv".
connections = pd.read_csv('{}.csv'.format(name_file))

In [51]:
# English stop words, loaded once and reused as the default for text_cleaning().
cachedStopWords = stopwords.words("english")

def text_cleaning(x, cachedStopWords=cachedStopWords):
    """Normalise free text into a de-duplicated, space-separated word string.

    Steps, in order:
      1. replace ``-``, ``\\``, ``/``, ``(`` and ``)`` with spaces;
      2. drop every character that is not an ASCII letter or a space;
      3. lower-case the text;
      4. drop stop words (matched on the token before plural stripping);
      5. strip one trailing ``s`` from tokens longer than one character;
      6. keep only the first occurrence of each resulting token.

    Params
    -------
    x: text to clean (must be a ``str``)
    cachedStopWords: iterable of lower-case words to discard

    Returns
    -------
    The surviving tokens joined by single spaces, in first-seen order.
    """
    x = re.sub(r'[-\\/()]', ' ', x)
    x = re.sub(r'[^a-zA-Z ]', '', x)
    x = x.lower()

    stop_words = set(cachedStopWords)
    seen = set()
    word_list = []

    # Tokens stay ``str``: the previous encode()/decode() round trip made every
    # comparison bytes-vs-str under Python 3, so nothing was ever filtered.
    for word in x.split():
        if word in stop_words:
            continue
        # The length guard keeps a lone "s" from collapsing to an empty token.
        if len(word) > 1 and word.endswith('s'):
            word = word[:-1]
        # De-duplicate on the normalised form so "jobs jobs" yields one "job".
        if word in seen:
            continue
        seen.add(word)
        word_list.append(word)

    return ' '.join(word_list)


# Clean the company names (they are used as search queries further down).
connections['Company'] = connections['Company'].map(str).map(text_cleaning)

In [52]:
class jobListing():
    """Scraper for Monster India search-result pages.

    Parsed fields of every posting seen so far are accumulated column-wise in
    ``self.detail`` (a ``defaultdict(list)``); each call to ``get_jobs`` appends
    exactly one value per column per posting, so the columns stay aligned.
    """

    # Step used by total_jobs() to turn the reported job count into page offsets.
    jobs_per_page = 40
    # Seconds get_all_jobs() sleeps after reading the job count.
    request_delay = 30
    # Seconds to wait for the server before a request is abandoned.
    request_timeout = 30
    # Zero-based page indexes that get_all_jobs() skips.
    # NOTE(review): index 7 was skipped without explanation in the original
    # code; behaviour kept as-is -- confirm whether it is still needed.
    skipped_page_indexes = (7,)
    # (output column, itemprop value) for the per-posting itemprop fields.
    _itemprop_fields = (
        ('company', 'hiringOrganization'),
        ('skills', 'skills'),
        ('location', 'jobLocation'),
        ('experience', 'experienceRequirements'),
        ('date', 'datePosted'),
    )

    def __init__(self, country_url = 'http://www.monsterindia.com/'):
        """Store the site root URL, the request headers and an empty result store.

        Params
        -------
        country_url: site root; the page path is appended to it verbatim
        """
        self.url = country_url
        self.headers = {'User-Agent':  """Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"""}
        self.detail = defaultdict(list)

    def get_soups(self, q = '', loc = '', page=1):
        """Download and parse one search-result page.

        Sets ``self.params_list`` (page path), ``self.urls`` (full URL),
        ``self.soup`` (parsed page) and ``self.soup_posting`` (posting nodes),
        then prints the page path and the final request URL.

        Params
        -------
        q: search query can be company number or title or any word in description
        loc: location
        page: 1-based result page number
        """
        # Collapse whitespace runs, then hyphenate to build the URL slug.
        q = re.sub(r'\s+', ' ', q).strip()
        q = q.replace(' ', '-')

        self.params_list = q + '-jobs-in-' + loc + '-' + str(page) + '.html'
        self.urls = self.url + self.params_list

        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(self.urls, headers=self.headers, timeout=self.request_timeout)
        self.soup = BeautifulSoup(r.content, 'lxml')
        self.soup_posting = self.soup.find_all(attrs={'class':'jobwrap '})
        print(self.params_list)
        print(r.url)

    @staticmethod
    def _get_number(x, get_last = False):
        """Return the last run of digits in ``x`` as an int.

        ``get_last`` is accepted for backward compatibility but is not
        consulted: the last number is always returned.

        Raises
        -------
        ValueError: if ``x`` contains no digits
        """
        numbers = re.findall(r'\d+', x)
        if not numbers:
            raise ValueError('no number found in {!r}'.format(x))
        return int(numbers[-1])

    @staticmethod
    def _text_or_none(tag):
        """Return the stripped text of ``tag``, or None when the tag is missing."""
        if tag is None:
            return None
        return tag.get_text().strip()

    def total_jobs(self):
        """Read the job count from the loaded page and derive ``self.pages``.

        ``self.pages`` holds one offset per ``jobs_per_page`` jobs; only its
        length is used by the callers in this class.

        Raises
        -------
        ValueError: if the count element or a number inside it is missing
        """
        count_tag = self.soup.find(attrs={'class':'count pull-left'})
        if count_tag is None:
            raise ValueError('job count element not found in the results page')
        no_jobs = self._get_number(count_tag.get_text().strip(), get_last = True)
        print("Total jobs found {}.".format(no_jobs))
        self.pages = list(range(0, no_jobs, self.jobs_per_page))

    def get_jobs(self):
        """Append the fields of every posting on the loaded page to ``self.detail``.

        A field whose element is absent is stored as None. Each posting is
        parsed completely before anything is appended, so a malformed posting
        can no longer leave the columns with different lengths.
        """
        for job in self.soup_posting:
            title_tag = job.find(attrs={'class':'title_in'})
            record = [
                ('title', self._text_or_none(title_tag)),
                ('id', job.get('id')),
                ('job_url', title_tag.get('href') if title_tag is not None else None),
            ]
            for column, itemprop in self._itemprop_fields:
                record.append((column, self._text_or_none(job.find(attrs={'itemprop': itemprop}))))
            record.append(('query', self.params_list))

            for column, value in record:
                self.detail[column].append(value)

    def get_all_jobs(self, q = '', loc = '', page=1):
        """Scrape every result page for one query into ``self.detail``.

        The page given by ``page`` is fetched first to read the total job
        count; result pages are then fetched starting from page 1, except the
        indexes listed in ``skipped_page_indexes``. Errors raised while
        counting or scraping are reported and swallowed so that a caller
        looping over many queries can continue with the next one.

        Params
        -------
        q: search query can be company number or title or any word in description
        loc: location
        page: page used for the initial job-count request
        """
        # Get the number of pages in the result.
        self.get_soups(q = q, loc = loc, page=page)
        try:
            self.total_jobs()
            time.sleep(self.request_delay)
            print("Started scraping...")
            for i, _ in enumerate(self.pages):
                if i in self.skipped_page_indexes:
                    continue
                self.get_soups(q = q, loc = loc, page=i+1)
                self.get_jobs()
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C still interrupts the run.
            print('No jobs for {} ({!r})..\nTrying next item in list if any..\n '.format(self.params_list, exc))

In [53]:
# Spot-check the raw "date posted" text of the second posting on a results page.
# The scraper instance is created here so the cell also runs on a fresh kernel
# instead of relying on a `test` object left over from an earlier session.
test = jobListing()
test.get_soups(q='data', loc='bengaluru-bangalore', page=1)
test.soup_posting[1].find(attrs = {'itemprop':'datePosted'}).get_text().strip()

'Posted : 2nd Aug 2017'

In [54]:
def get_all_jobs(q='data', loc='bengaluru-bangalore'):
    """Scrape every result page for one query and return the collected fields.

    Params
    -------
    q: search query passed to jobListing.get_soups
    loc: location slug passed to jobListing.get_soups

    Returns
    -------
    The scraper's ``detail`` mapping (column name -> list of values).
    """
    listing = jobListing()
    listing.get_soups(q = q, loc = loc, page=1)
    listing.total_jobs()

    for page_no in range(1, len(listing.pages) + 1):
        # Page 1 is already loaded from the job-count request above, so it is
        # parsed directly instead of being downloaded a second time.
        if page_no > 1:
            listing.get_soups(q = q, loc = loc, page=page_no)
        listing.get_jobs()

    return listing.detail

all_jobs = get_all_jobs()

data-jobs-in-bengaluru-bangalore-1.html
http://www.monsterindia.com/data-jobs-in-bengaluru-bangalore-1.html
Total jobs found 1602.
data-jobs-in-bengaluru-bangalore-1.html
http://www.monsterindia.com/data-jobs-in-bengaluru-bangalore-1.html
data-jobs-in-bengaluru-bangalore-2.html
http://www.monsterindia.com/data-jobs-in-bengaluru-bangalore-2.html
data-jobs-in-bengaluru-bangalore-3.html
http://www.monsterindia.com/data-jobs-in-bengaluru-bangalore-3.html
data-jobs-in-bengaluru-bangalore-4.html
http://www.monsterindia.com/data-jobs-in-bengaluru-bangalore-4.html
data-jobs-in-bengaluru-bangalore-5.html
http://www.monsterindia.com/data-jobs-in-bengaluru-bangalore-5.html
data-jobs-in-bengaluru-bangalore-6.html
http://www.monsterindia.com/data-jobs-in-bengaluru-bangalore-6.html
data-jobs-in-bengaluru-bangalore-7.html
http://www.monsterindia.com/data-jobs-in-bengaluru-bangalore-7.html
data-jobs-in-bengaluru-bangalore-8.html
http://www.monsterindia.com/data-jobs-in-bengaluru-bangalore-8.html
data-

In [55]:
# Number of distinct values collected for each scraped field.
for field, values in all_jobs.items():
    print(field, '>>>', len(set(values)))

title >>> 1059
id >>> 1262
job_url >>> 1264
company >>> 436
skills >>> 973
location >>> 34
experience >>> 137
date >>> 91
query >>> 38


In [None]:
%%capture output
jobs_india =joblisting()
for i in set(connections['Company']):
    jobs_india.get_all_jobs(as_cmp=i)
df = pd.DataFrame(jobs_india.detail)

In [None]:
# Build the postings table, keeping the first row seen for each job id.
df = pd.DataFrame(jobs_india.detail).drop_duplicates('id')
# Export using the same base name as the input file.
df.to_excel('{}.xlsx'.format(name_file))
df.shape

In [1]:
# Preview the first rows of the postings table; `df` only exists once the
# cells above that build it have been run in this kernel.
df.head()

NameError: name 'df' is not defined

---