In [1]:
import re
import requests
import math
import time
from collections import defaultdict

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [2]:
name_file = 'conn_bkm'
connections = pd.read_csv(name_file+'.csv')

In [3]:
cachedStopWords = stopwords.words("english")

def text_cleaning(x, cachedStopWords=cachedStopWords):
    """Clean text: remove non alphabetical words, stopwords and duplicate words"""
    x = re.sub(r'[-\\/()]', ' ', x)
    x = re.sub(r'[^a-zA-Z ]', '', x)
    x = x.lower()
    word_list =[]
    
    # make copy of stopwords
    words = list(cachedStopWords)
    for word in x.encode('utf-8').split():
        if word not in words:
            if word[-1]=='s':
                word = word[:-1]
                word = word.decode('utf-8')
                #word = word[1:]
                word_list.append(word)
                words += [word]
            else:
                word = word.decode('utf-8')
                #word = word[1:]
                word_list.append(word)
                words += [word]
    
    return ' '.join(word_list)


##clean the test
connections['Company'] = connections['Company'].map(str).map(lambda x: text_cleaning(x))

In [4]:
class jobListing():
    def __init__(self):
        self.url = 'https://www.glassdoor.co.in/Job/india-{}--SRCH_IL.0,5_IN115_KE6,10_IP{}.htm'
        self.headers = {'User-Agent':  """Mozilla/5.0"""}
        self.detail = defaultdict(list)
        
    def get_soups(self, q = '', page=1):
        """Get html of jobs page
        Params
        -------
        q: search query can be company number or title or any word in description
        """
        
        self.params = '-'.join(q.split())
        self.params_list = self.params
            
        r = requests.get(self.url.format(self.params, page), headers =self.headers )
        self.soup = BeautifulSoup(r.content, 'lxml')
        self.soup_posting = self.soup.find_all(attrs={'class':re.compile(r'jl|jl selected')})
        print(self.params_list)
        print(r.url)
    
    @staticmethod
    def _get_number(x, get_last = False):
        x = re.findall(r'\d+', x)
        return int(x[0])
    
    def total_jobs(self):
        job_number = self.soup.find(attrs= {'class':'jobsCount'}).get_text().strip()
        no_jobs = self._get_number(job_number, get_last = True)
        print("Total jobs found {}.".format(no_jobs))
        if no_jobs>31:
            self.pages = [i for i in range(1,math.ceil(no_jobs/31)+1)]
        elif no_jobs == 0:
            self.pages = 0
        else:
            self.pages = 0
        
    def get_jobs(self):
        for job in self.soup_posting:
            self.detail['id'].append(job.get('data-id'))
            self.detail['id_company'].append(job.get('data-emp-id'))
            self.detail['title'].append(job.find(attrs={'class':'flexbox'}).find(attrs = {'class':'jobLink'}).get_text().strip())
            self.detail['job_url'].append(job.find(attrs={'class':'flexbox'}).find(attrs = {'class':'jobLink'}).get('href'))
            self.detail['location'].append(job.find(attrs = {'class':'subtle loc'}).get_text().strip())
            self.detail['rating'].append(job.find(attrs = {'class':'compactStars '}).get_text().strip())
            self.detail['date'].append(job.find(attrs = {'class':re.compile(r'showHH nowrap|hotListing')}).get_text().strip())
            self.detail['query'].append(self.params_list)
            self.detail['company_q'].append(self.params_list)
            
    def get_all_jobs(self,  q = '', page=1):
        
        #get number of pages in result
        self.get_soups(q=q, page=1)
        try:
            self.total_jobs()
#             time.sleep(1)
            if self.pages == 0:
                print('No jobs..\nTrying next item in list if any..\n '.format(self.params_list))
            else:
                for i in self.pages:
                    print("Started scraping...")
                    self.get_soups(q=q, page=i)
                    self.get_jobs()
        except:
            print('No jobs..\nTrying next item in list if any..\n '.format(self.params_list))

In [5]:
test = jobListing()
test.get_soups(q='uber')
test.get_jobs()
len(test.detail['id'])
pd.DataFrame(test.detail)

uber
https://www.glassdoor.co.in/Job/india-uber-jobs-SRCH_IL.0,5_IN115_KE6,10.htm


Unnamed: 0,company_q,date,id,id_company,job_url,location,query,rating,title
0,uber,New,,,/partner/jobListing.htm?pos=101&ao=134571&s=58...,Hyderabad,uber,4.3,Software Engineer
1,uber,New,2461147828.0,575263.0,/partner/jobListing.htm?pos=101&ao=134571&s=58...,Hyderabad,uber,4.3,Software Engineer
2,uber,New,2534696306.0,575263.0,/partner/jobListing.htm?pos=102&ao=101635&s=58...,India,uber,4.3,Enterprise Account Executive
3,uber,6 d,2507449991.0,575263.0,/partner/jobListing.htm?pos=103&ao=101637&s=58...,Bengaluru,uber,4.3,Sr Software Engineer (Frontend)
4,uber,HOT,2474252384.0,575263.0,/partner/jobListing.htm?pos=104&ao=134571&s=58...,Bengaluru,uber,4.3,Software Engineer II
5,uber,8 d,2528316352.0,575263.0,/partner/jobListing.htm?pos=105&ao=101637&s=58...,Bengaluru,uber,4.3,Software Engineer 2 - Frontend Engineer
6,uber,12 d,2525542078.0,575263.0,/partner/jobListing.htm?pos=106&ao=101637&s=58...,India,uber,4.3,Data Science Manager - Advanced Analytics
7,uber,27 d,2513527554.0,575263.0,/partner/jobListing.htm?pos=107&ao=134571&s=58...,Hyderabad,uber,4.3,Business Analyst - COE
8,uber,New,2533817069.0,575263.0,/partner/jobListing.htm?pos=108&ao=101635&s=58...,India,uber,4.3,"Sales Manager, UberEATS - Bangalore"
9,uber,4 d,2484110382.0,575263.0,/partner/jobListing.htm?pos=109&ao=134571&s=58...,Gurgaon,uber,4.3,Community Operations Manager - RCO


In [7]:
jobs_india =jobListing()
#connections['Company']
for i in set(['uber', 'ola']):
    jobs_india.get_all_jobs(q=i)
df = pd.DataFrame(jobs_india.detail)
jobs_india.detail

uber
https://www.glassdoor.co.in/Job/india-uber-jobs-SRCH_IL.0,5_IN115_KE6,10.htm
Total jobs found 87.
Started scraping...
uber
https://www.glassdoor.co.in/Job/india-uber-jobs-SRCH_IL.0,5_IN115_KE6,10.htm
Started scraping...
uber
https://www.glassdoor.co.in/Job/india-uber-jobs-SRCH_IL.0,5_IN115_KE6,10_IP2.htm
Started scraping...
uber
https://www.glassdoor.co.in/Job/india-uber-jobs-SRCH_IL.0,5_IN115_KE6,10_IP3.htm
ola
https://www.glassdoor.co.in/Job/india-ola-jobs-SRCH_IL.0,5_IN115_KE6,9.htm
Total jobs found 196.
Started scraping...
ola
https://www.glassdoor.co.in/Job/india-ola-jobs-SRCH_IL.0,5_IN115_KE6,9.htm
No jobs..
Trying next item in list if any..
 


ValueError: arrays must all be same length

In [None]:
df = pd.DataFrame(jobs_india.detail)
df.drop_duplicates('id', inplace=True)
df.to_excel(name_file+'.xlsx')
df.shape

In [8]:
jobs_india.pages

[1, 2, 3, 4, 5, 6, 7]

---