In [1]:
import re
import requests
import math
import time
from collections import defaultdict

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [2]:
name_file = 'conn_bkm'
connections = pd.read_csv(name_file+'.csv')

In [3]:
cachedStopWords = stopwords.words("english")

def text_cleaning(x, cachedStopWords=cachedStopWords):
    """Clean text: remove non alphabetical words, stopwords and duplicate words"""
    x = re.sub(r'[-\\/()]', ' ', x)
    x = re.sub(r'[^a-zA-Z ]', '', x)
    x = x.lower()
    word_list =[]
    
    # make copy of stopwords
    words = list(cachedStopWords)
    for word in x.encode('utf-8').split():
        if word not in words:
            if word[-1]=='s':
                word = word[:-1]
                word = word.decode('utf-8')
                #word = word[1:]
                word_list.append(word)
                words += [word]
            else:
                word = word.decode('utf-8')
                #word = word[1:]
                word_list.append(word)
                words += [word]
    
    return ' '.join(word_list)


##clean the test
connections['Company'] = connections['Company'].map(str).map(lambda x: text_cleaning(x))

In [4]:
class jobListing():
    def __init__(self):
        self.url = 'https://www.careerbuilder.co.in/jobsearch'
        self.headers = {'User-Agent':  """Mozilla/5.0"""}
        self.detail = defaultdict(list)
        
    def get_soups(self, q = '',loc = '',pg=1, **kwargs ):
        """Get html of jobs page
        Params
        -------
        q: search query can be company number or title or any word in description
        loc: location
        kwargs: https://www.careerbuilder.co.in/jobsearch"""
        
        params = locals()
        params.pop('self', None)
        
        if 'kwargs' in params.keys():
            params.pop('kwargs',None)
            self.params = {**params, **kwargs}
            
        self.params_list = ''
        for i, j in self.params.items():
            if j !='':
                self.params_list += i + ': ' + str(j) + ' | '
            
        r = requests.get(self.url, headers =self.headers, params=self.params )
        self.soup = BeautifulSoup(r.content, 'lxml')
        self.soup_posting = self.soup.find_all(attrs={'class':'job-info'})
        print(self.params_list)
        print(r.url)
    
    @staticmethod
    def _get_number(x, get_last = False):
        x = re.findall(r'\d+', x)
        return int(x[0])
    
    def total_jobs(self):
        job_number = self.soup.find(attrs= {'class':'top-bloc with-button-filter'}).get_text().strip()
        no_jobs = self._get_number(job_number, get_last = True)
        print("Total jobs found {}.".format(no_jobs))
        if no_jobs>50:
            self.pages = [i for i in range(1,math.ceil(no_jobs/50)+1)]
        elif no_jobs == 0:
            self.pages = 0
        else:
            self.pages = 0
        
    def get_jobs(self):
        for job in self.soup_posting:
            self.detail['title'].append(job.find(attrs = {'itemprop':'title'}).get_text().strip())
            self.detail['employmentType'].append(job.find(attrs = {'itemprop':'employmentType'}).get_text().strip())
            self.detail['job_url'].append(job.find(attrs = {'class':'job-title'}).get('href'))
            self.detail['company'].append(job.find(attrs = {'itemprop':'hiringOrganization'}).get_text().strip())
            self.detail['salary'].append(job.find(attrs = {'itemprop':'baseSalary'}).get_text().strip())
            self.detail['location'].append(job.find(attrs = {'itemprop':'jobLocation'}).get_text().strip())
            self.detail['summary'].append(job.find(attrs = {'class':'description'}).get_text().strip())
            date = job.find(attrs = {'class':'date'}).get_text().strip()
            self.detail['date'].append(self._get_number(date))
            self.detail['query'].append(self.params_list)
            self.detail['company_q'].append(self.params['q'])
            
    def get_all_jobs(self,  q = '',loc = '',pg=1, **kwargs):
        
        #get number of pages in result
        self.get_soups(q=q, loc = loc, pg = pg)
        try:
            self.total_jobs()
#             time.sleep(1)
            if self.pages == 0:
                print('No jobs..\nTrying next item in list if any..\n '.format(self.params_list))
            else:
                for i in self.pages:
                    print("Started scraping...")
                    self.get_soups(q=q, loc = loc, pg = i)
                    self.get_jobs()
        except:
            print('No jobs..\nTrying next item in list if any..\n '.format(self.params_list))

In [5]:
jobs_india =jobListing()
for i in set(connections['Company']):
    jobs_india.get_all_jobs(q=i)
df = pd.DataFrame(jobs_india.detail)

pg: 1 | 
https://www.careerbuilder.co.in/jobsearch?pg=1&loc=&q=
No jobs..
Trying next item in list if any..
 
pg: 1 | q: bnp paribas corporate and institutional banking | 
https://www.careerbuilder.co.in/jobsearch?pg=0&q=bnp+paribas+corporate+and+institutional+banking
Total jobs found 0.
No jobs..
Trying next item in list if any..
 
pg: 1 | q: cloudfactory | 
https://www.careerbuilder.co.in/jobsearch?pg=0&q=cloudfactory
Total jobs found 0.
No jobs..
Trying next item in list if any..
 
pg: 1 | q: naseeb online services pvt ltd | 
https://www.careerbuilder.co.in/jobsearch?pg=0&q=naseeb+online+services+pvt+ltd
Total jobs found 0.
No jobs..
Trying next item in list if any..
 
pg: 1 | q: jagadamba motors heavy equipment | 
https://www.careerbuilder.co.in/jobsearch?pg=0&q=jagadamba+motors+heavy+equipment
Total jobs found 0.
No jobs..
Trying next item in list if any..
 
pg: 1 | q: jamboree education private limited | 
https://www.careerbuilder.co.in/jobsearch?pg=0&q=jamboree+education+private

In [6]:
df = pd.DataFrame(jobs_india.detail)
df.drop_duplicates('id', inplace=True)
df.to_excel(name_file+'.xlsx')
df.shape

KeyError: 'id'

In [7]:
df

Unnamed: 0,company,company_q,date,employmentType,job_url,location,query,salary,summary,title
0,AzureTeam Software development India Pvt. Ltd.,microsoft,6,Full-Time Employee,/jdp/microsoft-dynamics-365-backend-developer-...,"Ahmedabad, Gujarat, India\n\n\nGUJARAT",pg: 1 | q: microsoft |,,Immediate opening for Microsoft Dynamics 365 b...,Microsoft Dynamics 365 Backend Developer
1,Job Finder,microsoft,13,Full-Time Employee,/jdp/technical-voice-support----microsoft-365-...,Pune\n\n\nMAHARASHTRA,pg: 1 | q: microsoft |,,TOP BPO Microsoft 365 Process Technical Voice ...,Technical Voice Support - Microsoft 365
2,Job Finder,microsoft,13,Full-Time Employee,/jdp/technical-voice-support----microsoft-365-...,Gurugram\n\n\nHARYANA,pg: 1 | q: microsoft |,,TOP BPO Microsoft 365 Process Technical Voice ...,Technical Voice Support - Microsoft 365
3,Job Finder,microsoft,13,Full-Time Employee,/jdp/microsoft-365----technical-voice-support-...,Pune\n\n\nMAHARASHTRA,pg: 1 | q: microsoft |,,TOP BPO Microsoft 365 Process Technical Voice ...,Microsoft 365 - Technical Voice Support
4,Job Finder,microsoft,27,Full-Time Employee,/jdp/technical-voice-support----microsoft-offi...,Gurugram\n\n\nHARYANA,pg: 1 | q: microsoft |,,Microsoft Office 365 / Microsoft EPS Hirings T...,Technical Voice Support - Microsoft Office 365
5,Job Finder,microsoft,28,Full-Time Employee,/jdp/microsoft-365----technical-voice-support-...,Gurugram\n\n\nHARYANA,pg: 1 | q: microsoft |,,TOP BPO Microsoft 365 Process Technical Voice ...,Microsoft 365 - Technical Voice Support
6,Job Finder,microsoft,28,Full-Time Employee,/jdp/microsoft-365----technical-voice-support-...,Pune\n\n\nMAHARASHTRA,pg: 1 | q: microsoft |,,TOP BPO Microsoft 365 Process Technical Voice ...,Microsoft 365 - Technical Voice Support
7,Job Finder,microsoft,28,Full-Time Employee,/jdp/technical-voice-support----microsoft-365-...,Pune\n\n\nMAHARASHTRA,pg: 1 | q: microsoft |,,TOP BPO Microsoft 365 Process Technical Voice ...,Technical Voice Support - Microsoft 365
8,Job Finder,microsoft,28,Full-Time Employee,/jdp/technical-voice-support----microsoft-365-...,Gurugram\n\n\nHARYANA,pg: 1 | q: microsoft |,,TOP BPO Microsoft 365 Process Technical Voice ...,Technical Voice Support - Microsoft 365
9,Hexahash Technologies pvt ltd,microsoft,6,Full-Time Employee,/jdp/microsoft-dynamics-crm----senior-level-co...,"Hyderabad, Telangana, India\n\n\nTELANGANA",pg: 1 | q: microsoft |,,Microsoft Dynamics CRM is a major component of...,Microsoft Dynamics CRM - Senior Level Consultant


---