In [19]:
from jobspy import scrape_jobs
import pandas as pd
from IPython.display import display, HTML
from datetime import datetime
from pymongo import MongoClient
from pydantic import BaseModel
from typing import Optional

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

def scrape():
    jobs = scrape_jobs(
        site_name=["linkedin", "glassdoor", "zip_recruiter", "indeed"],
        search_term="",
        location='USA',
        hyperlinks=True,
        is_remote=True,
        results_wanted=10,
    )
    return jobs

In [17]:
import re
from typing import Optional
from typing import Any

class Job(BaseModel):
    jobId: str
    title: str
    location: Any
    company: str
    link: str
    jobDescription: Optional[str]
    scapetime: datetime
    jobPosted: datetime

# Function to connect to the MongoDB database
def connect_to_database():
    client = MongoClient('mongodb://localhost:27017/')
    db = client['glassdoor']
    collection = db['jobs']
    return collection

# Function to check for duplicate records in MongoDB
def check_duplicate(job_id):
    collection = connect_to_database()
    return collection.find_one({"jobId": job_id}) is not None

# Extracting jobId from job URLs
def get_job_id(job_url: str, site: str) -> str:
    try:
        if site == 'indeed':
            job_id = re.search(r'(?<=jk=)[\w-]+', job_url).group()
        elif site == 'zip_recruiter':
            job_id = re.search(r'(?<=lvk=)[\w-]+', job_url).group()
        elif site == 'linkedin':
            job_id = re.search(r'(?<=view/)\d+', job_url).group()
        elif site == 'glassdoor':
            job_id = re.search(r'(?<=jl=)\d+', job_url).group()
        else:
            job_id = None
        return job_id
    except Exception as e:
        print(job_url)
        print(site)
        print(str(e))


# Function to get the current datetime
def current_datetime():
    return datetime.now()


In [20]:

def save_to_mongodb(job : Job):
    collection = connect_to_database()
    item_dict = job.dict()
    if not check_duplicate(item_dict['jobId']):
        collection.insert_one(item_dict)
        print(f"inserted=>{item_dict['jobId']}")
        return True
    return False

jobs = scrape()


2024-04-22 01:53:21,508 - JobSpy - INFO - LinkedIn search page: 1
2024-04-22 01:53:21,509 - JobSpy - INFO - Indeed search page: 1
2024-04-22 01:53:21,674 - JobSpy - INFO - ZipRecruiter search page: 1
2024-04-22 01:53:22,348 - JobSpy - INFO - Indeed finished scraping
2024-04-22 01:53:22,348 - JobSpy - INFO - ZipRecruiter finished scraping
2024-04-22 01:53:23,592 - JobSpy - INFO - Glassdoor search page: 1
2024-04-22 01:53:26,666 - JobSpy - INFO - Glassdoor finished scraping
2024-04-22 01:53:32,947 - JobSpy - INFO - Linkedin finished scraping


In [24]:

# # Assume jobs is your dataframe
for _, item in jobs.iterrows():
    try:
        df_item = item.to_dict()
        jobId = get_job_id(df_item['job_url_hyper'], df_item['site'])

        job = Job(
                jobId=jobId,
                title=df_item['title'],
                location=df_item['location'],
                company=df_item['company'],
                link=df_item['job_url_hyper'],
                jobDescription=df_item['description'],
                scapetime=current_datetime(),
                jobPosted=df_item['date_posted']
            )
        print(jobId)
#         print(job.dict())
        if not save_to_mongodb(job):
            pass
    except Exception as e:
        print(str(e))

1009247148035
1009247236374
1009245191473
1009187401258
inserted=>1009187401258
1009104134567
inserted=>1009104134567
1009100991693
inserted=>1009100991693
1009081152450
1009051881574
1008917604697
1008903441549
inserted=>1008903441549
ced6bbb9dc49160f
98c6f4f0cff7544c
34893914f9ea7594
12d3934d3d0fcbb8
e03be3cec74f45db
7720e9db7f001d0c
inserted=>7720e9db7f001d0c
11e19c0687823e2a
bac504b7fd10aaaa
01e98f9c84d98c60
inserted=>01e98f9c84d98c60
bfd0cd505b2a3352
inserted=>bfd0cd505b2a3352
1 validation error for Job
jobDescription
  Input should be a valid string [type=string_type, input_value=nan, input_type=float]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type
1 validation error for Job
jobDescription
  Input should be a valid string [type=string_type, input_value=nan, input_type=float]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type
1 validation error for Job
jobDescription
  Input should be a valid string [type=string_type, i

In [1]:
# for _, item in jobs.iterrows():
#     jobId = get_job_id(item['job_url_hyper'], item['site'])
#     print(jobId)
    

In [21]:
jobs

Unnamed: 0,site,job_url_hyper,job_url_direct,title,company,location,job_type,date_posted,interval,min_amount,max_amount,currency,is_remote,emails,description,company_url,company_url_direct,company_addresses,company_industry,company_num_employees,company_revenue,company_description,logo_photo_url,banner_photo_url,ceo_name,ceo_photo_url
20,glassdoor,"<a href=""https://www.glassdoor.com/job-listing...",,Document Drafting Administrator,Confidential,"New York, NY",,2024-04-22,,,,,False,,"Our construction company is an Inc. 5,000 Busi...",,,,,,,,,,,
25,glassdoor,"<a href=""https://www.glassdoor.com/job-listing...",,Receptionist,MVSI,,,2024-04-22,hourly,16.0,17.0,USD,True,,**About MVSI:** \n\nMVSI is a market leading c...,,,,,,,,,,,
28,glassdoor,"<a href=""https://www.glassdoor.com/job-listing...",,IBM Maximo Developer,ITinfra,,,2024-04-20,yearly,100000.0,100000.0,USD,True,,"**Why ITinfra?**\n\n* Small, growing, dynamic ...",,,,,,,,,,,
26,glassdoor,"<a href=""https://www.glassdoor.com/job-listing...",,"Associate Director, Accounts","Eighty Five Sixty, Inc.","Del Mar, CA",,2024-03-15,yearly,95000.0,105000.0,USD,False,HR@85SIXTY.com,"Remote, US-based \n**Overview:** \nFor over ...",https://www.glassdoor.com/Overview/W-EI_IE9429...,,,,,,,,,,
24,glassdoor,"<a href=""https://www.glassdoor.com/job-listing...",,Power BI Principal Consultant,P3 Adaptive,,,2024-02-02,yearly,100000.0,130000.0,USD,True,,**P3 Adaptive is seeking qualified candidates ...,https://www.glassdoor.com/Overview/W-EI_IE6748...,,,,,,,,,,
29,glassdoor,"<a href=""https://www.glassdoor.com/job-listing...",,DAT (Dental Admissions Test) Tutor,Inspira Education Group,,,2024-02-01,hourly,45.0,45.0,USD,True,,Inspira's mission is to democratize access to ...,,,,,,,,,,,
21,glassdoor,"<a href=""https://www.glassdoor.com/job-listing...",,Product Owner,AmTote International Inc.,,,2024-01-19,,,,,True,,Ready to take a different path? Passionate abo...,https://www.glassdoor.com/Overview/W-EI_IE1982...,,,,,,,,,,
27,glassdoor,"<a href=""https://www.glassdoor.com/job-listing...",,Tax Specialist - Trusts and Individuals,"Brotemarkle, Davis & Co. LLP",,,2023-12-30,yearly,105000.0,125000.0,USD,True,,**Job description**\n\nIf you are looking for ...,,,,,,,,,,,
22,glassdoor,"<a href=""https://www.glassdoor.com/job-listing...",,CabinetVision Engineer,Ellis and Company,,,2023-10-11,yearly,65000.0,92500.0,USD,True,careers@ellisdab.com,**Ellis and Company** is on a mission to trans...,,,,,,,,,,,
23,glassdoor,"<a href=""https://www.glassdoor.com/job-listing...",,High Ticket Closer,Five Star Body Transformations,,,2023-10-03,yearly,77421.0,212304.0,USD,True,,We're looking for our next High Ticket Closer ...,,,,,,,,,,,
