In [None]:
%load_ext autoreload
%autoreload 2

In [1]:

## IMPORT NECESSARY LIBRARIES
import pandas as pd
import csv

import json
import datetime

# Scraping google jobs w/ serpapi
import serpapi

# generate UULE code from adress
import uule_grabber

# connect to SQLite database
import sqlite3

from sqlalchemy import create_engine
from jobsearch.database import export_dataframe_to_postgresql
from jobsearch.utils import convert_dict_columns_to_json
from jobsearch.params import *

In [3]:
# Connect to SQLite and create a new database (or open it if it already exists)
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

try:
    # get canonical name for location of interest ==> FRANCE
    cursor.execute("""
        SELECT "Canonical Name"
        FROM google_geotargets
        WHERE "Target Type" = ? AND "Country Code" = ?;
    """, (GOOGLE_GEOTARGET_TARGET_TYPE, GOOGLE_GEOTARGET_COUNTRY_CODE))

    canonical_name = cursor.fetchall()[0][0]

finally:
    conn.close()

In [4]:
# convert canonical_name to uule code
uule_code = uule_grabber.uule(canonical_name)
uule_code

'w+CAIQICIGRnJhbmNl'

In [5]:
# dataframe with all scraped jobs during session
new_jobs = pd.DataFrame()
n_range = 50

In [6]:
for query in SERPAPI_SEARCH_QUERIES:

    for num  in range(n_range):

        start_page = num * 10

    # define parameters
        params = {
            'api_key': SERPAPI_KEY,
            'device':'desktop',
            'uule': uule_code,                         # encoded location
            'q': query,                          # search query
            'google_domain': 'google.fr',
            'hl': 'fr',                                 # language of the search
            'gl': 'fr',                                 # country of the search
            'engine': 'google_jobs',                    # SerpApi search engine
            'start': start_page,  # pagination
            #'as_qdr':"d"
            #"tbs":"cdr:1,cd_min:12-25-2023,cd_max:01-01-2024"
            'chips': f'date_posted:today'  #'date_range:2023-05-18'   #'date_posted:today'
        }

        # query serapi
        search = serpapi.search(params=params)
        # get results as dict
        res = search.as_dict()

        # check if last search page, exceptions handling
        try:
            if res['error'] == "Google hasn't returned any results for this query.":
                    break
        except KeyError:
                print(f"Getting SerpAPI data for page: {start_page} - {start_page+10} of '{query}' results")
        else:
                continue

        # discard search metadata, keep job results
        jobs = res['jobs_results']

        # convert to dataframe
        jobs_df = pd.DataFrame(jobs)
        # convert json columns to dataframe
        normalized_extensions = pd.json_normalize(jobs_df['detected_extensions'])

        ten_jobs_df = pd.concat([jobs_df, normalized_extensions],axis=1).drop('detected_extensions', axis=1)
        ten_jobs_df['date_time'] = datetime.datetime.now()
        ten_jobs_df['search_query'] = query

        # concat dataframe of 10 pulled results with new_jobs
        new_jobs = pd.concat([new_jobs, ten_jobs_df])

new_jobs = new_jobs.drop_duplicates(subset='description')

new_jobs = new_jobs.reindex(columns=['title', 'company_name', 'location', 'via', 'description',
    'job_highlights', 'related_links', 'thumbnail', 'extensions', 'job_id',
    'posted_at', 'schedule_type', 'date_time', 'search_query'])

new_jobs = new_jobs.reset_index(drop=True)

print("Scraping jobs finished ✅")

print(f"{new_jobs.shape[0]} jobs were scraped")


Getting SerpAPI data for page: 0 - 10 of 'machine learning engineer' results
Getting SerpAPI data for page: 10 - 20 of 'machine learning engineer' results
Getting SerpAPI data for page: 20 - 30 of 'machine learning engineer' results
Getting SerpAPI data for page: 30 - 40 of 'machine learning engineer' results
Getting SerpAPI data for page: 40 - 50 of 'machine learning engineer' results
Getting SerpAPI data for page: 50 - 60 of 'machine learning engineer' results
Getting SerpAPI data for page: 60 - 70 of 'machine learning engineer' results
Getting SerpAPI data for page: 70 - 80 of 'machine learning engineer' results
Getting SerpAPI data for page: 80 - 90 of 'machine learning engineer' results
Getting SerpAPI data for page: 90 - 100 of 'machine learning engineer' results
Getting SerpAPI data for page: 100 - 110 of 'machine learning engineer' results
Getting SerpAPI data for page: 110 - 120 of 'machine learning engineer' results
Getting SerpAPI data for page: 120 - 130 of 'machine learnin

KeyboardInterrupt: 

In [8]:

jobs = convert_dict_columns_to_json(new_jobs, ["related_links", "job_highlights"])

In [9]:
export_dataframe_to_postgresql(jobs, export_to_cloud=True)
export_dataframe_to_postgresql(jobs, export_to_cloud=False)

Using CLOUD postgres database ...
✅ Dataframe exported to database
Using LOCAL postgres database ...
✅ Dataframe exported to database


In [10]:
new_jobs

Unnamed: 0,title,company_name,location,via,description,job_highlights,related_links,thumbnail,extensions,job_id,posted_at,schedule_type,date_time,search_query
0,Machine Learning Engineer Intern (4-6 months),360Learning,Paris,via LinkedIn,"In 2021, 360Learning assembled its first team ...","[{""items"": [""In 2021, 360Learning assembled it...","[{""link"": ""https://360learning.com/"", ""text"": ...",https://encrypted-tbn0.gstatic.com/images?q=tb...,"[il y a 23 heures, Stage]",eyJqb2JfdGl0bGUiOiJNYWNoaW5lIExlYXJuaW5nIEVuZ2...,il y a 23 heures,Stage,2024-02-13 08:59:38.374182,machine learning engineer
1,Machine Learning Engineer,Pictarine,Labège,via LinkedIn,Mission and challenges 🎯\n\nL’un des enjeux de...,"[{""items"": [""Mission and challenges \ud83c\udf...","[{""link"": ""https://www.google.fr/search?sca_es...",https://encrypted-tbn0.gstatic.com/images?q=tb...,"[il y a 23 heures, À plein temps]",eyJqb2JfdGl0bGUiOiJNYWNoaW5lIExlYXJuaW5nIEVuZ2...,il y a 23 heures,À plein temps,2024-02-13 08:59:38.374182,machine learning engineer
2,Machine learning engineer | retail/ia,Urban Linker,Paris,via Urban Linker,Tes missions seront :\n• Assurer le bon foncti...,"[{""items"": [""Tes missions seront :\n\u2022 Ass...","[{""link"": ""https://www.google.fr/search?sca_es...",,"[50 k € à 60 k € par an, Prestataire]",eyJqb2JfdGl0bGUiOiJNYWNoaW5lIGxlYXJuaW5nIGVuZ2...,,Prestataire,2024-02-13 08:59:38.374182,machine learning engineer
3,Remote Machine Learning Engineer,Clarity AI,France,via Emplois Trabajo.org,Clarity AI is a global tech company founded in...,"[{""items"": [""Clarity AI is a global tech compa...","[{""link"": ""http://clarity.ai/"", ""text"": ""clari...",https://encrypted-tbn0.gstatic.com/images?q=tb...,"[il y a 2 jours, À plein temps]",eyJqb2JfdGl0bGUiOiJSZW1vdGUgTWFjaGluZSBMZWFybm...,il y a 2 jours,À plein temps,2024-02-13 08:59:38.374182,machine learning engineer
4,Data Scientist / Machine Learning Engineer,Ekinox,Paris,via Welcome To The Jungle,Le poste: Data Scientist/Machine Learning Engi...,"[{""items"": [""Le poste: Data Scientist/Machine ...","[{""link"": ""https://www.google.fr/search?sca_es...",https://encrypted-tbn0.gstatic.com/images?q=tb...,"[il y a 29 jours, À plein temps]",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCAvIE1hY2...,il y a 29 jours,À plein temps,2024-02-13 08:59:38.374182,machine learning engineer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,Data Scientist R&D H/F,Manpower,Tours,via HelloWork,Détail du poste\n\nLe cabinet de recrutement M...,"[{""items"": [""D\u00e9tail du poste\n\nLe cabine...","[{""link"": ""https://www.google.fr/search?sca_es...",https://encrypted-tbn0.gstatic.com/images?q=tb...,"[il y a 4 jours, À plein temps et Travail temp...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCBSXHUwMD...,il y a 4 jours,À plein temps et Travail temporaire,2024-02-13 09:03:54.250849,data scientist
6,Data Scientist Trainee H/F,FR03 VALEO VISION,France,via Workday,Valeo je technologická společnost vyvíjející p...,"[{""items"": [""Valeo je technologick\u00e1 spole...","[{""link"": ""https://www.google.fr/search?sca_es...",,"[il y a 4 jours, À plein temps et Travail temp...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCBUcmFpbm...,il y a 4 jours,À plein temps et Travail temporaire,2024-02-13 09:03:54.250849,data scientist
7,Lyon - Data Scientist Confirmé(e) - H/F,ALTEN,Lyon,via Smart Recruiters Jobs,Description de l'entreprise\n\nRejoindre LINCO...,"[{""items"": [""Description de l'entreprise\n\nRe...","[{""link"": ""https://www.alten.fr/"", ""text"": ""al...",,[À plein temps],eyJqb2JfdGl0bGUiOiJMeW9uIC0gRGF0YSBTY2llbnRpc3...,,À plein temps,2024-02-13 09:03:54.250849,data scientist
8,"Informatiker/in, Data Scientist",HUK-COBURG Versicherungsgruppe,Lyon,via BeBee,"Machine Learning, Künstliche Intelligenz, Data...","[{""items"": [""Machine Learning, K\u00fcnstliche...","[{""link"": ""http://www.huk.de/"", ""text"": ""huk.d...",,"[il y a 2 jours, À plein temps]",eyJqb2JfdGl0bGUiOiJJbmZvcm1hdGlrZXIvaW4sIERhdG...,il y a 2 jours,À plein temps,2024-02-13 09:03:54.250849,data scientist
