In [21]:
# Load IPython's autoreload extension and use mode 2, so imported modules are
# reloaded before each cell runs and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
## IMPORT NECESSARY LIBRARIES
# standard library
import csv
import datetime
import json
import os

# connect to SQLite database
import sqlite3

# third-party
import pandas as pd

# Scraping google jobs w/ serpapi
import serpapi

# generate UULE code from address
import uule_grabber

# necessary for path to files
#from config import DB_PATH

In [19]:
# --- Notebook configuration -------------------------------------------------
# The SerpApi key is read from the environment so the secret never lives in the
# notebook file. Export SERPAPI_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hardcoded on this line has been saved
# in the notebook and should be treated as leaked -- revoke/rotate it.
API_KEY = os.environ.get("SERPAPI_API_KEY", "")
if not API_KEY:
    print("WARNING: SERPAPI_API_KEY is not set; SerpApi requests will not be authenticated.")

# Job titles to search for on Google Jobs
SEARCH_QUERIES = ["machine learning engineer", "data scientist", "data analyst", "data engineer"]

# Filters used to pick the search location out of the google_geotargets table
COUNTRY_CODE = 'FR'
TARGET_TYPE = 'Country'

# SQLite database file, relative to the notebook's working directory
DB_PATH = "../db/jobs_database.db"

In [23]:
# Open the SQLite database at DB_PATH. sqlite3.connect() creates an empty
# database file when none exists at that path; in that case the query below
# fails because the google_geotargets table is missing.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

try:
    # Look up the canonical name of the location of interest (France for the
    # default TARGET_TYPE / COUNTRY_CODE configuration).
    cursor.execute("""
        SELECT "Canonical Name"
        FROM google_geotargets
        WHERE "Target Type" = ? AND "Country Code" = ?;
    """, (TARGET_TYPE, COUNTRY_CODE))

    # Only the first matching row is used, so fetch a single row instead of
    # materialising the whole result set.
    geotarget_row = cursor.fetchone()

finally:
    # Always release the connection, even when the query raises.
    conn.close()

# Fail with an explicit message instead of an opaque IndexError when the
# configured filters match nothing.
if geotarget_row is None:
    raise LookupError(
        f"No google_geotargets row with Target Type {TARGET_TYPE!r} "
        f"and Country Code {COUNTRY_CODE!r} in {DB_PATH!r}"
    )

canonical_name = geotarget_row[0]

In [25]:
# Convert the canonical location name into a UULE code; it is sent to SerpApi
# as the `uule` (encoded location) search parameter.
uule_code = uule_grabber.uule(canonical_name)
# Bare last expression so the notebook displays the generated code.
uule_code

'w+CAIQICIGRnJhbmNl'

In [42]:
# DataFrame holding all jobs scraped during this session; starts out empty.
all_jobs = pd.DataFrame()

In [66]:
# Value SerpApi puts in the response's `error` field once pagination has run
# past the last page of results for a query.
NO_RESULTS_ERROR = "Google hasn't returned any results for this query."
# The `start` offset advances by this many results per page.
RESULTS_PER_PAGE = 10
# Upper bound on the number of pages requested per search query.
MAX_PAGES_PER_QUERY = 2
# Columns (and their order) kept in the final all_jobs frame.
JOB_COLUMNS = ['title', 'company_name', 'location', 'via', 'description',
               'job_highlights', 'related_links', 'thumbnail', 'extensions', 'job_id',
               'posted_at', 'schedule_type', 'date_time', 'search_query']


def jobs_page_to_frame(jobs, query, scraped_at):
    """Turn one page of SerpApi job results into a flat DataFrame.

    Parameters
    ----------
    jobs : list of dict
        The `jobs_results` entries of one response.
    query : str
        Search query that produced the page; stored in `search_query`.
    scraped_at : datetime.datetime
        Timestamp stored in `date_time` for every row of the page.

    Returns
    -------
    pandas.DataFrame
        One row per job, with the keys of each job's `detected_extensions`
        dict expanded into columns and the original dict column removed.
    """
    page_df = pd.DataFrame(jobs)
    # A job without `detected_extensions` contributes an empty dict, so the
    # expanded frame keeps one row per job and stays aligned with page_df.
    extensions_df = pd.json_normalize([job.get('detected_extensions') or {} for job in jobs])
    page_df = pd.concat(
        [page_df.drop(columns='detected_extensions', errors='ignore'), extensions_df],
        axis=1,
    )
    page_df['date_time'] = scraped_at
    # Label the rows of this page only, so every row keeps the query it came from.
    page_df['search_query'] = query
    return page_df


# One DataFrame per fetched page; concatenated once after the loops.
page_frames = []

# NOTE(review): the [:1] slice restricts the run to the first search query --
# presumably to save API credits while developing; widen it to scrape them all.
for query in SEARCH_QUERIES[:1]:

    for page_number in range(MAX_PAGES_PER_QUERY):

        start_page = page_number * RESULTS_PER_PAGE

        # define parameters
        params = {
            'api_key': API_KEY,
            'device': 'desktop',
            'uule': uule_code,               # encoded location
            'q': query,                      # search query
            'google_domain': 'google.fr',
            'hl': 'fr',                      # language of the search
            'gl': 'fr',                      # country of the search
            'engine': 'google_jobs',         # SerpApi search engine
            'start': start_page,             # pagination
            'chips': 'date_posted:today'     # only jobs posted today
        }

        # query SerpApi and get the results as a dict
        res = serpapi.search(params=params).as_dict()

        error = res.get('error')
        if error == NO_RESULTS_ERROR:
            # ran past the last page of results: move on to the next query
            break
        if error is not None:
            # any other API error: report it instead of skipping silently
            print(f"Skipping page: {start_page} - {start_page + RESULTS_PER_PAGE} of '{query}' results ({error})")
            continue

        print(f"Getting SerpAPI data for page: {start_page} - {start_page + RESULTS_PER_PAGE} of '{query}' results")

        # discard search metadata, keep job results
        jobs = res.get('jobs_results') or []
        if not jobs:
            continue

        page_frames.append(jobs_page_to_frame(jobs, query, datetime.datetime.now()))

# Build all_jobs from scratch on every run, so re-running the cell is idempotent
# and results from every query are kept.
if page_frames:
    all_jobs = pd.concat(page_frames, ignore_index=True)
else:
    all_jobs = pd.DataFrame(columns=JOB_COLUMNS)

# Reindex first so `description` is guaranteed to exist, then drop postings
# with an identical description (the first occurrence is kept).
all_jobs = (
    all_jobs
    .reindex(columns=JOB_COLUMNS)
    .drop_duplicates(subset='description')
    .reset_index(drop=True)
)


Getting SerpAPI data for page: 0 - 10 of 'machine learning engineer' results
Getting SerpAPI data for page: 10 - 20 of 'machine learning engineer' results


In [67]:
#### EXPORT TO SQLITE DATABASE ####
# SQLite cannot store Python lists, so list values are replaced by their str()
# representation; every other value is left as it is.
for column in all_jobs.columns:
    all_jobs[column] = all_jobs[column].apply(lambda x: str(x) if isinstance(x, list) else x)

# Export data to the database.
# `with sqlite3.connect(...)` alone only commits or rolls back the transaction
# and leaves the connection open, so the connection is closed explicitly here.
# NOTE: if_exists='append' means re-running this cell inserts the same rows again.
conn = sqlite3.connect(DB_PATH)
try:
    with conn:  # commit on success, roll back if to_sql raises
        all_jobs.to_sql('unprocessed_data', conn, if_exists='append', index=False)
finally:
    conn.close()

In [68]:
# Show the scraped jobs frame as the cell output for a visual check.
all_jobs

Unnamed: 0,title,company_name,location,via,description,job_highlights,related_links,thumbnail,extensions,job_id,posted_at,schedule_type,date_time,search_query
0,Data Science / Machine Learning Engineer Intern,Accor,Boulogne-Billancourt,via Stage.fr,Description de l''entreprise\n\nBienvenue chez...,"[{'items': [""Description de l''entreprise\n\nB...","[{'link': 'http://group.accor.com/', 'text': '...",https://encrypted-tbn0.gstatic.com/images?q=tb...,"['il y a 23 heures', 'À plein temps et Stage']",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVuY2UgLyBNYWNoaW...,il y a 23 heures,À plein temps et Stage,2023-12-31 12:57:17.605627,machine learning engineer
1,Junior Machine Learning Engineer Paris,Jaylo,Paris,via Emplois Trabajo.org,"At Jaylo, we're on the lookout for an enthusia...","[{'items': [""At Jaylo, we're on the lookout fo...",[{'link': 'https://www.google.fr/search?sca_es...,https://encrypted-tbn0.gstatic.com/images?q=tb...,"['il y a 13 heures', 'À plein temps']",eyJqb2JfdGl0bGUiOiJKdW5pb3IgTWFjaGluZSBMZWFybm...,il y a 13 heures,À plein temps,2023-12-31 12:57:17.605627,machine learning engineer
2,AI/ML Engineer,AMD,Paris,via BeBee,The Role\n\nAMD is looking for a senior softwa...,"[{'items': [""The Role\n\nAMD is looking for a ...","[{'link': 'http://www.amd.com/', 'text': 'amd....",https://encrypted-tbn0.gstatic.com/images?q=tb...,"['il y a 9 heures', 'À plein temps']",eyJqb2JfdGl0bGUiOiJBSS9NTCBFbmdpbmVlciIsImh0aW...,il y a 9 heures,À plein temps,2023-12-31 12:57:17.605627,machine learning engineer
3,Ingénieur Machine Learning OPS,Externis Resourcing,Neuilly-sur-Seine,via LinkedIn,Un Éditeur Software dans l’univers BigData/Pré...,[{'items': ['Un Éditeur Software dans l’univer...,"[{'link': 'http://externisresourcing.com/', 't...",https://encrypted-tbn0.gstatic.com/images?q=tb...,"['il y a 16 heures', 'À plein temps']",eyJqb2JfdGl0bGUiOiJJbmfDqW5pZXVyIE1hY2hpbmUgTG...,il y a 16 heures,À plein temps,2023-12-31 12:57:17.605627,machine learning engineer
4,Ingénieur en apprentissage,Novencia,Lyon,via BeBee,Novencia accompagne ses clients dans leurs pro...,"[{'items': [""Novencia accompagne ses clients d...",[{'link': 'https://www.google.fr/search?sca_es...,https://encrypted-tbn0.gstatic.com/images?q=tb...,"['il y a 9 heures', 'À plein temps']",eyJqb2JfdGl0bGUiOiJJbmfDqW5pZXVyIGVuIGFwcHJlbn...,il y a 9 heures,À plein temps,2023-12-31 12:57:17.605627,machine learning engineer
5,Data Engineer (It) / Freelance,Cbc Consult,Paris,via Sercanto,CBC Consult recrute pour le compte de son clie...,[{'items': ['CBC Consult recrute pour le compt...,[{'link': 'https://www.google.fr/search?sca_es...,https://encrypted-tbn0.gstatic.com/images?q=tb...,"['il y a 12 heures', 'Travail temporaire']",eyJqb2JfdGl0bGUiOiJEYXRhIEVuZ2luZWVyIChJdCkgLy...,il y a 12 heures,Travail temporaire,2023-12-31 12:57:17.605627,machine learning engineer
6,Healthanea - Data Engineer,AXA Group,Paris (+ 1 autre),via Careers At AXA,"Within GETD, we have launched in 2020 a strate...","[{'items': ['Within GETD, we have launched in ...","[{'link': 'http://www.axa.com/', 'text': 'axa....",https://encrypted-tbn0.gstatic.com/images?q=tb...,"['il y a 12 heures', 'À plein temps']",eyJqb2JfdGl0bGUiOiJIZWFsdGhhbmVhIC0gRGF0YSBFbm...,il y a 12 heures,À plein temps,2023-12-31 12:57:17.605627,machine learning engineer
7,Consultant Data Engineer,Magellan Partners,Grenoble,via BeBee,Ça vous dirait un aperçu du futur de votre car...,"[{'items': [""Ça vous dirait un aperçu du futur...","[{'link': 'http://www.magellan-partners.eu/', ...",https://encrypted-tbn0.gstatic.com/images?q=tb...,"['il y a 9 heures', 'À plein temps']",eyJqb2JfdGl0bGUiOiJDb25zdWx0YW50IERhdGEgRW5naW...,il y a 9 heures,À plein temps,2023-12-31 12:57:17.605627,machine learning engineer
8,Ingénieur Machine Learning et Intelligence Art...,Urban Linker,Paris,via BeBee,Ton objectif : construire la meilleure solutio...,"[{'items': [""Ton objectif : construire la meil...","[{'link': 'http://www.urbanlinker.com/', 'text...",,"['il y a 9 heures', 'À plein temps']",eyJqb2JfdGl0bGUiOiJJbmfDqW5pZXVyIE1hY2hpbmUgTG...,il y a 9 heures,À plein temps,2023-12-31 12:57:17.605627,machine learning engineer
9,Data Engineer,emagine Consulting,Lille,via BeBee,emagine recherche un(e) Data Engineer à Lille ...,"[{'items': [""emagine recherche un(e) Data Engi...",[{'link': 'https://www.google.fr/search?sca_es...,,"['il y a 9 heures', 'À plein temps']",eyJqb2JfdGl0bGUiOiJEYXRhIEVuZ2luZWVyIiwiaHRpZG...,il y a 9 heures,À plein temps,2023-12-31 12:57:17.605627,machine learning engineer
