In [1]:
# importing the requisite libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime

# list to append the extracted data from the website

job_list = []

def get_webpage_data(job_title, location, page):
    
    """ The below function iterates through the webpage and extracts the
    - Job job title based on input
    - Post data
    - Company Name
    - Job location
    - Job summary
    - Job url link
    - Company ratings and 
    - Salary"""

    # generating a header to avoid being locked out of site
    # calling url and formatting so as to call any job or location
    # creating a soup to better read html
    
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'}
    url = f'https://fr.indeed.com/jobs?q={job_title}&l={location}&start={page}'
    response = requests.get(url, headers, timeout=5)
    soup = BeautifulSoup((response.content), 'lxml')
    
    
    # This is the highest level tag available ot all the parameters of interest and allows to return None 
    # when missing a value for a parameter
    jobs = soup.select('div.jobsearch-SerpJobCard')
    
    for job in jobs:


        job_title = job.find('h2').text.strip('\nnouveau')
        post_date = [i.text.replace('\xa0', ' ') for i in job.select('span.date')]
        company_name = [i.text.strip() for i in job.select('span.company')]
        job_location = job.find('div', 'recJobLoc')['data-rc-loc']
        job_summary = [i.text.strip() for i in job.select('div.summary')]
        job_url = ['https://fr.indeed.com' + i['href'] for i in soup.select('h2>a[href]')]
        
        company_rates = []
        salary = []

        r = job.find('span', 'ratingsContent')
        if r:
            company_rates.append(float(r.text.strip().replace(',', '.')))
        else:
            company_rates.append(None)

        sal = job.find('span', 'salaryText')
        if sal:
            salary.append(sal.text.strip('\n').replace('\xa0', ''))
        else:
            salary.append(None)
        
        
        # generating a dictionary for parameters and data pulled from site
        job_listings = {'JobTitle':job_title, 'NumDaysPosted':post_date, 'CompanyName':company_name, \
                        'JobLocation':job_location,'JobSummary':job_summary,  'JobUrl':job_url, \
                        'CompanyRating':company_rates, 'JobSalary':salary}
        
        # appending job_listings dictionary to list for dataframe creation
        job_list.append(job_listings)  

    return

In [2]:
# Iterating through the various webpages to pull 200 job listings
for i in range(0, 170, 10):
    get_webpage_data('data', 'Île-de-France', i)
    
#generating a dataframe from pulled data
df = pd.DataFrame(job_list)
type(df)

pandas.core.frame.DataFrame

---

In [3]:
#cleaning dataframe as needed
df.shape

(255, 8)

In [4]:
df.head(2)

Unnamed: 0,JobTitle,NumDaysPosted,CompanyName,JobLocation,JobSummary,JobUrl,CompanyRating,JobSalary
0,Data Engineer H/F (CDI),[il y a 24 jours],[Leaseplan France],Rueil-Malmaison (92),[Data engineer h/f (cdi) ou similaire: 2 ans (...,[https://fr.indeed.com/pagead/clk?mo=r&ad=-6NY...,[3.9],[38000 € - 45000 € par an]
1,Manager Data Analyst,[il y a 2 jours],[CAPENCY],Enghien-les-Bains (95),"[Créée en 2007, CAPENCY (anciennement CAP ADRE...",[https://fr.indeed.com/pagead/clk?mo=r&ad=-6NY...,[None],[40000 € - 45000 € par an]


In [5]:
# converting list to string
df = df.applymap(lambda x:x if not isinstance(x, list) else x[0] if len(x) else '')

In [6]:
df.head(2)

Unnamed: 0,JobTitle,NumDaysPosted,CompanyName,JobLocation,JobSummary,JobUrl,CompanyRating,JobSalary
0,Data Engineer H/F (CDI),il y a 24 jours,Leaseplan France,Rueil-Malmaison (92),Data engineer h/f (cdi) ou similaire: 2 ans (S...,https://fr.indeed.com/pagead/clk?mo=r&ad=-6NYl...,3.9,38000 € - 45000 € par an
1,Manager Data Analyst,il y a 2 jours,CAPENCY,Enghien-les-Bains (95),"Créée en 2007, CAPENCY (anciennement CAP ADRES...",https://fr.indeed.com/pagead/clk?mo=r&ad=-6NYl...,,40000 € - 45000 € par an


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   JobTitle       255 non-null    object 
 1   NumDaysPosted  255 non-null    object 
 2   CompanyName    255 non-null    object 
 3   JobLocation    255 non-null    object 
 4   JobSummary     255 non-null    object 
 5   JobUrl         255 non-null    object 
 6   CompanyRating  154 non-null    float64
 7   JobSalary      47 non-null     object 
dtypes: float64(1), object(7)
memory usage: 16.1+ KB


In [8]:
# Stripping and spliting columns as needed for better analysis

df['NumDaysPosted'] = df.NumDaysPosted.str.strip('il y a jours').replace({'Il y a plus de 30':30, "Aujourd'hui":1, "Aujourd'h":1, "Publiée à l'instant":1})
df['NumDaysPosted'] = df.NumDaysPosted.apply(pd.to_numeric)
df[['LowSalaryRange','UpperSalaryRange']] = df['JobSalary'].str.split('-',expand=True)
df[['UpperSalaryRange','SalaryPeriod(per)']] = df['UpperSalaryRange'].str.split('par',expand=True)
df[['LowSalaryRange','SalaryPeriod(per)']] = df['LowSalaryRange'].str.split('par',expand=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   JobTitle           255 non-null    object 
 1   NumDaysPosted      255 non-null    int64  
 2   CompanyName        255 non-null    object 
 3   JobLocation        255 non-null    object 
 4   JobSummary         255 non-null    object 
 5   JobUrl             255 non-null    object 
 6   CompanyRating      154 non-null    float64
 7   JobSalary          47 non-null     object 
 8   LowSalaryRange     47 non-null     object 
 9   UpperSalaryRange   39 non-null     object 
 10  SalaryPeriod(per)  8 non-null      object 
dtypes: float64(1), int64(1), object(9)
memory usage: 22.0+ KB


In [10]:
# Drop columns not needed

df = df.drop(columns='JobSalary')

In [11]:
# Adding today's date for reference
df['TodayDate'] = pd.to_datetime('today').strftime('%d-%m-%Y')
df['TimeFrame'] = 'jours' 

# reorganizing data columns
df = df[['JobTitle', 'NumDaysPosted', 'TimeFrame', 'TodayDate', 'CompanyName', 'JobLocation', \
         'LowSalaryRange','UpperSalaryRange','SalaryPeriod(per)', 'CompanyRating','JobSummary', 'JobUrl']]

In [12]:
# sorting the dataset by number of days of posted and dropping the duplicates keeping the most recent publication

df = df.sort_values('NumDaysPosted', ascending=False)
df = df.drop_duplicates(subset=['JobTitle', 'CompanyName', 'JobSummary','JobLocation'], keep='first')
df.shape

(235, 12)

---

In [13]:
# Exporting Data to csv

In [14]:
df.to_csv('/Users/laetitia/IronHack/data-ft-par-labs/Projects/Week-3/output/indeed_jobs.csv', sep='\t', index=False)

---

In [15]:
# Exporting Data to sql
import pymysql
from sqlalchemy import create_engine
from getpass import getpass

In [16]:
username = 'root'
server = 'localhost'
database = 'indeed'
password = getpass()

········


In [17]:
engine=create_engine(f'mysql+pymysql://{username}:{password}@{server}/{database}')

In [18]:
df.to_sql('data_jobs', engine, if_exists='replace', index=False)