# Business Goal

Scraping LinkedIn for specific job postings (e.g., the position of "Data Scientist") enables businesses to gather real-time data on available job positions, providing insights into industry trends, required skills, and competitor hiring patterns.

This information aids in talent acquisition strategies, market analysis, and optimizing recruitment efforts by identifying job demand, skill preferences, and potential talent pools within the industry.


# Save offline HTML content


In [28]:
# Importing necessary modules
from selenium import webdriver  # Module for browser automation
from selenium.webdriver.common.by import By  # Module for locating elements by different strategies
from selenium.webdriver.common.keys import Keys  # Module for keyboard keys simulation
from bs4 import BeautifulSoup  # Module for HTML parsing
import math # Module for mathematical functions (lambda x)
import time # Module for time-related functions (sleep time for web scrapping to prevent from account blockage)
import pandas as pd

In [29]:
# Start a Chrome browser session controlled by Selenium
# (the trailing comment keeps the explicit chromedriver path that was used previously)
driver = webdriver.Chrome()#executable_path=r'C:\Users\amosw\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe')

# Open the LinkedIn login page
url='https://www.linkedin.com/login/it?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin'
driver.get(url)

# Type the credentials into the form fields named 'session_key' and 'session_password'.
# NOTE(review): both values are placeholders hard-coded in the notebook; replace them
# before running and avoid committing real credentials.
driver.find_element(By.NAME, value='session_key').send_keys('email@email.com') # replace with your LinkedIn e-mail
driver.find_element(By.NAME, value='session_password').send_keys('Password' + Keys.ENTER) # replace with your password; ENTER is sent right after it (presumably submitting the form)

# Open the job search for "Data Scientist" in "Barcellona"
# NOTE(review): nothing waits for the login to finish before this navigation -- confirm it is reliable
job_url='https://www.linkedin.com/jobs/search/?keywords=Data%20Scientist&location=Barcellona&origin=JOBS_HOME_LOCATION_SUGGESTION&refresh=false'
driver.get(job_url)

In [30]:
import re

# Define a function to extract the number of search results from parsed HTML content
def number_results(soup):
    """Return the number of search results shown on a job-search page.

    Looks for a <small> element whose class attribute equals the result-counter
    class string and returns the first run of digits found in one of its
    <span> children (commas are removed first, so '1,613' yields 1613).

    Args:
        soup: Parsed HTML (BeautifulSoup) of a job-search page.

    Returns:
        The result count as an int, or None when no matching element
        containing digits is found.
    """

    # Class name used to identify elements containing job search result count
    class_name_search = 'display-flex t-normal t-12 t-black--light jobs-search-results-list__text'

    for object_html in soup.find_all('small'):

        # Elements without a class attribute can never match
        class_attr = object_html.get('class')
        if not class_attr or ' '.join(class_attr) != class_name_search:
            continue

        for span in object_html.find_all('span'):

            # Drop thousands separators, then take the first run of digits
            result = re.search(r'\d+', span.text.replace(',', ''))

            # A span without digits is skipped instead of abandoning the element
            if result:
                return int(result.group())

    return None

In [31]:
# Parse the HTML content of the current page using BeautifulSoup
soup=BeautifulSoup(driver.page_source,'html.parser')

# Call the 'number_results' function to extract and then print the total number of search results
# (the function yields None when no matching counter element is found on the page)
total_results=number_results(soup)
print(total_results)

# Limit the search to 50 results to keep the run short;
# delete or comment out the next line to page through all results
total_results=50 # delete or comment this line for complete results

1613


In [32]:
# Define a function to save parsed HTML content to a file
def save_html(soup, page):
    """Write one parsed search-results page to the 'html_jobs' folder.

    The file is named 'html_job_listing_page<page>.html' and is encoded as
    UTF-8. The folder is created when it does not exist yet, so the first
    run no longer fails with FileNotFoundError.

    Args:
        soup: Parsed HTML exposing 'prettify()' (a BeautifulSoup document).
        page: Page number used in the file name.

    Returns:
        None.
    """
    import os  # Local import: 'os' is only imported further down in the notebook

    # Make sure the target folder exists before opening the file
    os.makedirs('html_jobs', exist_ok=True)

    # Write the prettified HTML content to the file
    with open(f'html_jobs/html_job_listing_page{page}.html', 'w', encoding='utf-8') as f:
        f.write(soup.prettify())

    # Print a message indicating that HTML content has been saved
    print('HTML saved')

In [33]:
from selenium.webdriver.support.ui import WebDriverWait # Module for explicit waits
from selenium.webdriver.support import expected_conditions as EC # Module for expected conditions

# Loop through job search results by increments of 25 job postings per page
from selenium.common.exceptions import TimeoutException  # Raised by WebDriverWait.until when the timeout expires

# A single explicit-wait helper (10 second timeout) is reused for every page
wait = WebDriverWait(driver, 10)

for i in range(0, total_results, 25):
    job_url = f'https://www.linkedin.com/jobs/search/?keywords=Data%20Scientist&location=Barcellona&origin=JOBS_HOME_LOCATION_SUGGESTION&refresh=false&start={i}'

    # Navigate to the constructed job search URL
    driver.get(job_url)

    # Wait for the job cards to be present using CSS selector;
    # a page without cards is skipped instead of aborting the whole crawl
    try:
        elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'li.ember-view.jobs-search-results__job-card-search--generic-occludable-area')))
    except TimeoutException:
        print(f'No job cards found for start={i}; page skipped')
        continue

    # Print the job card elements found
    print(elements)

    # Scroll every job card into view, one after another
    for element in elements:
        driver.execute_script("arguments[0].scrollIntoView();", element)

    time.sleep(5)  # This sleep might still be needed, depending on your use case

    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Save the page; pages are numbered from 1 (start=0 -> 1, start=25 -> 2, ...)
    save_html(soup, i // 25 + 1)

[<selenium.webdriver.remote.webelement.WebElement (session="f990f8db6b059c96e372d99eb50c53e9", element="6A99F64140469F932ACBBAA1CCABD8C4_element_153")>, <selenium.webdriver.remote.webelement.WebElement (session="f990f8db6b059c96e372d99eb50c53e9", element="6A99F64140469F932ACBBAA1CCABD8C4_element_154")>, <selenium.webdriver.remote.webelement.WebElement (session="f990f8db6b059c96e372d99eb50c53e9", element="6A99F64140469F932ACBBAA1CCABD8C4_element_155")>, <selenium.webdriver.remote.webelement.WebElement (session="f990f8db6b059c96e372d99eb50c53e9", element="6A99F64140469F932ACBBAA1CCABD8C4_element_156")>, <selenium.webdriver.remote.webelement.WebElement (session="f990f8db6b059c96e372d99eb50c53e9", element="6A99F64140469F932ACBBAA1CCABD8C4_element_157")>, <selenium.webdriver.remote.webelement.WebElement (session="f990f8db6b059c96e372d99eb50c53e9", element="6A99F64140469F932ACBBAA1CCABD8C4_element_158")>, <selenium.webdriver.remote.webelement.WebElement (session="f990f8db6b059c96e372d99eb50c

In [34]:
# Define a function to extract job IDs from parsed HTML content
def job_ids(soup):
    """Collect the job IDs of the job cards found on a search-results page.

    A <li> element counts as a job card when its class attribute equals one of
    the two class strings below. Its ID is read from the
    'data-occludable-job-id' attribute.

    Args:
        soup: Parsed HTML (BeautifulSoup) of a search-results page.

    Returns:
        A list of int job IDs in document order. Items without a class
        attribute, without the ID attribute, or with a non-numeric ID are
        skipped.
    """

    # Class names used to identify job listing elements
    class_name_search = 'ember-view jobs-search-results__list-item occludable-update p0 relative scaffold-layout__list-item'
    class_name_search_hidden = 'ember-view jobs-search-results__job-card-search--generic-occludable-area jobs-search-results__list-item occludable-update p0 relative scaffold-layout__list-item'
    wanted_classes = {class_name_search, class_name_search_hidden}

    found_ids = []

    for object_html in soup.find_all('li'):

        # Ignore list items that are not job cards
        class_attr = object_html.get('class')
        if not class_attr or ' '.join(class_attr) not in wanted_classes:
            continue

        # A missing attribute gives None (TypeError); a non-numeric one gives ValueError
        try:
            found_ids.append(int(object_html.get('data-occludable-job-id')))
        except (TypeError, ValueError):
            continue

    return found_ids

In [35]:
# Import necessary modules
from bs4 import BeautifulSoup
import os

# Define the folder path containing HTML files
folder_path = 'html_jobs'

# List only the saved '.html' pages (other entries in the folder are ignored),
# sorted so that the files are always processed in the same order
file_list = sorted(name for name in os.listdir(folder_path) if name.endswith('.html'))

# Initialize an empty list to store job IDs
job_id_list = []

# Iterate through each file in the folder
for file_name in file_list:

    # Open each HTML file in read mode and parse it with BeautifulSoup
    with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')

    # Add the job IDs found on this page (extend avoids rebuilding the list each time)
    job_id_list.extend(job_ids(soup))


In [36]:
# Show how many distinct job IDs were collected (a set keeps each ID once)
print(len(set(job_id_list)))

50


In [37]:
# Function to save individual parsed HTML content to a file
def save_single_html(soup, job_id):
    """Write one parsed job page to the 'html_single_job' folder.

    The file is named 'html_job_<job_id>.html' and is encoded as UTF-8. The
    folder is created when it does not exist yet, so the first run no longer
    fails with FileNotFoundError.

    Args:
        soup: Parsed HTML exposing 'prettify()' (a BeautifulSoup document).
        job_id: Job ID used in the file name.

    Returns:
        None.
    """
    import os  # Local import keeps this cell independent of the other cells' imports

    # Make sure the target folder exists before opening the file
    os.makedirs('html_single_job', exist_ok=True)

    # Write the prettified HTML content to the file
    with open(f'html_single_job/html_job_{job_id}.html', 'w', encoding='utf-8') as f:
        f.write(soup.prettify())

    # Print a message indicating that HTML content has been saved
    print('HTML saved')

In [38]:
# Cap the number of job pages to download, for time and security reasons
x = 25

# Deduplicate through a set, then keep at most x IDs
# (a set has no meaningful order, so which IDs survive the cut is arbitrary)
job_id_list = [*set(job_id_list)][:x]

In [39]:
# Import necessary modules and functions from Selenium
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

 # Initialize an empty list to store job IDs to drop
job_id_to_drop = []

# A single explicit-wait helper (10 second timeout) is reused for every job page
wait = WebDriverWait(driver, 10)

# Iterate through each job ID in the list
for job_id in job_id_list:

    # Construct the job URL using the job ID
    job_url = f'https://www.linkedin.com/jobs/view/{job_id}'

    try:
        # Navigate to the constructed job URL
        driver.get(job_url)

        # Wait until the elements matched by the CSS selector are present
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.ember-view.link-without-visited-state.inline-block.t-black')))

        # Additional sleep (can be adjusted) to ensure page content loading
        time.sleep(5)

        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Save the parsed HTML content using the 'save_single_html' function with the current job ID
        save_single_html(soup, job_id)

    # Record the job ID when its page could not be saved. 'Exception' (rather than a
    # bare except) lets Ctrl-C / KeyboardInterrupt still stop the crawl.
    except Exception as error:
        print(f'Job {job_id} skipped: {error!r}')
        job_id_to_drop.append(job_id)

HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved
HTML saved


In [40]:
# Calculate the count of job IDs that encountered errors or exceptions and are therefore in the list job_id_to_drop
len(job_id_to_drop)

0

# Scraping offline


In [41]:
# Function to scrape job listing information from parsed HTML content
def scraping_job_listing(soup):
    """Extract one row per job card from a parsed search-results page.

    Only <li> elements carrying the visible job-card class string are read.

    Args:
        soup: Parsed HTML (BeautifulSoup) of a search-results page.

    Returns:
        A list of rows
        [job_id, job_title, company, location, workspace, state, offer_url, promoted].
        A text field whose element is missing from the card is None instead of
        raising; cards without a numeric job ID are skipped.
    """

    def text_of(card, tag, class_name):
        """Return the stripped text of the first matching element in card, or None."""
        node = card.find(tag, class_=class_name)
        return node.text.strip() if node is not None else None

    card_class = 'ember-view jobs-search-results__list-item occludable-update p0 relative scaffold-layout__list-item'

    # Insight texts with a dedicated label; anything else is reported as 'Others'
    state_labels = {
        'Actively recruiting': 'On-going',
        'Early applicant': 'Early Applications',
    }

    job_data = []

    for object_html in soup.find_all('li', class_=card_class):

        # Extract job ID; a card without a usable ID cannot be linked to its detail page
        try:
            job_id = int(object_html.get('data-occludable-job-id'))
        except (TypeError, ValueError):
            continue

        # The job counts as promoted when the card contains a footer item
        promoted = object_html.find('li', class_='job-card-container__footer-item inline-flex align-items-center') is not None

        # Extract job title, company name and location with surrounding whitespace removed
        job_title = text_of(object_html, 'a', 'disabled ember-view job-card-container__link job-card-list__title')
        company = text_of(object_html, 'span', 'job-card-container__primary-description')
        location = text_of(object_html, 'li', 'job-card-container__metadata-item')

        # A trailing '(...)' holds the workspace, e.g. 'Barcelona (Hybrid)'. Splitting on the
        # LAST '(' keeps locations that contain an earlier parenthesis intact.
        workspace = 'Other'
        if location and location.endswith(')') and '(' in location:
            location, _, workspace = location[:-1].rpartition('(')
            location = location.rstrip()

        # Determine job state (e.g., actively recruiting, early applicant, etc.)
        state = state_labels.get(text_of(object_html, 'div', 'job-card-container__job-insight-text'), 'Others')

        # Construct the URL for the job offer
        offer_url = 'https://www.linkedin.com/jobs/view/' + str(job_id)

        # Append job data to the list
        job_data.append([job_id, job_title, company, location, workspace, state, offer_url, promoted])

    return job_data





In [42]:
from bs4 import BeautifulSoup
import os

# Define the folder path containing HTML files
folder_path = 'html_jobs'

# List only the saved '.html' pages (other entries in the folder are ignored),
# sorted so that the files are always processed in the same order
file_list = sorted(name for name in os.listdir(folder_path) if name.endswith('.html'))

# Initialize an empty list to store job data
job_data = []

# Iterate through each file in the folder
for file_name in file_list:

    # Open each HTML file in read mode and parse it with BeautifulSoup
    with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')

    # Add the rows scraped from this page (extend avoids rebuilding the list each time)
    job_data.extend(scraping_job_listing(soup))

In [43]:
# Turn the scraped rows into a DataFrame; the labels follow the field order of each row
df = pd.DataFrame(
    job_data,
    columns=[
        'Job ID',
        'Job Title',
        'Company Name',
        'Location',
        'Workspace',
        'State',
        'Offer URL',
        'Promoted',
    ],
)

# Preview the first rows
df.head()

Unnamed: 0,Job ID,Job Title,Company Name,Location,Workspace,State,Offer URL,Promoted
0,3690425439,Ruby on Rails Developer,Stream Hatchet,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3690425439,True
1,3736663947,"Lead Data Scientist (Bangkok based, relocation...",Agoda,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3736663947,True
2,3772484774,"Infrastructure & Cloud Operations Engineer, (H...",Joppy,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3772484774,True
3,3742882559,Senior Data Scientist - Marketplace (They/She/He),Glovo,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3742882559,True
4,3770337197,Risk Decision Scientist (They/She/He),Glovo,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3770337197,False


In [44]:
# Display information about the DataFrame's columns and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Job ID        50 non-null     int64 
 1   Job Title     50 non-null     object
 2   Company Name  50 non-null     object
 3   Location      50 non-null     object
 4   Workspace     50 non-null     object
 5   State         50 non-null     object
 6   Offer URL     50 non-null     object
 7   Promoted      50 non-null     bool  
dtypes: bool(1), int64(1), object(6)
memory usage: 2.9+ KB


In [45]:
# Remove duplicate rows from the DataFrame
# (no 'subset' is passed, so two rows are duplicates only when every column matches)
df=df.drop_duplicates()

# Display information about the DataFrame's columns and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Job ID        50 non-null     int64 
 1   Job Title     50 non-null     object
 2   Company Name  50 non-null     object
 3   Location      50 non-null     object
 4   Workspace     50 non-null     object
 5   State         50 non-null     object
 6   Offer URL     50 non-null     object
 7   Promoted      50 non-null     bool  
dtypes: bool(1), int64(1), object(6)
memory usage: 2.9+ KB


In [46]:
# Keep only the rows whose 'Job ID' is NOT listed in 'job_id_to_drop'
df = df.loc[~df['Job ID'].isin(job_id_to_drop)]

# Summarise the remaining rows, columns and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Job ID        50 non-null     int64 
 1   Job Title     50 non-null     object
 2   Company Name  50 non-null     object
 3   Location      50 non-null     object
 4   Workspace     50 non-null     object
 5   State         50 non-null     object
 6   Offer URL     50 non-null     object
 7   Promoted      50 non-null     bool  
dtypes: bool(1), int64(1), object(6)
memory usage: 2.9+ KB


In [47]:
# Keep only the rows whose 'Job ID' appears in 'job_id_list'
df = df.loc[df['Job ID'].isin(job_id_list)]

# Summarise the remaining rows, columns and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25 entries, 1 to 49
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Job ID        25 non-null     int64 
 1   Job Title     25 non-null     object
 2   Company Name  25 non-null     object
 3   Location      25 non-null     object
 4   Workspace     25 non-null     object
 5   State         25 non-null     object
 6   Offer URL     25 non-null     object
 7   Promoted      25 non-null     bool  
dtypes: bool(1), int64(1), object(6)
memory usage: 1.6+ KB


In [48]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,Job ID,Job Title,Company Name,Location,Workspace,State,Offer URL,Promoted
1,3736663947,"Lead Data Scientist (Bangkok based, relocation...",Agoda,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3736663947,True
2,3772484774,"Infrastructure & Cloud Operations Engineer, (H...",Joppy,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3772484774,True
4,3770337197,Risk Decision Scientist (They/She/He),Glovo,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3770337197,False
8,3761871874,Senior Data Scientist,Capgemini Engineering,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3761871874,True
10,3757333127,"Data Scientist (Mid-level, Senior) | Candy Cru...",King,"Barcelona, Catalonia, Spain",Other,Others,https://www.linkedin.com/jobs/view/3757333127,False


In [49]:
# Define a function to extract job details based on job ID
def scraping_job(job_id):
    """Extract the detail fields of one job from its saved HTML page.

    Reads 'html_single_job/html_job_<job_id>.html', parses it with
    BeautifulSoup and prints the extracted values as it goes.

    Args:
        job_id: Job ID used to build the file name.

    Returns:
        [job_id, python_required, seniority, posting_date, number_applicants,
        employment_type, industry, employees, linkedin, followers].
        A field whose element is missing from the page is None instead of
        raising ('linkedin' is always a bool). Every field except the ID is
        None when the file cannot be read.
    """

    def leading_int(node, drop_commas=False):
        """Parse the first space-separated token of node's text as an int, or return None."""
        if node is None:
            return None
        token = node.text.strip().split(' ')[0]
        if drop_commas:
            token = token.replace(',', '')
        try:
            return int(token)
        except ValueError:
            return None

    insight_span_class = 'job-details-jobs-unified-top-card__job-insight-view-model-secondary'
    insight_item_class = 'job-details-jobs-unified-top-card__job-insight'
    neutral_class = 'tvm__text tvm__text--neutral'

    # Print the job ID for reference
    print(job_id)

    # Path to folder containing individual job HTML files
    folder_path = 'html_single_job/'

    # Construct the filename for the specific job HTML file
    job_single = 'html_job_' + str(job_id) + '.html'

    # Open the job HTML file and parse its content using BeautifulSoup;
    # an unreadable file yields a row with None for all details
    try:
        with open(folder_path + job_single, 'r', encoding='utf-8') as html_file:
            soup_job = BeautifulSoup(html_file, 'html.parser')
    except (OSError, UnicodeDecodeError):
        return [job_id, None, None, None, None, None, None, None, None, None]

    # 'Python Required': True when one of the comma-separated skills is exactly 'Python'
    skills_link = soup_job.find('a', class_='app-aware-link job-details-how-you-match__skills-item-subtitle t-14 overflow-hidden')
    if skills_link is None:
        python_required = None
    else:
        skills = skills_link.text.replace(' ', '').replace('and', '').strip().split(',')
        python_required = 'Python' in skills

    # 'Seniority' is the text of the second insight span, when there is one
    insight_spans = soup_job.find_all('span', class_=insight_span_class)
    seniority = insight_spans[1].text.strip() if len(insight_spans) > 1 else None
    print(seniority)

    # 'Posting Date' is the last <span> nested in the first neutral text span
    neutral_spans = soup_job.find_all('span', class_=neutral_class)
    date_parts = neutral_spans[0].find_all('span') if neutral_spans else []
    posting_date = date_parts[-1].text.strip() if date_parts else None
    print(posting_date)

    # 'Number of Applicants': leading number of the last neutral span, falling back
    # to the <strong> inside the positive span
    number_applicants = leading_int(neutral_spans[-1] if neutral_spans else None)
    if number_applicants is None:
        positive_span = soup_job.find('span', class_='tvm__text tvm__text--positive')
        number_applicants = leading_int(positive_span.find('strong') if positive_span is not None else None)
    if number_applicants is not None:
        print(number_applicants)

    # 'Employment Type': first insight span, falling back to the nested span of the first insight item
    insight_items = soup_job.find_all('li', class_=insight_item_class)
    if insight_spans:
        employment_type = insight_spans[0].text.strip()
        print(employment_type)
    else:
        outer_span = insight_items[0].find('span') if insight_items else None
        inner_span = outer_span.find('span') if outer_span is not None else None
        employment_type = inner_span.text.strip() if inner_span is not None else None

    # 'Employees' and 'Industry' share the second insight item, separated by ' · '
    company_span = insight_items[1].find('span') if len(insight_items) > 1 else None
    if company_span is None:
        employees = None
        industry = None
    else:
        industry_employees = company_span.text.strip().split(' · ')
        employees = industry_employees[0]
        industry = industry_employees[1] if len(industry_employees) > 1 else None
    print(industry)
    print(employees)

    # 'LinkedIn Easy Apply': True when the primary apply button is present on the page
    linkedin = soup_job.find('button', class_='jobs-apply-button artdeco-button artdeco-button--3 artdeco-button--primary ember-view') is not None
    print(linkedin)

    # 'Company Followers': leading number of the subtitle, with thousands separators removed
    followers = leading_int(soup_job.find('div', class_='artdeco-entity-lockup__subtitle ember-view t-16'), drop_commas=True)
    if followers is not None:
        print(followers)

    # Return a list containing extracted job details
    return [job_id, python_required, seniority, posting_date, number_applicants, employment_type, industry, employees, linkedin, followers]

In [50]:
# Scrape the detail page of each of the first 25 job IDs (positional slice);
# the result is a Series holding one list of details per job
new_columns = df['Job ID'].iloc[:25].apply(scraping_job)

3736663947
Mid-Senior level
2 weeks ago
9
Full-time
Software Development
5,001-10,000 employees
False
519604
3772484774


None
1 week ago
21
Full-time
None
1-10 employees
False
8022
3770337197
Entry level
2 weeks ago
29
Full-time
Consumer Services
1,001-5,000 employees
False
248107
3761871874
None
3 weeks ago
76
Full-time
IT Services and IT Consulting
10,001+ employees
True
1988677
3757333127
None
1 week ago
64
Entertainment Providers
1,001-5,000 employees
False
168175
3761478277
Mid-Senior level
3 weeks ago
37
Full-time
Information Technology & Services
201-500 employees
False
23119
3765095612
Associate
3 weeks ago
116
Full-time
IT Services and IT Consulting
11-50 employees
True
244
3688372389
Mid-Senior level
2 weeks ago
136
Full-time
Advertising Services
1,001-5,000 employees
False
207496
3772496286
None
1 week ago
214
Full-time
None
51-200 employees
True
4811
3750808229
Mid-Senior level
1 month ago
92
Full-time
IT Services and IT Consulting
10,001+ employees
False
5056139
3694929706
Entry level
1 week ago
262
Full-time
Entertainment Providers
1,001-5,000 employees
False
129987
3745692455
Entry level
2

In [51]:
print(new_columns)

1     [3736663947, False, Mid-Senior level, 2 weeks ...
2     [3772484774, False, None, 1 week ago, 21, Full...
4     [3770337197, False, Entry level, 2 weeks ago, ...
8     [3761871874, True, None, 3 weeks ago, 76, Full...
10    [3757333127, False, None, 1 week ago, 64, Full...
11    [3761478277, True, Mid-Senior level, 3 weeks a...
12    [3765095612, True, Associate, 3 weeks ago, 116...
13    [3688372389, False, Mid-Senior level, 2 weeks ...
15    [3772496286, True, None, 1 week ago, 214, Full...
16    [3750808229, False, Mid-Senior level, 1 month ...
24    [3694929706, False, Entry level, 1 week ago, 2...
27    [3745692455, False, Entry level, 2 weeks ago, ...
31    [3769281201, False, Associate, 1 week ago, 716...
32    [3737223750, False, Mid-Senior level, 2 weeks ...
33    [3768785578, True, Mid-Senior level, 1 week ag...
34    [3763879318, False, Associate, 1 week ago, 43,...
36    [3773544884, False, Mid-Senior level, 5 days a...
37    [3493569296, True, Associate, 2 weeks ago,

In [52]:

# Expand the per-job detail lists into a DataFrame with one named column per field
new_columns_df = pd.DataFrame(
    new_columns.tolist(),
    columns=[
        'Job ID',
        'Python Required',
        'Seniority',
        'Posting Date',
        'Number of Applicants',
        'Employment Type',
        'Industry',
        'Employees',
        'LinkedIn Easy Apply',
        'Company Followers',
    ],
)

# Preview the first rows
new_columns_df.head()

Unnamed: 0,Job ID,Python Required,Seniority,Posting Date,Number of Applicants,Employment Type,Industry,Employees,LinkedIn Easy Apply,Company Followers
0,3736663947,False,Mid-Senior level,2 weeks ago,9,Full-time,Software Development,"5,001-10,000 employees",False,519604
1,3772484774,False,,1 week ago,21,Full-time,,1-10 employees,False,8022
2,3770337197,False,Entry level,2 weeks ago,29,Full-time,Consumer Services,"1,001-5,000 employees",False,248107
3,3761871874,True,,3 weeks ago,76,Full-time,IT Services and IT Consulting,"10,001+ employees",True,1988677
4,3757333127,False,,1 week ago,64,Full-time,Entertainment Providers,"1,001-5,000 employees",False,168175


In [53]:
# Attach the detail columns to the listing rows, matching on 'Job ID';
# the left join keeps every row of 'df'
df = df.merge(new_columns_df, how='left', on='Job ID')

# Show up to 25 rows of the merged DataFrame
df.head(25)

Unnamed: 0,Job ID,Job Title,Company Name,Location,Workspace,State,Offer URL,Promoted,Python Required,Seniority,Posting Date,Number of Applicants,Employment Type,Industry,Employees,LinkedIn Easy Apply,Company Followers
0,3736663947,"Lead Data Scientist (Bangkok based, relocation...",Agoda,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3736663947,True,False,Mid-Senior level,2 weeks ago,9,Full-time,Software Development,"5,001-10,000 employees",False,519604
1,3772484774,"Infrastructure & Cloud Operations Engineer, (H...",Joppy,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3772484774,True,False,,1 week ago,21,Full-time,,1-10 employees,False,8022
2,3770337197,Risk Decision Scientist (They/She/He),Glovo,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3770337197,False,False,Entry level,2 weeks ago,29,Full-time,Consumer Services,"1,001-5,000 employees",False,248107
3,3761871874,Senior Data Scientist,Capgemini Engineering,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3761871874,True,True,,3 weeks ago,76,Full-time,IT Services and IT Consulting,"10,001+ employees",True,1988677
4,3757333127,"Data Scientist (Mid-level, Senior) | Candy Cru...",King,"Barcelona, Catalonia, Spain",Other,Others,https://www.linkedin.com/jobs/view/3757333127,False,False,,1 week ago,64,Full-time,Entertainment Providers,"1,001-5,000 employees",False,168175
5,3761478277,Senior Data Scientist,Heetch,Greater Barcelona Metropolitan Area,Hybrid,Others,https://www.linkedin.com/jobs/view/3761478277,True,True,Mid-Senior level,3 weeks ago,37,Full-time,Information Technology & Services,201-500 employees,False,23119
6,3765095612,Data Engineer,ALPHABET CONSULTING,Greater Barcelona Metropolitan Area,Hybrid,Others,https://www.linkedin.com/jobs/view/3765095612,True,True,Associate,3 weeks ago,116,Full-time,IT Services and IT Consulting,11-50 employees,True,244
7,3688372389,Sr. Data Scientist,Criteo,"Barcelona, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3688372389,True,False,Mid-Senior level,2 weeks ago,136,Full-time,Advertising Services,"1,001-5,000 employees",False,207496
8,3772496286,Junior PySpark Data Engineer,Mática Partners,Greater Barcelona Metropolitan Area,Hybrid,Others,https://www.linkedin.com/jobs/view/3772496286,True,True,,1 week ago,214,Full-time,,51-200 employees,True,4811
9,3750808229,Data Scientist,HP,"Sant Cugat del Vallès, Catalonia, Spain",Hybrid,Others,https://www.linkedin.com/jobs/view/3750808229,False,False,Mid-Senior level,1 month ago,92,Full-time,IT Services and IT Consulting,"10,001+ employees",False,5056139


In [54]:
df.to_csv('data.csv', index=False) # save the DataFrame to 'data.csv' without writing the index column