## PROJET WEB SCRAPING AVEC UTILISATION DE STREAMLIT

Importation des modules neccesaaires

In [50]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
from io import BytesIO
import pandas as pd
import os
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

Configuration du Driver

In [52]:
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
os.environ["PATH"] += r"C:\\Program Files (x86)"
driver.get('https://www.timesjobs.com/')
driver.implicitly_wait(5)

Un screenshot du site 

In [53]:
screenshot = driver.get_screenshot_as_png()
image = Image.open(BytesIO(screenshot))
fig = px.imshow(image)
fig.show()

Donner un skill ou un job en input

In [54]:
job_skill = input("Donner un skill ou un job: ")

Stockage les données dans un dictionnaire après scraping

In [55]:

dict_jobs={
    "Job Title":[],
    "Company":[],
    "Description":[],
    "Skills":[],
    "Localization":[],
    "Link":[],
    "Date":[]
}

for i in range(1,11):
    jobs, companies, descriptions, skills, localizations, dates=[], [], [], [], [], []

        
        
    try:
        
        driver.get(f'https://www.timesjobs.com/candidate/job-search.html?from=submit&luceneResultSize=25&txtKeywords={job_skill.replace(" ", "+")}&postWeek=60&searchType=personalizedSearch&actualTxtKeywords={job_skill.replace(" ", "+")}&searchBy=0&rdoOperator=OR&pDate=I&sequence={i}&startPage=1')
    except:
        break
            
    try:      
        driver.find_element(by=By.XPATH, value='//*[@id="closeSpanId"]').click()
    except NoSuchElementException:
        pass
    jobs.extend(driver.find_elements(by=By.XPATH, value='//*[@id="searchResultData"]/ul//li/header/h2/a'))
    companies.extend(driver.find_elements(by=By.XPATH, value='//*[@id="searchResultData"]/ul/li/header/h3'))
    descriptions.extend(driver.find_elements(by=By.XPATH, value='//*[@id="searchResultData"]/ul/li/ul[2]/li[1]'))
    skills.extend(driver.find_elements(by=By.XPATH, value='//*[@id="searchResultData"]/ul/li/ul[2]/li[2]/span'))
    localizations.extend(driver.find_elements(by=By.XPATH, value='//*[@id="searchResultData"]/ul/li/ul[1]/li[last()]/span'))
    dates.extend(driver.find_elements(by=By.XPATH, value='//*[@id="searchResultData"]/ul/li/div/div/div/span[@class="sim-posted"]/span[last()]'))

        
    dict_jobs["Job Title"].extend(job.text.capitalize() for job in jobs)
    dict_jobs["Company"].extend(company.text for company in companies)
    dict_jobs["Description"].extend(description.text[16:] for description in descriptions)
    dict_jobs["Skills"].extend(skill.text for skill in skills)
    dict_jobs["Localization"].extend(localization.text for localization in localizations)
    dict_jobs["Link"].extend(job.get_attribute("href") for job in jobs)
    dict_jobs["Date"].extend(date.text for date in dates)
        
    
    

Mise en place du Dataframe

In [56]:
df_jobs = pd.DataFrame(dict_jobs)
df_jobs.head()

Unnamed: 0,Job Title,Company,Description,Skills,Localization,Link,Date
0,Hadoop admin,PNR Software Solutions Private Limited,Experience 2 years to 7 yearsLocation Mumbai ...,"database , hadoop , load balancing",Mumbai,https://www.timesjobs.com/job-detail/hadoop-ad...,Posted few days ago
1,Hadoop engineer,FMT CORPORATION LIMITED,Hadoop Engineer5 years of industry experience...,"hive , python , java , apache , oozie , hdfs ,...",,https://www.timesjobs.com/job-detail/hadoop-en...,Posted few days ago
2,Hadoop developer,Axis Aerospace & Technolgies Private,Full-timeJob DescriptionThe ideal candidate s...,"java , hadoop , create low level design",Chennai,https://www.timesjobs.com/job-detail/hadoop-de...,Posted few days ago
3,Hadoop specialist,ITCAN Pte Ltd,Requirement:10 consecutive years in the softw...,"hive , python , scala , elastic search , hdfs ...",Singapore,https://www.timesjobs.com/job-detail/hadoop-sp...,Posted few days ago
4,Hadoop lead,AugmatrixGo,# :- Overall 7 years of IT experience in a va...,"hive , bi , data warehousing , pig , docker , ...",Gurgaon,https://www.timesjobs.com/job-detail/hadoop-le...,Posted few days ago


Trouver l'occurence des competences demandées

In [57]:
def occurrence_skill(my_serie):
   my_dict = {}
   skill_list = []
   for my_string in my_serie:
      skill_list += my_string.split(" , ")
   for sk in skill_list:
      if sk in my_dict.keys():
         my_dict[sk]+=1
      else:
         my_dict[sk]=1
   return my_dict

In [58]:
skills_dict = occurrence_skill(df_jobs["Skills"])
print(skills_dict)


{'database': 47, 'hadoop': 98, 'load balancing': 1, 'hive': 70, 'python': 107, 'java': 96, 'apache': 31, 'oozie': 11, 'hdfs': 21, 'hbase': 28, 'sql': 90, 'pig': 18, 'create low level design': 1, 'scala': 52, 'elastic search': 4, 'software engineering': 12, 'big data': 79, 'bi': 23, 'data warehousing': 26, 'docker': 18, 'unix scripting': 1, 'linux': 24, 'rest': 13, 'rdbms': 21, 'nosql': 46, 'mapreduce': 24, 'cassandra': 29, 'sqoop': 13, 'test engineer': 1, 'selenium': 1, 'automation testing': 1, 'cucumber': 1, 'solution architect': 7, 'hadoop development': 1, 'nifi': 1, 'spark': 6, 'python scripting': 1, 'data modeling': 7, 'hadoop developer': 4, 'infrastructure': 41, 'software packages': 2, 'forms': 2, 'hibernate': 8, 'java ee': 2, 'junit': 3, 'oracle': 15, 'jdbc': 1, 'impala': 13, 'advanced java': 3, 'j2ee technologies': 3, 'senior software engineer': 4, 'ajax': 2, 'spring': 10, 'xml': 7, 'json': 9, 'html': 7, ', big data': 26, 'problem solving': 16, 'team player': 12, 'javascript': 2

Plot des skills les plus populaires

In [59]:
skills_dict = dict(sorted(occurrence_skill(df_jobs["Skills"]).items(), key=lambda item: item[1], reverse=True))
try:
    skills_dict.pop(".")
except:
    pass
skill_keys = list(skills_dict.keys())[:20]
skill_vals = list(skills_dict.values())[:20]
skill_df = pd.DataFrame({"keys": skill_keys, "values": skill_vals})
fig = px.histogram(data_frame=skill_df, x="values", y="keys")
fig.show()

Plot des localisations les plus frequentes

In [60]:
loc_dict = df_jobs["Localization"].value_counts().to_dict()
loc_keys = list(loc_dict.keys())[:20]
loc_vals = list(loc_dict.values())[:20]
loc_df = pd.DataFrame({"keys": loc_keys, "values": loc_vals})
fig = px.histogram(data_frame=loc_df, x="values", y="keys")
fig.show()

Plot des compagnies qui offrent plus d'emplois

In [61]:
comp_dict = df_jobs["Company"].value_counts().to_dict()
comp_keys = list(comp_dict.keys())[:20]
comp_vals = list(comp_dict.values())[:20]
comp_df = pd.DataFrame({"keys": comp_keys, "values": comp_vals})
fig = px.histogram(data_frame=comp_df, x="values", y="keys")
fig.show()

Plot des jobs les plus demandées

In [62]:
job_dict = df_jobs["Job Title"].value_counts().to_dict()
job_keys = list(job_dict.keys())[:20]
job_vals = list(job_dict.values())[:20]
job_df = pd.DataFrame({"Job Title": job_keys, "Nbr of job offer": job_vals})
fig = px.histogram(data_frame=job_df, x="Nbr of job offer", y="Job Title", template="ggplot2")
fig.show()