In [31]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import csv

from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
import asyncio
import time

INITIATE PROXY POOL

In [32]:
proxy_pool = {
  'proxy:port': 0,
  'proxy:port': 0,
}

CODE TEST PROXY

In [33]:
# # define custom options for the Selenium driver
# options = Options()

# options.add_argument(f'--proxy-server={proxy_pool[7]}')

# # create the ChromeDriver instance with custom options
# driver = webdriver.Chrome(options=options)

# driver.get('http://httpbin.org/ip')
# # driver.get('https://www.malt.fr/profile/luccharlopeau')

DF GET USER TO FETCH

In [34]:
def get_next_profile(df):
    """
    get the next profile to scrap
    
    Parameters:
    df (pd.DataFrame): the dataframe with the profiles to scrap
    """
    if df['scraped'].all():
        print('All profiles have been scraped')
        return None, None
    
    next_profile = df[df['scraped']==False].iloc[0] # get the next profile to scrap
    # update the df
    df.loc[df['profil']==next_profile['profil'], 'scraped'] = True
    return df, next_profile

ADD TO DATAFRAME FUNCTION AND SAVE

In [35]:

df_raw = pd.DataFrame(columns=['name', 'profile_image', 'headline', 'experience', 'price', 'response_rate', 'response_time', 'categories', 'competences', 'supermalter', 'location','presentation', 'recommendations', 'missions', 'teletravail_preference', 'profil', 'link', 'creation_date'])

index = 0
def add_to_df(data, proxy_address): # save the data in a global df
    """
    Add the user data to the global DataFrame df_raw and save it to a CSV file every 1000 users.
    
    Parameters
    ----------
    data : dict
        A dictionary containing the user data.
    """
    
    global df_raw, index
    
    if data is None:
        proxy_pool[proxy_address] = 0
        index += 1
        return
    
    user_df = pd.DataFrame([data])  # Convert the user data to a DataFrame
    
    # Check if the user DataFrame has the same columns as df_raw
    if user_df.columns.tolist() != df_raw.columns.tolist():
        # If the columns don't match, ensure they align and reorder columns accordingly
        user_df = user_df.reindex(columns=df_raw.columns)
    
    # Append the user DataFrame to df_raw
    df_raw = pd.concat([df_raw, user_df], ignore_index=True)
    index += 1
    time = pd.Timestamp.now() # get the current time
    print(f'Scraped {index} users, at {time} last one is: {data}')
    
    # free the proxy in the pool
    proxy_pool[proxy_address] = 0
    
    # Save the DataFrame to a CSV file every 1000 users
    if index % 1000 == 0:
        df_raw.to_csv('scraped_data.csv', index=False)

SCRAP USER FUNCTION

In [36]:
async def scrap_user(row, driver, proxy_address, retry=0):
    """ 
    Scrap the user data from the given row and add it to the global DataFrame df_raw.
    
    Parameters
    ----------
    row : pandas.Series
        A row from the DataFrame df.
    driver : selenium.webdriver.chrome.webdriver.WebDriver
        The Selenium driver used to scrap the user data.
    """
    
    try:    
        data = {}
        
        wait = WebDriverWait(driver, 5)
        driver.get(row['link'])
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
    
        # get price
        price_element = soup.find('div', {'data-testid': 'profile-price'})
        if price_element: data['price'] = price_element.find('span', class_='block-list__price').text.strip()
        
        if not data['price']:
            driver.quit()
            print('WARNING couldnt fetch the page, retrying')
            return await scrap_user(row, driver, proxy_address)
            
        # get experience
        experience_element = soup.find('span', string='Expérience')
        if experience_element: data['experience'] = experience_element.find_next('span', class_='profile-indicators-content').text.strip()
            
        # get link of profile image
        profile_image_element = soup.find('div', class_='joy-avatar joy-avatar__large joy-avatar__secondary')
        if profile_image_element: data['profile_image'] = profile_image_element.find('img')['src']
        else: data['profile_image'] = 'No profile image'
        
        # get response rate
        response_rate_element = soup.find('span', string='Taux de réponse')
        if response_rate_element: data['response_rate'] = response_rate_element.find_next('span', class_='profile-indicators-content').text.strip()
        
        # get response time
        response_time_element = soup.find('span', string='Temps de réponse')
        if response_time_element: data['response_time'] = response_time_element.find_next('span', class_='profile-indicators-content').text.strip()
        
        # get name
        name_element = soup.find('div', {'data-testid': 'profile-fullname'})
        if name_element: data['name'] = name_element.text.strip()
            
        # get headline
        headline_element = soup.find('div', {'data-testid': 'profile-headline'})
        if headline_element: data['headline'] = headline_element.text.strip()

        # get number of missions
        missions_element = soup.find('div', {'data-testid': 'profile-counter-missions'})
        if missions_element: data['missions'] = missions_element.find('strong').text.strip()
        
        # Récupérer toutes les catégories
        categories_elements = soup.find_all('li', {'class': 'categories__list-item'})
        data['categories'] = [category.find('a').text for category in categories_elements]
        
        # Récupérer les compétences
        competences_element = soup.find_all('div', {'class': 'profile-expertises__content-list-item__label'})
        data['competences'] = [competence.find('a', class_='joy-link joy-link_teal').text.strip() for competence in competences_element]
        
        # get supermalter status
        supermalter_element = soup.find('span', class_='joy-badge-level__tag blue')
        if supermalter_element: data['supermalter'] = supermalter_element.get_text(strip=True)
            
        # get location
        location_element = soup.find('dl', {'class': 'profile__location-and-workplace-preferences__item'})
        if location_element:
            location_label = location_element.find('dt', {'data-testid': 'profile-location-address-label'})
            location_value = location_element.find('dd', {'data-testid': 'profile-location-preference-address'})

            if location_label and location_value:
                location = {location_label.text: location_value.text}
                data['location'] = location
                
        # get remote work preference
        teletravail_element = soup.find('dl', {'class': 'profile-page-mission-preferences__item'})
        if teletravail_element:
            teletravail_label = teletravail_element.find('dt')
            teletravail_value = teletravail_element.find('dd')

            if teletravail_label and teletravail_value:
                teletravail_preference = {teletravail_label.text: teletravail_value.text}
                data['teletravail_preference'] = teletravail_preference
                
        # Récupérer le nombre de recommandations
        recommendations_element = soup.find('span', {'data-testid': 'profile-counter-recommendations'})
        if recommendations_element:
            recommendations_count = int(recommendations_element.text.split()[0])
            data['recommendations'] = recommendations_count    

        # Récupérer le message de présentation
        presentation_element = soup.find('div', {'class': 'profile-description__content'})
        if presentation_element: data['presentation'] = presentation_element.get_text(strip=True)
            
        # add link of the profile
        data['link'] = row['link']
        
        # add created date
        data['creation_date'] = row['creation_date']
        
        # add name to the data
        data['profil'] = row['profil']
            
        driver.quit() # close the browser
        
        add_to_df(data, proxy_address) # add the data to the global df
        
    except:
        driver.quit()
        if retry < 3:
            retry += 1
            return await scrap_user(row, driver, proxy_address, retry)
        return add_to_df(data, proxy_address)

In [37]:
import threading

def configure_webdriver(proxy_address):
    """
    Configure the Selenium driver with the given proxy address.
    
    Parameters
    ----------
    proxy_address : str
        The proxy address to use for the Selenium driver.
    """
    
    # define custom options for the Selenium driver
    options = Options()

    # options.add_argument(f'--proxy-server={proxy_address}')
    options.add_argument("window-size=800,400")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("start-maximized")
    options.add_argument("enable-automation")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")

    # create the ChromeDriver instance with custom options
    # driver = webdriver.Remote("http://192.168.0.250:4444/wd/hub", options=options) # for docker
    driver = webdriver.Chrome(options=options) # for local
    
    return driver

async def scrap_all_users_proxy(proxy_addresses, df):
    """
    Scrap all the users from the given DataFrame df using the given proxy addresses.
    
    Parameters
    ----------
    proxy_addresses : list
        A list of proxy addresses to use for the Selenium driver.
    df : pandas.DataFrame
        A DataFrame containing the users to scrap.
    """
    
    global index # To keep track of the progress
    
    while df[df['scraped'] == False].shape[0] > 0:  # While there are profiles to scrap
        
        if len(proxy_addresses) > df[df['scraped'] == False].shape[0]: # If there are more proxies than profiles to scrap (to avoid length mismatch), proxy_addresses is a dict
            break
        
        # for proxy_address in proxy_addresses:
        # for every proxy address(key) in the proxy pool dict, get the value: if the value is 0, it means that the proxy is not used, so we can use it
        for proxy_address in proxy_pool:
            if proxy_pool[proxy_address] == 0:
                proxy_pool[proxy_address] = 1 # set the proxy to 1, meaning that it is used
        
                driver = configure_webdriver(proxy_address)
                df, row = get_next_profile(df)

                # Create a thread for each scraping task
                thread = threading.Thread(target=asyncio.run, args=(scrap_user(row, driver, proxy_address),))
                thread.start()  # Start the thread

RUN THE SCRIPT WITH THE LINKS

In [38]:
profile_links = pd.read_csv('../../data/profile_links.csv') # read the csv file containing the profile links
profile_links['profil'] = profile_links['profil'].apply(lambda x: x.replace('https://www.malt.fr/profile/', '')) 

# add column link to the DataFrame
profile_links['link'] = profile_links['profil'].apply(lambda x: f'https://www.malt.fr/profile/{x}')

profile_links['scraped'] = False # add column scraped to the DataFrame

# get first 100 rows of the DataFrame
df = profile_links.iloc[:50]

def main():
    return scrap_all_users_proxy(proxy_pool, df)

await main()

df_raw

Scraped 1 users, at 2023-12-07 17:19:19.940716 last one is: {'price': '300\xa0€', 'experience': '15 ans et +', 'profile_image': 'https://dam.malt.com/dlyrpdvs4fxbsoslod17?gravity=face&func=face&face_margin=70&w=440&h=440&force_format=webp', 'response_rate': '100%', 'response_time': '12h', 'name': 'Brice Tillet', 'headline': 'Composer / Sound Designer / Music Producer', 'categories': ['Sound Designer'], 'competences': [], 'location': {'Localisation': 'Paris, France'}, 'recommendations': 6, 'presentation': "Bonjour,je suis musicien, compositeur et sound designer depuis 2005.J'aime composer à l'image, travailler les textures sonores, jouer avec le rythme et les silences.N'hésitez pas à me contacter !Brice", 'link': 'https://www.malt.fr/profile/bricetillet', 'creation_date': '2014-01-08', 'profil': 'bricetillet'}
Scraped 2 users, at 2023-12-07 17:19:20.244127 last one is: {}
Scraped 3 users, at 2023-12-07 17:19:20.410704 last one is: {'price': '700\xa0€', 'experience': '-', 'profile_image'

Unnamed: 0,name,profile_image,headline,experience,price,response_rate,response_time,categories,competences,supermalter,location,presentation,recommendations,missions,teletravail_preference,profil,link,creation_date
0,Brice Tillet,https://dam.malt.com/dlyrpdvs4fxbsoslod17?grav...,Composer / Sound Designer / Music Producer,15 ans et +,300 €,100%,12h,[Sound Designer],[],,"{'Localisation': 'Paris, France'}","Bonjour,je suis musicien, compositeur et sound...",6.0,,,bricetillet,https://www.malt.fr/profile/bricetillet,2014-01-08
1,,,,,,,,,,,,,,,,,,
2,Alexandre Labayle,No profile image,Consultant décisionnel Senior,-,700 €,100%,1h,[],[],,{'Localisation': ''},"Consultant Senior en Business Intelligence , 1...",,,,alexandrelabayle,https://www.malt.fr/profile/alexandrelabayle,2014-02-04
3,Nacera TIZI,https://dam.malt.com/erokb7omnnhmf79bojnr?grav...,Commerciale Freelance,-,40 €,,1h,[Business developers],[],,"{'Localisation': 'Tanger, Tanger-Tétouan, Maroc'}",Parce que La ville de Tanger tend à devenir l'...,,,,naceratizi,https://www.malt.fr/profile/naceratizi,2014-02-22
4,Youri Galescot,https://dam.malt.com/r3nb6gaf6723bi1unhhx?grav...,Développeur web frontend et backend,-,235 €,33%,12h,[],[],,"{'Localisation': 'Paris, France'}","Bonjour,Je suis développeur frontend et backen...",,,,yourigalescot,https://www.malt.fr/profile/yourigalescot,2014-02-18
5,Aurélien D.,https://dam.malt.com/vabndz3c9dkhsimtvgww?grav...,"SysOps, DevOps et développeur web",8-15 ans,550 €,100%,1h,"[Administrateur base de données, Administrateu...",[],,"{'Localisation': 'La Rochelle, France'}",SysOps (unix) depuis 5 ans et DevOps depuis 4 ...,1.0,12.0,,aureliendazy,https://www.malt.fr/profile/aureliendazy,2013-06-06
6,Siobhan Engelmann,https://dam.malt.com/jqj9umzodlrrxhdy5nye?grav...,Traduction Translation,-,300 €,100%,Quelques jours,[Traducteur],[],,"{'Localisation': 'Metz, France'}","English native speaker, degree qualified engin...",,,,siobhanengelmann,https://www.malt.fr/profile/siobhanengelmann,2014-02-11
7,Julien Trezeux,https://dam.malt.com/pllhwpaapz84yfqmc7rw?grav...,directeur artistique - graphiste - illustrateur,8-15 ans,350 €,100%,1h,"[Photographe, Graphiste, Illustrateur]",[],,"{'Localisation': 'Paris, France / www.graphicf...","Graphiste et directeur artistique depuis 2007,...",3.0,,,julientrezeux,https://www.malt.fr/profile/julientrezeux,2014-02-12
8,Valérie Vanhamme-Vermeulen,https://dam.malt.com/a07wv7z58xjc9zjbfkyj?grav...,Consultante marketing & communication spécialisée,-,695 €,100%,1h,[Consultant Communication],[],,{'Localisation': '32800 Réans'},Je possède une expérience de plus de 13 ans da...,,,,valerievanhammevermeulen,https://www.malt.fr/profile/valerievanhammever...,2014-02-22
9,José Thermique,https://dam.malt.com/p6afuz3bhpd4xrvbbcpu?grav...,Aménagement de salle de bain,-,400 €,,1h,[],[],,{'Localisation': ''},"Bonjour je suis plombier-chauffagiste, spécial...",,,,josethermique,https://www.malt.fr/profile/josethermique,2014-02-19


SAVE THE DATA TO CSV

In [39]:
df_raw

Unnamed: 0,name,profile_image,headline,experience,price,response_rate,response_time,categories,competences,supermalter,location,presentation,recommendations,missions,teletravail_preference,profil,link,creation_date
0,Brice Tillet,https://dam.malt.com/dlyrpdvs4fxbsoslod17?grav...,Composer / Sound Designer / Music Producer,15 ans et +,300 €,100%,12h,[Sound Designer],[],,"{'Localisation': 'Paris, France'}","Bonjour,je suis musicien, compositeur et sound...",6.0,,,bricetillet,https://www.malt.fr/profile/bricetillet,2014-01-08
1,,,,,,,,,,,,,,,,,,
2,Alexandre Labayle,No profile image,Consultant décisionnel Senior,-,700 €,100%,1h,[],[],,{'Localisation': ''},"Consultant Senior en Business Intelligence , 1...",,,,alexandrelabayle,https://www.malt.fr/profile/alexandrelabayle,2014-02-04
3,Nacera TIZI,https://dam.malt.com/erokb7omnnhmf79bojnr?grav...,Commerciale Freelance,-,40 €,,1h,[Business developers],[],,"{'Localisation': 'Tanger, Tanger-Tétouan, Maroc'}",Parce que La ville de Tanger tend à devenir l'...,,,,naceratizi,https://www.malt.fr/profile/naceratizi,2014-02-22
4,Youri Galescot,https://dam.malt.com/r3nb6gaf6723bi1unhhx?grav...,Développeur web frontend et backend,-,235 €,33%,12h,[],[],,"{'Localisation': 'Paris, France'}","Bonjour,Je suis développeur frontend et backen...",,,,yourigalescot,https://www.malt.fr/profile/yourigalescot,2014-02-18
5,Aurélien D.,https://dam.malt.com/vabndz3c9dkhsimtvgww?grav...,"SysOps, DevOps et développeur web",8-15 ans,550 €,100%,1h,"[Administrateur base de données, Administrateu...",[],,"{'Localisation': 'La Rochelle, France'}",SysOps (unix) depuis 5 ans et DevOps depuis 4 ...,1.0,12.0,,aureliendazy,https://www.malt.fr/profile/aureliendazy,2013-06-06
6,Siobhan Engelmann,https://dam.malt.com/jqj9umzodlrrxhdy5nye?grav...,Traduction Translation,-,300 €,100%,Quelques jours,[Traducteur],[],,"{'Localisation': 'Metz, France'}","English native speaker, degree qualified engin...",,,,siobhanengelmann,https://www.malt.fr/profile/siobhanengelmann,2014-02-11
7,Julien Trezeux,https://dam.malt.com/pllhwpaapz84yfqmc7rw?grav...,directeur artistique - graphiste - illustrateur,8-15 ans,350 €,100%,1h,"[Photographe, Graphiste, Illustrateur]",[],,"{'Localisation': 'Paris, France / www.graphicf...","Graphiste et directeur artistique depuis 2007,...",3.0,,,julientrezeux,https://www.malt.fr/profile/julientrezeux,2014-02-12
8,Valérie Vanhamme-Vermeulen,https://dam.malt.com/a07wv7z58xjc9zjbfkyj?grav...,Consultante marketing & communication spécialisée,-,695 €,100%,1h,[Consultant Communication],[],,{'Localisation': '32800 Réans'},Je possède une expérience de plus de 13 ans da...,,,,valerievanhammevermeulen,https://www.malt.fr/profile/valerievanhammever...,2014-02-22
9,José Thermique,https://dam.malt.com/p6afuz3bhpd4xrvbbcpu?grav...,Aménagement de salle de bain,-,400 €,,1h,[],[],,{'Localisation': ''},"Bonjour je suis plombier-chauffagiste, spécial...",,,,josethermique,https://www.malt.fr/profile/josethermique,2014-02-19


In [40]:
df_raw.to_csv('df_raw_final.csv', index=False)

Scraped 48 users, at 2023-12-07 17:20:10.554806 last one is: {'price': '170\xa0€', 'experience': '-', 'profile_image': 'No profile image', 'response_rate': '100%', 'response_time': '1h', 'name': 'Xavier Fournier', 'headline': 'Graphiste', 'categories': ['Graphiste'], 'competences': [], 'location': {'Localisation': 'Paris'}, 'presentation': 'À venir prochainement !', 'link': 'https://www.malt.fr/profile/xavierfournier', 'creation_date': '2014-01-06', 'profil': 'xavierfournier'}
Scraped 49 users, at 2023-12-07 17:20:10.797755 last one is: {'price': '500\xa0€', 'experience': '-', 'profile_image': 'No profile image', 'response_rate': '100%', 'response_time': '1h', 'name': 'Edouard Legret', 'headline': 'Ingénieur développeur', 'categories': [], 'competences': [], 'location': {'Localisation': 'Rennes, France'}, 'presentation': 'Bonjour,Je suis développeur Web depuis 12 ans sur la région Bretagne.', 'link': 'https://www.malt.fr/profile/edouardlegret', 'creation_date': '2014-02-04', 'profil': 