In [3]:
# !pip install selenium

In [5]:
# Start the docker-compose services in detached mode (per the captured output: a selenium hub and a chrome container).
!docker-compose up -d

 Network hajy8_liste-levee-de-fonds-startup-france_scraping_project_default  Creating
 Network hajy8_liste-levee-de-fonds-startup-france_scraping_project_default  Created
 Container selenium-hub-hajy8_letudiant-project  Creating
 Container selenium-hub-hajy8_letudiant-project  Created
 Container hajy8_liste-levee-de-fonds-startup-france_scraping_project-chrome_1_hajy8_letudiant_project-1  Creating
 Container hajy8_liste-levee-de-fonds-startup-france_scraping_project-chrome_1_hajy8_letudiant_project-1  Created
 Container selenium-hub-hajy8_letudiant-project  Starting
 Container selenium-hub-hajy8_letudiant-project  Started
 Container hajy8_liste-levee-de-fonds-startup-france_scraping_project-chrome_1_hajy8_letudiant_project-1  Starting
 Container hajy8_liste-levee-de-fonds-startup-france_scraping_project-chrome_1_hajy8_letudiant_project-1  Started


In [None]:
from StartupScraping import StartupScraping
from StartupStorage import StartupStorage
from Startup import Startup

from contact_links_classification.ContactLinkModel import ContactLinkModel
from ContactOpenAIScraping import ContactOpenAIScraping
from pageProcessing import PageProcessing
from sentenceProcessing import SentenceProcessing

import pandas as pd
import time
import csv
import codecs
import threading

# Common path prefix (without suffix or extension) of the startup result files used in this cell.
file_path = 'results/startups'

def main():
    """Run StartupScraping on the jaimelesstartups.fr listing URL and print what start_scraping() returns."""
    listing_url = "https://www.jaimelesstartups.fr/liste-levee-de-fonds-startup-france/"
    scraper = StartupScraping(url=listing_url, file_path=file_path, with_selenium_grid=False)
    print(scraper.start_scraping())

# def csv_to_dict_list(file_path):
#     with codecs.open(file_path, mode='r', encoding='utf-8-sig') as file:
#         csv_reader = csv.DictReader(file)
#         list_of_dicts = [row for row in csv_reader]
#     return list_of_dicts

# Raise the csv module's maximum field size to 2**31 - 1.
# NOTE(review): csv_to_dict_list below reads files through pandas rather than csv.DictReader — confirm this limit is still needed.

csv.field_size_limit(2**31 - 1)

def csv_to_dict_list(file_path):
    """Load a CSV file with pandas and return its rows as a list of dictionaries.

    Lines that pandas cannot parse are skipped, the shape of the loaded frame is
    printed, and every non-breaking space (U+00A0) found in a string value is
    replaced by a regular space. Non-string values are returned untouched.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        One dictionary per row (column name -> value). If anything goes wrong
        while reading or converting, the error is printed and an empty list is
        returned instead.
    """
    try:
        frame = pd.read_csv(file_path, on_bad_lines='skip')
        print(frame.shape)
        return [
            {
                column: cell.replace('\xa0', ' ') if isinstance(cell, str) else cell
                for column, cell in row.items()
            }
            for row in frame.to_dict(orient='records')
        ]
    except Exception as e:
        # Best effort: report the problem and let the caller continue with no rows.
        print(f"An error occurred while reading the CSV file: {e}")
        return []
    
def difference_dic(dic1, dic2):
    """Return the entries of ``dic1`` whose 'startup_name' does not appear in ``dic2``.

    Args:
        dic1: list of dictionaries to filter; each must have a 'startup_name' key.
        dic2: list of dictionaries whose 'startup_name' values are to be excluded.

    Returns:
        A new list holding the dictionaries of ``dic1`` (same objects, same order)
        whose 'startup_name' is absent from ``dic2``.
    """
    # Collect the names to exclude in a set so each membership test is cheap.
    excluded_names = set()
    for entry in dic2:
        excluded_names.add(entry['startup_name'])

    remaining = []
    for entry in dic1:
        if entry['startup_name'] in excluded_names:
            continue
        remaining.append(entry)
    return remaining
    
def worker(list_of_dicts, lock_list_of_dicts, results, lock_results, for_task, lock_selenium_grid):
    """Thread body: pop startups off a shared list and run one scraping task on each.

    Args:
        list_of_dicts: shared list of startup dictionaries. Items are removed from
            the front, under ``lock_list_of_dicts``, until the list is empty.
        lock_list_of_dicts: lock guarding ``list_of_dicts``.
        results: shared list that receives ``result['data']`` for every task whose
            ``result['status']`` is truthy.
        lock_results: lock guarding ``results`` and the write to the output file.
        for_task: name of the task to run; one of 'scraping_more_inf',
            'verify_startup_web_site_url', 'extract_contact_info' or
            'extract_LinkedIn_profiles_of_company_founders'.
        lock_selenium_grid: lock held while a ``StartupScraping`` object is built.

    Raises:
        ValueError: if ``for_task`` is not one of the supported task names.
    """
    supported_tasks = (
        'scraping_more_inf',
        'verify_startup_web_site_url',
        'extract_contact_info',
        'extract_LinkedIn_profiles_of_company_founders',
    )
    if for_task not in supported_tasks:
        # Fail before anything is popped: with an unknown task no branch below would
        # assign `result`, so a startup would be consumed and then lost on a NameError.
        raise ValueError(f"Unknown for_task {for_task!r}; expected one of {supported_tasks}")

    # One-time, task-specific setup shared by every startup this thread handles.
    if for_task == 'extract_contact_info':
        contact_link_classifier = ContactLinkModel()
        contact_link_classifier.load_from_local(model_path='./contact_links_classification/Models/model_0/model_contact_40_maxlen_10_epochs')
        contactOpenAIScraping = ContactOpenAIScraping()
        pageProcessing = PageProcessing()
        sentenceProcessing = SentenceProcessing(max_words_before_phone_number_or_email=30)
    elif for_task == 'extract_LinkedIn_profiles_of_company_founders':
        with lock_selenium_grid:
            startupScraping = StartupScraping(with_selenium_grid = False)
        startupScraping.get_linkedin_authentication()

    while True:
        # Take the next startup under the lock; stop once the shared list is empty.
        with lock_list_of_dicts:
            if list_of_dicts:
                startup_dic = list_of_dicts.pop(0)
            else:
                break

        # Process the startup outside the lock so other threads are not blocked.
        startup = Startup()
        startup.init_from_dic(startup_dic)

        if for_task == 'scraping_more_inf':
            with lock_selenium_grid:
                startupScraping = StartupScraping(url=startup.startup_more_inf_url, startup=startup)
            result = startupScraping.start_scraping_more_inf()
        elif for_task == 'verify_startup_web_site_url':
            with lock_selenium_grid:
                startupScraping = StartupScraping(url=startup.startup_web_site_url, startup=startup)
            result = startupScraping.start_verify_startup_web_site_url()
        elif for_task == 'extract_contact_info':
            with lock_selenium_grid:
                startupScraping = StartupScraping(url=startup.startup_right_web_site_url, startup=startup,
                                              contact_link_classifier=contact_link_classifier, contactOpenAIScraping=contactOpenAIScraping,
                                              pageProcessing=pageProcessing, sentenceProcessing=sentenceProcessing)
            result = startupScraping.get_all_contact_page_links()
        elif for_task == 'extract_LinkedIn_profiles_of_company_founders':
            # Reuses the scraper created (and authenticated) during setup or after the last failure.
            result = startupScraping.get_LinkedIn_profiles_of_company_founders(startup)

            if not result['status']:
                # On failure, build a fresh scraper and authenticate again for the next startup.
                with lock_selenium_grid:
                    startupScraping = StartupScraping(with_selenium_grid = False)
                startupScraping.get_linkedin_authentication()

        # Persist a successful result; the lock protects both the file and `results`.
        if result['status']:
            with lock_results:
                startupStorage = StartupStorage(f'{file_path}_with_more_inf_555')
                try:
                    startupStorage.insert_startup(result['data'])
                finally:
                    # Close the storage even when the insert fails.
                    startupStorage.close_file()
                results.append(result['data'])

def process_with_threads(list_of_dicts, num_threads=20, for_task = 'scraping_more_inf'):
    """Run ``worker`` in ``num_threads`` threads over a shared list of startups.

    Args:
        list_of_dicts: list of startup dictionaries handed to every worker thread.
        num_threads: number of worker threads to start.
        for_task: task name forwarded to each worker.

    Returns:
        The shared results list passed to the workers, returned after all
        threads have been joined.
    """
    collected = []
    # Three locks: one for the input list, one for the results, one for the selenium grid.
    input_lock = threading.Lock()
    output_lock = threading.Lock()
    grid_lock = threading.Lock()

    # Start every worker as soon as it is created and keep a handle to it.
    running = []
    for _ in range(num_threads):
        thread = threading.Thread(
            target=worker,
            args=(list_of_dicts, input_lock, collected, output_lock, for_task, grid_lock),
        )
        thread.start()
        running.append(thread)

    # Block until every worker has finished.
    for thread in running:
        thread.join()

    return collected

def main_2():
    """Run the LinkedIn-founder task on the startups not yet present in earlier result files.

    Reads the '_with_more_inf_4' CSV, removes every startup whose name already
    appears in the '_with_more_inf_55' and '_with_more_inf_555' CSVs, processes
    the rest with a single worker thread, and stores the collected results
    through a StartupStorage opened on '_with_more_inf_56'.
    """
    already_done_files = (
        f'{file_path}_with_more_inf_55.csv',
        f'{file_path}_with_more_inf_555.csv',
    )

    todo = csv_to_dict_list(f'{file_path}_with_more_inf_4.csv')
    for done_file in already_done_files:
        todo = difference_dic(todo, csv_to_dict_list(done_file))
    print(len(todo))

    results = process_with_threads(todo, num_threads=1, for_task='extract_LinkedIn_profiles_of_company_founders')

    print('*' * 150)

    storage = StartupStorage(f'{file_path}_with_more_inf_56')
    storage.insert_startups(results)
    storage.close_file()


    
if __name__ == "__main__":
    # Disabled first stage: run main(), then convert its CSV to Excel.
    # main()
    # # Read the CSV file
    # df = pd.read_csv(f'{file_path}.csv')
    # # Save it as an Excel file
    # df.to_excel(f'{file_path}.xlsx', index=False, engine='openpyxl')
    
    # Active stage: run main_2(), then convert the '_with_more_inf_56' CSV to Excel.
    main_2()
    # Read the CSV file (presumably the one main_2() produces through StartupStorage — confirm)
    df = pd.read_csv(f'{file_path}_with_more_inf_56.csv')
    # Save the same table as an Excel file
    df.to_excel(f'{file_path}_with_more_inf_56.xlsx', index=False, engine='openpyxl')

(1281, 17)
(626, 21)
(486, 22)
176
yeeeees : if button.text.lower() == 'afficher plus de résultats':
Posos : 61
Tomojo annonce une levée de fonds de 3 millions € : 2
yeeeees : if button.text.lower() == 'afficher plus de résultats':
yeeeees : if button.text.lower() == 'afficher plus de résultats':
yeeeees : if button.text.lower() == 'afficher plus de résultats':
yeeeees : if button.text.lower() == 'afficher plus de résultats':
Cysec annonce une levée de fonds de 3,8 millions d’euros : 40
yeeeees : if button.text.lower() == 'afficher plus de résultats':
yeeeees : if button.text.lower() == 'afficher plus de résultats':
yeeeees : if button.text.lower() == 'afficher plus de résultats':
yeeeees : if button.text.lower() == 'afficher plus de résultats':
yeeeees : if button.text.lower() == 'afficher plus de résultats':
yeeeees : if button.text.lower() == 'afficher plus de résultats':
yeeeees : if button.text.lower() == 'afficher plus de résultats':
yeeeees : if button.text.lower() == 'afficher 

In [1]:
# Scratch check: `+=` with a list on the right-hand side appends its elements to `a`.
a = [1,2]
a += [3,4]
a

[1, 2, 3, 4]

In [None]:
from pageProcessing import PageProcessing

# Manual check of PageProcessing.get_clean_html_text_from_url on a sample contact page.
url = "https://gulfnews.com/contact-us"

pageProcessing = PageProcessing()
# `clean_html_text` is also read by the SentenceProcessing cell further down.
clean_html_text = pageProcessing.get_clean_html_text_from_url(url)
clean_html_text

Start get_source_page_from_url


In [17]:
from sentenceProcessing import SentenceProcessing
# Depends on `clean_html_text` from the PageProcessing cell above; run that cell first.
sentenceProcessing = SentenceProcessing(max_words_before_phone_number_or_email=30)

new_clean_text = sentenceProcessing.get_new_clean_text(clean_html_text)
# Print the whitespace-separated word count of the result, then the result itself.
print(f"number of words in the new_clean_text : {len( str(new_clean_text).split() )}\n\n")
print(new_clean_text)

number of words in the new_clean_text : 259


Go Trending © Al Nisr Publishing LLC 2024. All rights reserved. Monday, October 7, 2024 Connect with the Newsroom
To share a news tip, photo, opinion or video with us, email it on readers@gulfnews.com.You can use the same email address to inform us, if you come across any factual errors or mistakes, have feedback on the process or any queries on the content. You can also message us on Facebook or Instagram. Not a fan of social media, call us on 04 4067666.
Contact Customer Care Digital
Facing issues signing in or creating an account with us? Email us on DigitalSubscriptions@gulfnews.com Or call us on 600 599901 Or whatsapp us on 0505591961.
You can also use the form below to submit your query. Print
Having issues with your print subscription? Call us on 600 587234.For help with more complicated questions, email us at circul@gulfnews.com
Advertise with us
To advertise with the Gulf News, visit our  For inquiries, email us at: digitaladvt@gul

In [1]:
# Scratch check: dict.get() on a missing key returns None instead of raising KeyError.
dic = {'a':1,'b':'2'}
type(dic.get('c'))

NoneType

In [17]:
import time, random
# Print a sample delay drawn uniformly between 0.5 and 3.9.
print(random.uniform(0.5, 3.9))
# NOTE: this is a second, independent draw, so the actual pause is not the value printed above.
time.sleep(random.uniform(0.5, 3.9))

1.1716691859363944


## Proxy test

In [6]:
import os

import requests

# Proxy URL, with or without authentication, e.g. "http://user:password@host:port".
# It is read from the SCRAPER_PROXY_URL environment variable so that credentials
# are never stored in the notebook.
# NOTE(review): any proxy credentials that were committed with this notebook should be rotated.
proxy_url = os.environ.get("SCRAPER_PROXY_URL")
if not proxy_url:
    raise RuntimeError("Set the SCRAPER_PROXY_URL environment variable before running the proxy test")

# Route both plain and TLS traffic through the same proxy.
proxy = {
    "http": proxy_url,
    "https": proxy_url,
}

# Test URL: a service that returns the caller's IP address.
url = "http://httpbin.org/ip"

try:
    response = requests.get(url, proxies=proxy, timeout=5)
    print("Statut:", response.status_code)
    print("IP retournée par le proxy:", response.json())
except Exception as e:
    # Diagnostic cell: report any failure (connection, timeout, bad JSON) instead of raising.
    print(f"Le proxy a échoué: {e}")


Statut: 200
IP retournée par le proxy: {'origin': '161.123.152.67'}
