In [1]:
import os

import pydantic
from mistralai.client import MistralClient, EmbeddingResponse
from sklearn.metrics.pairwise import euclidean_distances

from aiden_app.tools.utils.wtj_scraper import WelcomeToTheJungleScraper, JobOffer

In [2]:
# Single scraper instance shared by every search cell below.
scraper = WelcomeToTheJungleScraper()

[32m2024-05-10 17:10:13.009[0m | [1mINFO    [0m | [36maiden_app.tools.utils.wtj_scraper[0m:[36m__init__[0m:[36m146[0m - [1msuccesfully initialized WTJ scraper[0m


In [3]:
# Search for "Données/Business Intelligence" offers with location "Paris".
# NOTE(review): the `ml_eng_` prefix does not match the query (data / BI); the name is kept because later cells use it.
# NOTE(review): `_fetch_results` is underscore-prefixed, i.e. it looks private — confirm whether a public entry point exists.
ml_eng_results = scraper._fetch_results(search_query="Données/Business Intelligence", location="Paris")

In [4]:
# Number of offers returned for the data/BI search.
# NOTE(review): a round count may indicate the scraper caps the result set — TODO confirm.
len(ml_eng_results)

1000

In [5]:
# Search for "marketing" offers with location "Paris".
marketing_results = scraper._fetch_results(search_query="marketing", location="Paris")

In [6]:
# Search for "product owner" offers with location "Bordeaux".
po_results = scraper._fetch_results(search_query="product owner", location="Bordeaux")

In [7]:
# Search for "Boucher" (butcher) offers with location "Reims".
boucher_results = scraper._fetch_results(search_query="Boucher", location="Reims")

In [8]:
# Read the Mistral API key from the environment so no secret is stored in the notebook.
# A missing MISTRAL_API_KEY fails fast here with a KeyError.
# NOTE(review): the key that used to be hardcoded on this line is exposed in history and should be revoked.
client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])

In [9]:
def embed_offer_list(offers: list[JobOffer], chunk_size: int = 5) -> list[EmbeddingResponse]:
    """Embed the metadata description of each offer with the `mistral-embed` model.

    Offers are sent to the API in consecutive batches of at most `chunk_size` items.

    Args:
        offers: Offers to embed; each one is rendered with `metadata_repr()`.
        chunk_size: Maximum number of offers per API request. Must be >= 1.

    Returns:
        The `data` entries of every batch response, concatenated in the order the
        batches were requested.
        NOTE(review): these are the per-input items of each response (callers read
        `.embedding` on them), not whole `EmbeddingResponse` objects as the
        annotation suggests — confirm the item type before tightening it.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would make range() empty and silently embed nothing.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    embeddings: list[EmbeddingResponse] = []
    for start in range(0, len(offers), chunk_size):
        batch = offers[start : start + chunk_size]
        response = client.embeddings(
            model="mistral-embed",
            input=[offer.metadata_repr() for offer in batch],
        )
        embeddings.extend(response.data)
    return embeddings

In [10]:
# Preview only the first few offers; displaying the whole list floods the notebook output.
boucher_results[:3]

[JobOffer(benefits=[], contract_duration_maximum=None, contract_duration_minimum=None, contract_type='full_time', education_level=None, experience_level_minimum=None, has_contract_duration=False, has_education_level=False, has_experience_level_minimum=False, has_remote=True, has_salary_yearly_minimum=True, language='fr', name='Boucher F/H', new_profession=Profession(category_name='Entretien et réparation', sub_category_name="Services généraux d'entretien et de réparation", sub_category_reference='general-maintenance-and-repair-wYjIw'), offices=[Office(country='France', local_city='Crest', local_state='Auvergne-Rhône-Alpes')], organization=Organization(description='L’expérience Auchan est une expérience que nous créons… ensemble !\n\nC’est une expérience collective où l’esprit d’équipe prime, une expérience épanouissante où chacun peut créer son propre parcours, une expérience engagée où l’on s’investit dans la quête d’un objectif commun : AMÉLIORER LE QUOTIDIEN ! Avec près de 700 magas

In [11]:
# Embed each result set in batches of 50 offers, in the same order as before:
# marketing, data/BI, product owner, boucher.
# NOTE(review): `markerting_embeddings` is misspelled; the name is kept because a later cell reads it.
markerting_embeddings, ml_eng_embeddings, po_embeddings, boucher_embeddings = [
    embed_offer_list(result_set, chunk_size=50)
    for result_set in (marketing_results, ml_eng_results, po_results, boucher_results)
]

In [27]:
from dataclasses import dataclass

@dataclass
class Offer:
    """Pairs a scraped job offer with the embedding item computed for it."""

    # Embedding item for the offer; later cells read the vector from its `.embedding` attribute.
    embedding: EmbeddingResponse
    # The scraped offer that the embedding belongs to.
    offer: JobOffer


# Build the searchable corpus: one Offer per (embedding, job offer) pair.
# The group order (data/BI, boucher, marketing, product owner) fixes each entry's position in `offers`.
offers = []
for group_embeddings, group_results in (
    (ml_eng_embeddings, ml_eng_results),
    (boucher_embeddings, boucher_results),
    (markerting_embeddings, marketing_results),
    (po_embeddings, po_results),
):
    # Pairing is positional, so it is only valid when there is exactly one embedding per offer.
    assert len(group_embeddings) == len(group_results), "embedding/offer count mismatch"
    offers.extend(
        Offer(embedding=embedding, offer=job_offer)
        for embedding, job_offer in zip(group_embeddings, group_results)
    )

In [39]:
def get_text_embedding(input):
    """Embed `input` with the `mistral-embed` model and return the first result's vector.

    Only `data[0]` of the response is read; any further entries are ignored.
    """
    response = client.embeddings(input=input, model="mistral-embed")
    first_item = response.data[0]
    return first_item.embedding


def get_k_most_similar_offers(reference_embedding, offers, k=5):
    """Return the `k` offers whose embeddings are closest to `reference_embedding`.

    Args:
        reference_embedding: A single embedding vector (it is wrapped into a
            one-row matrix for the distance computation).
        offers: Iterable of objects exposing their vector as `.embedding.embedding`.
        k: Number of results to keep.

    Returns:
        A list of `(offer, distance)` tuples sorted by ascending Euclidean
        distance; ties keep their input order.
    """
    offers = list(offers)
    if not offers:
        # Nothing to rank; also avoids handing an empty matrix to euclidean_distances.
        return []

    # One pairwise call for the whole corpus instead of one validated call per offer.
    vectors = [offer.embedding.embedding for offer in offers]
    distances = euclidean_distances(vectors, [reference_embedding])[:, 0]

    # sorted() is stable, matching the ordering of the previous in-place sort.
    ranked = sorted(zip(offers, distances), key=lambda pair: pair[1])
    return ranked[:k]


# Free-text query used to probe the corpus; it is embedded with the same model as the offers.
reference_sentence = "I'm looking for an internship in London."
reference_embedding = get_text_embedding(reference_sentence)

In [40]:
# Rank the whole corpus against the query sentence and print the ten nearest offers.
k_most_similar_offers = get_k_most_similar_offers(reference_embedding, offers, k=10)
for offer, distance in k_most_similar_offers:
    print(f"Distance: {distance}")
    print(f"Offer: {offer.offer.metadata_repr()}")

Distance: 0.7367216294390317
Offer: This job offer named 'Marketing Intern - UK Market' was published on May 08, 2024. It is from the company 'Aircall'. It is located in London, United Kingdom. Sectors: SaaS / Cloud Services, Electronique / Télécommunications.
Distance: 0.7501276634233847
Offer: This job offer named 'B2B Business developer UK - Internship' was published on May 03, 2024. It is from the company 'Club Employés'. It is located in London, United Kingdom, Marseille, France, Lyon, France, Paris, France. Sectors: SaaS / Cloud Services, Art de vivre.
Distance: 0.7535626826119545
Offer: This job offer named 'Internship - Marketing Assistant M/F' was published on April 29, 2024. It is from the company 'Onepilot'. It is located in Paris, France. Sectors: Intelligence artificielle / Machine Learning, SaaS / Cloud Services, Accompagnement d'entreprises.
Distance: 0.753813184401199
Offer: This job offer named 'Marketing & Communication - Internship - Paris' was published on May 06, 2

In [18]:
# Spot-check one offer by position.
# NOTE(review): 1555 is a hand-picked index; it only points at the same offer while the
# scrape results and the order in which `offers` was built stay unchanged.
offers[1555].offer

JobOffer(benefits=[], contract_duration_maximum=None, contract_duration_minimum=None, contract_type='internship', education_level=None, experience_level_minimum=None, has_contract_duration=False, has_education_level=False, has_experience_level_minimum=False, has_remote=True, has_salary_yearly_minimum=False, language='fr', name='Rédacteur / Rédactrice Marketing - Stage - Barcelone', new_profession=Profession(category_name='Communication, marketing et publicité', sub_category_name='Planification et stratégie marketing', sub_category_reference='marketing-strategy-and-planning-wNWU0'), offices=[Office(country='Spain', local_city='Barcelona', local_state='Catalunya')], organization=Organization(description="Eux, c'est papernest.\r\n\r\nCertains évoqueront les challenges ambitieux, d’autres, l’inégalable esprit d’équipe ou encore l’ambiance de travail survoltée, voire les trois en même temps. Une chose est sûre : chez eux, on ne s’ennuie pas. \r\n\r\nCe qui les anime : imaginer leurs prochai

In [18]:
# Inspect the requirements text of the first offer (the output shows it still contains raw HTML tags and entities).
offers[0].offer.requirements_repr()

"The profile sought for this position is: '<ul><li>5 ans d&#39;expérience ou plus en analyse quantitative, de préférence dans le secteur du jeu vidéo</li><li>Baccalauréat ou diplôme supérieur en mathématiques, informatique décisionnelle, économie, génie ou un autre champ technique</li><li>Excellente compréhension des pratiques commerciales et de la monétisation</li><li>Excellentes compétences sur SQL\xa0: expérience poussée en recherche dans des listes de données volumineuses et complexes</li><li>Connaissance d&#39;un langage de script, que ce soit R ou Python, un atout</li><li>Capacité démontrée à travailler dans un environnement dynamique, à respecter des échéanciers fluctuants et à gérer les priorités sur plusieurs projets simultanés</li><li>Excellent sens de l&#39;organisation, de la communication et des relations</li><li>À l&#39;aise et confiant pour offrir des présentations et communiquer des analyses</li><li>Excellente maîtrise de l&#39;anglais, écrit et oral (obligatoire)</li><

In [36]:
# Total number of (embedding, offer) pairs across the four searches.
len(offers)

3592

In [None]:

# (Removed an unfinished `offer.` expression: it is a SyntaxError and would stop "Run All".)