In [35]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline

In [36]:
df_societe = pd.read_csv('societes.csv')

In [37]:
# Utiliser `get_dummies` pour créer des colonnes de mots clés
keywords_dummies = df_societe['mots_cles_def'].str.get_dummies(sep=', ')
market_dummies = df_societe['market'].str.get_dummies(sep=', ')
activite_dummies = df_societe['Activité principale'].str.get_dummies(sep=', ')

In [38]:
print(type(keywords_dummies)) 
print(type(market_dummies))  
print(type(activite_dummies))  

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [39]:
def create_and_train_pipeline(X_extended):
    # Réindexation par sécurité
    X_extended.reset_index(drop=True, inplace=True)
    
    # Initialisation et entraînement du modèle KNN
    pipeline = Pipeline([
        ('knn', NearestNeighbors(n_neighbors=16, metric='manhattan'))
    ])
    pipeline.fit(X_extended)

    return pipeline

In [40]:
X_extended = pd.concat([keywords_dummies, market_dummies, activite_dummies], axis=1)
pipeline = create_and_train_pipeline(X_extended)

In [41]:
def recommend_societes(entreprise_id, data, X_extended, pipeline):
    """ Recommande des sociétés similaires à partir des mots-clés. """
    
    # Vérifier si l'ID de la société existe dans les données
    if entreprise_id not in data['entreprise_id'].values:
        print(f" {entreprise_id} n'existe pas")
        return []

    # Trouver l'index de la société dans le DataFrame
    entreprise_index = data.index[data['entreprise_id'] == entreprise_id].tolist()[0]

    # Extraire les caractéristiques de la société
    entreprise_data = X_extended.loc[entreprise_index].to_frame().T

    # Trouver les sociétés les plus similaires
    distances, indices = pipeline.named_steps['knn'].kneighbors(entreprise_data)

    # Récupérer les voisins
    voisins = data.iloc[indices[0]].copy()
    voisins['Distance'] = distances[0]

    # Exclure la société d'origine des recommandations
    voisins = voisins[voisins['entreprise_id'] != entreprise_id]

    # Trier les recommandations par proximité
    voisins = voisins.sort_values(by='Distance')

    # Construire la liste des sociétés recommandées
    recommended_societes = []
    for _, row in voisins.iterrows():
        recommended_societes.append({
            "id": row['entreprise_id'],
            "nom": row['nom'],
            "mot_cles": row['mots_cles_def'],
            "description": row['description'],
            "market": row['market'],
            "activite": row['Activité principale'],
            "distance": round(row['Distance'], 3)
        })

    return recommended_societes

In [42]:
# 🔹 Exemple d'utilisation :
entreprise_id_test = 534  # Remplace par un ID existant dans ton dataset
reco = recommend_societes(entreprise_id_test, df_societe, X_extended, pipeline)

# 🔹 Affichage des recommandations
for i, societe in enumerate(reco[:10], 1):
    print(f"{i}. {societe['nom']} {societe['description']} {societe['mot_cles']} {societe['mot_cles']} {societe['activite']} (Distance: {societe['distance']})")

1. LYTID Developing, manufacturing and commercializing terahertz technologies for scientific and industrial applications deep tech, hardware, manufacturing, quantum technologies, selling own inventory, semiconductors deep tech, hardware, manufacturing, quantum technologies, selling own inventory, semiconductors nan (Distance: 3.0)
2. HOASYS Developing Optical Systems for the industry and medicine customers, as a cost effective technological step forward deep tech, hardware, manufacturing, selling own inventory, semiconductors deep tech, hardware, manufacturing, selling own inventory, semiconductors nan (Distance: 4.0)
3. KEYSOM Keysom develops RISC-V based processor architectures that are automatically built from the application source code and dynamically reconfigurable deep tech, hardware, saas, semiconductors, subscription deep tech, hardware, saas, semiconductors, subscription nan (Distance: 4.0)
4. AIRMEMS Offers innovative electronic switching solutions that shrink size, reduce p