TODO:
1. Улучшить качество, используя разные модели для русского и английского языка.
2. Итеративный подход (под вопросом): находим кластеры, переименовываем навыки, снова находим кластеры, снова переименовываем — и так далее, пока количество кластеров не перестанет меняться.

In [1]:
import json

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory this notebook reads its input from and writes its result to.
# The trailing slash matters: file paths are built by interpolating this
# value directly in front of the file name.
base_dir = '../output/'

In [3]:
# Load the skill table. Later cells rely on its 'key_skill_name' and
# 'key_skill_frequency' columns.
df = pd.read_csv(f'{base_dir}key_skill_frequencies.csv')

In [4]:
# Optional filter, currently disabled: when enabled it keeps only the rows
# whose 'key_skill_frequency' is at least 5.
# df = df[df['key_skill_frequency'] >= 5]

In [5]:
# Skill names in row order; cluster indices produced later refer to positions
# in this list.
key_skill_names = df['key_skill_name'].tolist()

# Lookup table: skill name -> frequency (for a repeated name the last row wins).
key_skill_frequencies = {
    skill: frequency
    for skill, frequency in zip(df['key_skill_name'], df['key_skill_frequency'])
}

In [6]:
# Sentence-embedding model used to vectorise the skill names.
# NOTE(review): presumably a multilingual checkpoint was chosen because the
# skill list mixes Russian and English names — confirm.
model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1")

In [7]:
# Encode exactly the list that is later used to translate cluster indices back
# into names, so that row i of `embeddings` always corresponds to
# key_skill_names[i] (rebuilding the list from `df` here could drift out of
# sync if `df` is changed and cells are re-run selectively).
embeddings = model.encode(key_skill_names)

In [8]:
# Pairwise cosine similarity between all embeddings: entry [i, j] compares
# embedding row i with embedding row j.
similarity_matrix = cosine_similarity(embeddings)

In [9]:
def cluster_skills_with_max_similarity(similarity_matrix, threshold):
    """Greedily partition items into clusters based on pairwise similarity.

    Items are visited once, in index order. An item may join an existing
    cluster only if its similarity to *every* current member of that cluster
    is at least ``threshold``. Among all clusters that qualify, the item joins
    the one with the highest mean similarity to its members. If no cluster
    qualifies, the item starts a new cluster of its own.

    Because the assignment is greedy and single-pass, the result depends on
    the order of the rows in ``similarity_matrix``.

    Args:
        similarity_matrix: 2-D array supporting NumPy-style indexing (for
            example the output of ``cosine_similarity``), expected to be
            n x n, where entry ``[i, j]`` is the similarity of items i and j.
        threshold: Minimum similarity an item must have to every member of a
            cluster in order to be allowed to join it.

    Returns:
        A list of sets of row indices, in order of cluster creation. Every
        index in ``range(n)`` appears in exactly one set. An empty matrix
        yields an empty list.
    """
    n = similarity_matrix.shape[0]
    clusters = []

    for i in range(n):
        # Start below any possible score so that a cluster which passed the
        # threshold check is never rejected merely because its mean
        # similarity is zero or negative (possible when threshold <= 0).
        best_score = float("-inf")
        best_cluster = None

        for cluster in clusters:
            # Similarities between item i and each current member, read once
            # and reused for both the admission check and the ranking.
            member_similarities = similarity_matrix[i, list(cluster)]

            # Admission rule: the weakest link must still reach the threshold.
            if member_similarities.min() >= threshold:
                mean_similarity = member_similarities.mean()

                # Strict comparison: on ties the earliest cluster wins.
                if mean_similarity > best_score:
                    best_score = mean_similarity
                    best_cluster = cluster

        if best_cluster is None:
            clusters.append({i})
        else:
            best_cluster.add(i)

    return clusters

In [10]:
# Group the skills; 0.8 is the minimum similarity a skill must have to every
# member of a cluster before it may join that cluster.
clusters = cluster_skills_with_max_similarity(
    similarity_matrix=similarity_matrix,
    threshold=0.8,
)

In [11]:
# Translate each cluster of row indices into the corresponding skill names.
key_skill_clusters = [
    list(map(key_skill_names.__getitem__, member_indices))
    for member_indices in clusters
]

In [12]:
# Print every cluster with a 1-based number so the grouping can be inspected
# by eye.
for i, cluster in enumerate(key_skill_clusters):
    print(f"Кластер {i + 1}: {cluster}")

Кластер 1: ['Linux', 'Unix/Linux', 'Unix']
Кластер 2: ['MySQL', 'SQL', 'NoSQL']
Кластер 3: ['Git', 'Gitlab', 'GitHub']
Кластер 4: ['PostrgreSQL', 'PostgreSQL', 'PosgreSQL', 'PostgresSQL', 'СУБД PostgreSQL', 'PostgreeSQL', 'PostreSQL', 'PostgeSQL', 'PostgreSQL.']
Кластер 5: ['python3', 'Python', 'Python 3.x', 'Python 3']
Кластер 6: ['Docker-compose', 'Docker', 'Docker Compose']
Кластер 7: ['JS', 'JavaScript']
Кластер 8: ['Kustomize', 'Kubernetes']
Кластер 9: ['TypeScript', 'ТypeScript']
Кластер 10: ['CI/CD', 'CI/СD', 'CI\\CD']
Кластер 11: ['React Native', 'React', 'ReactNative', 'react.native']
Кластер 12: ['Java EE', 'Java', 'Java SE']
Кластер 13: ['PHP', 'PHP5', 'php8', 'PHP8', 'PHP7']
Кластер 14: ['Ansible']
Кластер 15: ['DevSecOps', 'DevOps']
Кластер 16: ['RESTful API', 'REST API/Open API', 'REST API']
Кластер 17: ['Проектное управление', 'Agile Project Management', 'проектное управление', 'Проектный менеджмент', 'Управление проектами', 'Project management', 'Управление проектом', '

In [13]:
# Map every skill name to the canonical name of its cluster: the member with
# the highest recorded frequency (a name without a recorded frequency counts
# as 0; on ties the first member encountered wins).
replacements = {}

for key_skill_cluster in key_skill_clusters:
    most_popular_skill = max(
        key_skill_cluster,
        key=lambda candidate: key_skill_frequencies.get(candidate, 0),
    )
    # Point all members of the cluster, the canonical one included, at it.
    replacements.update(dict.fromkeys(key_skill_cluster, most_popular_skill))

In [14]:
# Persist the mapping as indented JSON. ensure_ascii=False keeps non-ASCII
# (e.g. Cyrillic) skill names readable instead of \uXXXX escapes.
# Plain write mode is sufficient: the file is only written here, never read.
with open(f'{base_dir}replacements.json', 'w', encoding='utf-8') as f:
    json.dump(replacements, f, ensure_ascii=False, indent=4)