TODO:
1. Улучшить качество, используя разные модели для русского и английского языка.
2. Итеративный подход (под вопросом) — находим кластеры, переименовываем, снова находим кластеры, переименовываем и т. д., пока количество кластеров не перестанет изменяться.

In [22]:
import json
import os

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
# Prefix prepended to the data and model paths built in this notebook;
# "../" points one level above the current working directory.
base_dir = "../"

In [24]:
# Load the skill table; later cells read its 'key_skill_name' and
# 'key_skill_frequency' columns.
key_skill_csv_path = f'{base_dir}output/key_skill_frequencies.csv'
df = pd.read_csv(key_skill_csv_path)

In [10]:
# Optional filter, currently disabled: keeps only the rows whose
# 'key_skill_frequency' is at least 5. Uncomment to apply it.
# df = df[df['key_skill_frequency'] >= 5]

In [25]:
# Skill names in table order, plus a name -> frequency lookup built from the
# same two columns (a repeated name keeps the frequency of its last row).
name_column = df['key_skill_name']
frequency_column = df['key_skill_frequency']

key_skill_names = name_column.tolist()
key_skill_frequencies = {
    skill_name: frequency
    for skill_name, frequency in zip(name_column, frequency_column)
}

In [26]:
# Use the locally saved copy of the model when it exists; otherwise fetch it
# by its hub name and save it to the same local path for later runs.
local_model_path = f'{base_dir}/models/distiluse-base-multilingual-cased-v1'

if not os.path.exists(local_model_path):
    model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1")
    model.save(local_model_path)
else:
    model = SentenceTransformer(local_model_path)

In [14]:
# Encode every skill name with the sentence-transformer model, in table order.
skill_texts = df['key_skill_name'].tolist()
embeddings = model.encode(skill_texts)

In [15]:
# Pairwise cosine similarity between all skill embeddings (each row against
# every other row of the same array).
similarity_matrix = cosine_similarity(X=embeddings)

In [16]:
def cluster_skills_with_max_similarity(similarity_matrix, threshold):
    """Greedily group item indices into clusters of mutually similar items.

    Items are visited in index order. An item is eligible to join an existing
    cluster only if its similarity to every current member of that cluster is
    at least ``threshold``. Among the eligible clusters, the one with the
    highest average similarity to the item wins; on a tie the cluster that was
    created first is kept. If no cluster is eligible, the item starts a new
    cluster. Because assignment is greedy, the result depends on item order.

    Args:
        similarity_matrix: 2-D NumPy array indexed as
            ``similarity_matrix[i, j]``. The number of items is taken from
            ``shape[0]``; only entries with ``j < i`` are actually read.
        threshold: Minimum similarity an item must have to every member of a
            cluster in order to join it.

    Returns:
        A list of sets of item indices, in cluster-creation order. Every index
        in ``range(similarity_matrix.shape[0])`` appears in exactly one set.
        An empty matrix yields an empty list.
    """
    item_count = similarity_matrix.shape[0]
    clusters = []

    for i in range(item_count):
        # Start below any attainable average so that a cluster which passes
        # the threshold test is never rejected just because its average
        # similarity is zero or negative (possible when threshold <= 0).
        best_average = float('-inf')
        best_cluster = None

        for cluster in clusters:
            # Similarities between item i and every current cluster member.
            similarities = similarity_matrix[i, list(cluster)]

            # The weakest link decides eligibility ...
            if similarities.min() >= threshold:
                # ... and the average decides which eligible cluster wins.
                average = similarities.mean()

                if average > best_average:
                    best_average = average
                    best_cluster = cluster

        if best_cluster is not None:
            best_cluster.add(i)
        else:
            clusters.append({i})

    return clusters

In [17]:
# Group skill indices; an item joins a cluster only if it is at least
# 0.8-similar to every member already in it.
clusters = cluster_skills_with_max_similarity(
    similarity_matrix,
    threshold=0.8,
)

In [18]:
# Translate each cluster of row indices into the corresponding skill names.
key_skill_clusters = []
for member_indices in clusters:
    member_names = [key_skill_names[index] for index in member_indices]
    key_skill_clusters.append(member_names)

In [19]:
# Print every cluster with a 1-based ordinal for manual inspection.
for cluster_number, cluster_members in enumerate(key_skill_clusters, start=1):
    print(f"Кластер {cluster_number}: {cluster_members}")

Кластер 1: ['Linux', 'Unix/Linux', 'Unix']
Кластер 2: ['SQL', 'MySQL', 'NoSQL']
Кластер 3: ['GitHub', 'Git', 'Gitlab']
Кластер 4: ['PostgreeSQL', 'PostgreSQL', 'СУБД PostgreSQL', 'PostrgreSQL', 'PostgreSQL.', 'PostreSQL', 'PostgeSQL', 'PostgresSQL', 'PosgreSQL']
Кластер 5: ['Python, Bash', 'Python']
Кластер 6: ['Docker Compose', 'Docker', 'Docker-compose']
Кластер 7: ['JS', 'JavaScript']
Кластер 8: ['kubernates', 'Kubernetes', 'Kubernets', 'Kubernates', 'Kuberenetes']
Кластер 9: ['TypeScript', 'ТypeScript']
Кластер 10: ['CI\\CD', 'CI/CD', 'CI/СD']
Кластер 11: ['Java', 'Java SE', 'Java EE']
Кластер 12: ['React hook', 'React', 'ReactNative', 'React Native']
Кластер 13: ['Ansible']
Кластер 14: ['DevSecOps', 'DevOps']
Кластер 15: ['PHP8', 'PHP7', 'PHP', 'PHP5']
Кластер 16: ['управление проектом', 'Agile Project Management', 'Project management', 'Проектный менеджмент', 'Управление проектами', 'Проектное управление', 'Управление проектом', 'проектное управление']
Кластер 17: ['Bash']
Класте

In [20]:
# Map every skill name to the most frequent name of its cluster. Names missing
# from the frequency table count as 0; on equal frequency the name listed
# first in the cluster wins.
replacements = {}

for skill_group in key_skill_clusters:
    canonical_name = max(
        skill_group,
        key=lambda candidate: key_skill_frequencies.get(candidate, 0),
    )
    replacements.update({member: canonical_name for member in skill_group})

In [21]:
# Persist the name -> canonical-name mapping as human-readable JSON;
# ensure_ascii=False keeps non-ASCII (e.g. Cyrillic) names unescaped.
replacements_path = f'{base_dir}/output/replacements.json'

with open(replacements_path, 'w+', encoding='utf-8') as replacements_file:
    json.dump(
        replacements,
        replacements_file,
        ensure_ascii=False,
        indent=4,
    )