In [2]:
import pandas as pd
import spacy 
import nltk
# Fetch the NLTK 'punkt' tokenizer data (a no-op when it is already downloaded).
# NOTE(review): nltk is not used by any other visible cell - confirm it is still needed.
nltk.download('punkt')
# Load the small Russian spaCy pipeline; tokenize_data calls it through the global `nlp`.
nlp = spacy.load('ru_core_news_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\velic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Single place to change when the data lives elsewhere; previously this
# absolute, machine-specific directory was repeated in every read_csv call.
DATA_DIR = 'D:/Programming/Onomastiki/flaskProject/templates/data'

# One frame per source; each is expected to provide a "Meaning" text column
# (that is the column tokenize_data reads).
namesTeachers = pd.read_csv(f'{DATA_DIR}/namesTeachers.csv')
namesStudents = pd.read_csv(f'{DATA_DIR}/namesStudents.csv')
namesCombined = pd.read_csv(f'{DATA_DIR}/namesCombined.csv')

In [6]:
def tokenize_data(data, column="Meaning", nlp_model=None):
    """Tokenize a text column and store the result in ``Tokenized_Meaning``.

    Every string in ``data[column]`` is run through the tokenizer and the
    texts of all non-punctuation tokens are kept. Cells that are not strings
    (for example the ``NaN`` that ``pd.read_csv`` produces for empty cells)
    yield an empty token list instead of raising inside the tokenizer.

    Args:
        data: DataFrame containing ``column``. It is modified in place.
        column: Name of the text column to tokenize.
        nlp_model: Callable mapping a string to an iterable of tokens that
            expose ``.text`` and ``.is_punct``. When ``None`` the notebook-level
            ``nlp`` pipeline is used.

    Returns:
        The same ``data`` object, now carrying a ``Tokenized_Meaning`` column
        with one list of token strings per row.
    """
    # Resolve the global lazily so an explicit tokenizer never needs `nlp`.
    pipeline = nlp if nlp_model is None else nlp_model

    tokenized_meanings = []
    for text in data[column]:
        if not isinstance(text, str):
            # Missing / non-text cell: keep the row, but with no tokens.
            tokenized_meanings.append([])
            continue
        tokenized_meanings.append(
            [token.text for token in pipeline(text) if not token.is_punct]
        )

    # Attach the tokens to the frame the caller passed in.
    data["Tokenized_Meaning"] = tokenized_meanings
    return data

# Example usage of the function
colName = 'Meaning'  # NOTE(review): not referenced by any visible cell - confirm before relying on it
# tokenize_data adds the "Tokenized_Meaning" column and returns the frame it was given.
namesTeachers = tokenize_data(namesTeachers)
namesStudents = tokenize_data(namesStudents)
namesCombined = tokenize_data(namesCombined)
# namesCombined.head(10)

In [7]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
# Train one Word2Vec model per data source; all three share the same hyperparameters.
w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)

students_sentences = namesStudents['Tokenized_Meaning'].tolist()
model_students = Word2Vec(sentences=students_sentences, **w2v_params)

teachers_sentences = namesTeachers['Tokenized_Meaning'].tolist()
model_teachers = Word2Vec(sentences=teachers_sentences, **w2v_params)

combined_sentences = namesCombined['Tokenized_Meaning'].tolist()
model_combined = Word2Vec(sentences=combined_sentences, **w2v_params)

# Persist the models to the working directory; later cells reload them by file name.
model_students.save("word2vec_students.model")
model_teachers.save("word2vec_teachers.model")
model_combined.save("word2vec_combined.model")


In [8]:
from gensim.models import Word2Vec
import numpy as np

def vectorize_meanings(data, model_path):
    """Average the Word2Vec vectors of each row's tokens into a ``Vectors`` column.

    Args:
        data: DataFrame with a ``Tokenized_Meaning`` column holding an iterable
            of tokens per row. It is modified in place.
        model_path: Path of a saved Word2Vec model, or an already loaded model
            (any object exposing ``.wv`` and ``.vector_size``). Passing the
            model itself skips the reload from disk.

    Returns:
        The same ``data`` object, now carrying a ``Vectors`` column. Each entry
        is the mean of the vectors of the row's tokens that are known to the
        model, or a zero vector of length ``vector_size`` when none are known.
    """
    # Anything exposing `wv` is treated as an in-memory model; plain strings
    # and path objects have no such attribute and are loaded from disk.
    model = model_path if hasattr(model_path, "wv") else Word2Vec.load(model_path)
    keyed_vectors = model.wv
    zero_vector = np.zeros(model.vector_size)

    vectorized_meanings = []
    for tokens in data["Tokenized_Meaning"]:
        # Out-of-vocabulary tokens are ignored rather than raising KeyError.
        vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        if vectors:
            vectorized_meanings.append(np.mean(vectors, axis=0))
        else:
            # Copy so rows never share (and accidentally mutate) one array.
            vectorized_meanings.append(zero_vector.copy())

    # Attach the averaged vectors to the frame the caller passed in.
    data["Vectors"] = vectorized_meanings
    return data

# Vectorize the data using the vectorize_meanings function
# Paths to the trained Word2Vec models, one per data source.
model_students_path = "word2vec_students.model"
model_teachers_path = "word2vec_teachers.model"  
model_combined_path = "word2vec_combined.model"  # Path to the trained Word2Vec model
# Each call adds a "Vectors" column and returns the frame it was given.
namesStudents = vectorize_meanings(namesStudents, model_students_path)
namesTeachers = vectorize_meanings(namesTeachers, model_teachers_path)
namesCombined = vectorize_meanings(namesCombined, model_combined_path)


In [9]:
import numpy as np
def k_means(data, k, max_iter=300, seed=None):
    """Cluster ``data`` into ``k`` groups with Lloyd's k-means and return the centroids.

    Fixes over the previous version:
      * initial centroids are drawn *without* replacement, so the same point
        can no longer be picked twice;
      * a cluster that ends up empty keeps its previous centroid instead of
        being dropped (dropping changed the number of centroids and broke the
        convergence check);
      * the loop is bounded by ``max_iter``;
      * the result is always an ndarray (previously a list or an ndarray,
        depending on when the loop stopped).

    Args:
        data: Array-like of points; the first axis enumerates the points.
        k: Number of clusters, ``1 <= k <= len(data)``.
        max_iter: Upper bound on the number of update iterations.
        seed: Optional seed for the centroid initialisation. When ``None`` the
            global ``np.random`` state is used, as before.

    Returns:
        ndarray of shape ``(k,) + data.shape[1:]`` holding the centroids.

    Raises:
        ValueError: If ``data`` is empty or scalar, or ``k`` is out of range.
    """
    points = np.asarray(data, dtype=float)
    if points.ndim == 0 or len(points) == 0:
        raise ValueError("data must contain at least one point")
    n_points = len(points)
    if not 1 <= k <= n_points:
        raise ValueError(f"k must be between 1 and {n_points}, got {k}")

    # Work on flattened points so any point shape is supported; the Euclidean
    # norm of the flattened difference equals the default norm used before.
    flat = points.reshape(n_points, -1)

    # Centroid initialisation: k distinct data points.
    chooser = np.random if seed is None else np.random.default_rng(seed)
    centroids = flat[chooser.choice(n_points, size=k, replace=False)]

    for _ in range(max_iter):
        # Assign every point to its nearest centroid (ties go to the lowest index).
        distances = np.stack(
            [np.linalg.norm(flat - centroid, axis=1) for centroid in centroids],
            axis=1,
        )
        labels = distances.argmin(axis=1)

        # Update centroids; an empty cluster keeps its previous position.
        new_centroids = centroids.copy()
        for idx in range(k):
            members = flat[labels == idx]
            if len(members) > 0:
                new_centroids[idx] = members.mean(axis=0)

        # Convergence check.
        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids

    return centroids.reshape((k,) + points.shape[1:])

# Example usage
# Stack the per-row vectors of each frame into a float array for k_means.
students = np.array(namesStudents['Vectors'].tolist(), dtype=float)
teachers = np.array(namesTeachers['Vectors'].tolist(), dtype=float)
combination = np.array(namesCombined['Vectors'].tolist(), dtype=float)
# With a single cluster the converged centroid is the mean of all points.
k = 1
centroids_students = k_means(students, k)
centroids_teachers = k_means(teachers, k)
centroids_comb = k_means(combination, k)

In [10]:
def find_closest_vector(centroid, vectors):
    """Return the element of ``vectors`` with the smallest norm of ``centroid - element``.

    The first element wins on ties. ``None`` is returned when ``vectors`` is
    empty or when no distance compares strictly below infinity.
    """
    best_vector = None
    best_distance = float('inf')

    for candidate in vectors:
        candidate_distance = np.linalg.norm(centroid - candidate)
        # Only a strictly smaller distance replaces the current best.
        if not candidate_distance < best_distance:
            continue
        best_vector, best_distance = candidate, candidate_distance

    return best_vector

# Per-row vectors of each source (the "Vectors" columns, as pandas Series).
vectorS = namesStudents['Vectors']
vectorT = namesTeachers['Vectors']
vectorC = namesCombined['Vectors']
# For each source, pick the row vector nearest to that source's k-means result.
closest_vector_students = find_closest_vector(centroids_students, vectorS)
closest_vector_teachers = find_closest_vector(centroids_teachers, vectorT)
closest_vector_combined = find_closest_vector(centroids_comb, vectorC)

In [11]:
# NOTE(review): decodedStudent is assigned but this cell queries with closest_vector_students directly.
decodedStudent = closest_vector_students
# Ask the students model for the single vocabulary entry most similar to that vector.
similar_word = model_students.wv.most_similar(positive=[closest_vector_students], topn=1)
# Print the first element of the top result.
print('Ближайший токен:', similar_word[0][0])

Ближайший токен: с


In [12]:
# NOTE(review): decodedTeacher is assigned but this cell queries with closest_vector_teachers directly.
decodedTeacher = closest_vector_teachers
# Ask the teachers model for the single vocabulary entry most similar to that vector.
similarW2 = model_teachers.wv.most_similar(positive=[closest_vector_teachers], topn=1)
# Print the first element of the top result.
print('Ближайший токен:', similarW2[0][0])

Ближайший токен: поток


In [13]:
# Alias for the combined-source vector chosen by find_closest_vector.
decodedCombined = closest_vector_combined
# Ask the combined model for the single vocabulary entry most similar to that vector.
similarW1 = model_combined.wv.most_similar(positive=[decodedCombined], topn=1)
# Print the first element of the top result.
print('Ближайший токен:',similarW1[0][0])

Ближайший токен: Бога
