# word2vecの分類　（dbscan Scikit-learn）

scikit-learn を使用して、word2vec の単語ベクトルをクラスタリングする。
DBSCAN のパラメータ（eps と min_samples）を変化させ、シルエットスコアがどう変わるかを調べる。

In [1]:
# 使用ライブラリのインストール

!pip install -U scikit-learn

!pip install --upgrade gensim

!pip install  --upgrade mecab

!brew install mecab
!brew install mecab-ipadic

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
[34m==>[0m [1mDownloading https://formulae.brew.sh/api/formula.jws.json[0m
######################################################################### 100.0%
[34m==>[0m [1mDownloading https://formulae.brew.sh/api/cask.jws.json[0m
######################################################################### 100.0%
To reinstall 0.996, run:
  brew reinstall mecab
To reinstall 2.7.0-20070801, run:
  brew reinstall mecab-ipadic


In [4]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import silhouette_score
from gensim.models import KeyedVectors
from sklearn.cluster import DBSCAN
import numpy as np

def _cluster_in_batches(word_vectors, batch_size, eps, min_samples):
    """Run DBSCAN on consecutive slices of ``word_vectors`` and merge the labels.

    Every batch is clustered independently, so a cluster can never span two
    batches. DBSCAN numbers the clusters of each fit from 0, therefore the
    labels of each batch are shifted by the number of clusters found so far;
    otherwise unrelated clusters from different batches would share a label.
    Noise points keep the label -1.

    Returns:
        Integer array with one label per row of ``word_vectors``.
    """
    num_words = len(word_vectors)
    labels = np.full(num_words, -1, dtype=int)
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    next_label = 0

    for start_idx in range(0, num_words, batch_size):
        end_idx = min(start_idx + batch_size, num_words)
        batch_labels = dbscan.fit_predict(word_vectors[start_idx:end_idx])

        clustered = batch_labels >= 0
        labels[start_idx:end_idx] = np.where(clustered, batch_labels + next_label, -1)
        if clustered.any():
            # Labels of one fit are 0..k-1, so max + 1 is the cluster count.
            next_label += int(batch_labels.max()) + 1

    return labels


def _silhouette_without_noise(word_vectors, labels):
    """Silhouette score over the clustered (non-noise) points only.

    Returns NaN when the score is undefined: silhouette_score needs at least
    two clusters and fewer clusters than samples.
    """
    clustered = labels != -1
    n_points = int(clustered.sum())
    n_clusters = len(np.unique(labels[clustered]))
    if not 2 <= n_clusters <= n_points - 1:
        return float("nan")
    return float(silhouette_score(word_vectors[clustered], labels[clustered]))


def dbscan_cluster_words_with_silhouette(model_path, batch_size, eps_range, min_samples_range):
    """Grid-search DBSCAN parameters on word vectors and plot silhouette scores.

    Loads a text-format word2vec file, clusters its vectors batch by batch for
    every (eps, min_samples) combination, prints the silhouette score of each
    combination and finally shows the scores in a 3D scatter plot.

    Noise points (label -1) are excluded from the silhouette score, and
    combinations for which the score is undefined are left out of the plot.

    Args:
        model_path: Path to the vectors in word2vec text format.
        batch_size: Number of vectors clustered per DBSCAN fit (must be >= 1).
        eps_range: Iterable of ``eps`` values to try.
        min_samples_range: Iterable of ``min_samples`` values to try.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Load the word2vec model and take the vectors of the whole vocabulary.
    model = KeyedVectors.load_word2vec_format(model_path, binary=False)
    word_vectors = model.vectors

    # One (eps, min_samples, score) row per combination; score is NaN if undefined.
    silhouette_scores = []

    for eps in eps_range:
        for min_samples in min_samples_range:
            print(f"Processing DBSCAN with eps={eps}, min_samples={min_samples}...")

            cluster_labels = _cluster_in_batches(word_vectors, batch_size, eps, min_samples)
            score = _silhouette_without_noise(word_vectors, cluster_labels)
            silhouette_scores.append((eps, min_samples, score))

            if np.isnan(score):
                print(f"No valid clusters for eps={eps}, min_samples={min_samples}")
            else:
                print(f"Silhouette Score for eps={eps}, min_samples={min_samples}: {score}")

    # reshape keeps the array two-dimensional even when no combination was tried.
    silhouette_scores = np.array(silhouette_scores, dtype=float).reshape(-1, 3)
    plotted = silhouette_scores[~np.isnan(silhouette_scores[:, 2])]
    if len(plotted) == 0:
        print("No parameter combination produced a valid silhouette score; nothing to plot.")
        return

    eps_values = plotted[:, 0]
    min_samples_values = plotted[:, 1]
    scores = plotted[:, 2]

    # 3D scatter of the results, coloured by score.
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    sc = ax.scatter(eps_values, min_samples_values, scores, c=scores, cmap='viridis', s=50)
    ax.set_xlabel('Eps')
    ax.set_ylabel('Min Samples')
    ax.set_zlabel('Silhouette Score')
    ax.set_title('Silhouette Scores for DBSCAN with Varying Eps and Min Samples')
    plt.colorbar(sc, ax=ax, label='Silhouette Score')
    plt.show()


In [6]:
# Run the parameter sweep on the Japanese word2vec vectors.

model_path = '../data/japanese_word2vec_vectors.vec'
batch_size = 1000
eps_range = np.linspace(0.1, 1.0, 10)  # eps values to try
min_samples_range = range(2, 21)       # min_samples values to try

dbscan_cluster_words_with_silhouette(
    model_path=model_path,
    batch_size=batch_size,
    eps_range=eps_range,
    min_samples_range=min_samples_range,
)

Processing DBSCAN with eps=0.1, min_samples=2...
