In [1]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np

from cuml.cluster import KMeans as cuKMeans
from cuml.ensemble import RandomForestClassifier as cuRF

import sys
sys.path.append('/kaggle/input/kaggleword2vecutility/')
from KaggleWord2VecUtility import KaggleWord2VecUtility

def create_bag_of_centroids(wordlist, word_centroid_map, num_centroids=None):
    """Count how many words of a review fall into each cluster.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens of a single review.
    word_centroid_map : dict
        Maps a word to the integer index of the cluster it belongs to.
    num_centroids : int, optional
        Length of the returned vector. When omitted it is derived as
        ``max(word_centroid_map.values()) + 1``, which scans the whole map
        on every call; callers that featurize many reviews with the same
        map should compute it once and pass it in.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` array of length ``num_centroids`` whose entry ``k``
        is the number of tokens in ``wordlist`` mapped to cluster ``k``.
        Tokens missing from ``word_centroid_map`` are ignored. An empty map
        (with ``num_centroids`` omitted) yields a zero-length array.

    Raises
    ------
    IndexError
        If an explicit ``num_centroids`` is not larger than a cluster index
        encountered for one of the tokens.
    """
    if num_centroids is None:
        # default=-1 makes an empty map produce a width of 0 instead of
        # raising ValueError from max() on an empty sequence.
        num_centroids = max(word_centroid_map.values(), default=-1) + 1

    bag_of_centroids = np.zeros(num_centroids, dtype="float32")

    for word in wordlist:
        # Single dictionary lookup per token; unknown words return None.
        index = word_centroid_map.get(word)
        if index is not None:
            bag_of_centroids[index] += 1

    return bag_of_centroids

In [2]:
if __name__ == '__main__':
    # ---- Load the pre-trained Word2Vec model --------------------------------
    model = Word2Vec.load('/kaggle/input/300features-40minwords-10context/300features_40minwords_10context')

    # ---- Cluster the word vectors with K-means (timed) ----------------------
    start = time.time()

    word_vectors = model.wv.vectors
    # Integer division: the cluster count is one fifth of the number of vectors.
    num_clusters = word_vectors.shape[0] // 5

    print("Running K means")
    kmeans_clustering = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    idx = kmeans_clustering.fit_predict(word_vectors)

    end = time.time()
    elapsed = end - start
    print("Time taken for K Means clustering:", elapsed, "seconds.")

    # Pair every vocabulary word with the cluster index assigned to its vector.
    word_centroid_map = dict(zip(model.wv.index_to_key, idx))

    # ---- Show the members of the first ten clusters -------------------------
    for cluster in range(10):
        print("\nCluster %d" % cluster)
        words = [token for token, assigned in word_centroid_map.items() if assigned == cluster]
        print(words)

    # ---- Read the labelled training set and the unlabelled test set ---------
    # Both files are tab-separated with a header row; quoting=3 is
    # csv.QUOTE_NONE, so quote characters are kept as literal text.
    tsv_options = dict(header=0, delimiter="\t", quoting=3)
    train = pd.read_csv('/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv/labeledTrainData.tsv',
                        **tsv_options)
    test = pd.read_csv('/kaggle/input/word2vec-nlp-tutorial/testData.tsv/testData.tsv',
                       **tsv_options)

    # ---- Turn each raw review into a list of words (stop words removed) -----
    def _to_wordlist(text):
        return KaggleWord2VecUtility.review_to_wordlist(text, remove_stopwords=True)

    print("Cleaning training reviews")
    clean_train_reviews = [_to_wordlist(text) for text in train["review"]]

    print("Cleaning test reviews")
    clean_test_reviews = [_to_wordlist(text) for text in test["review"]]

    # ---- Build one bag-of-centroids row per review --------------------------
    # Each matrix is pre-allocated and filled row by row; iterating a 2-D
    # array yields row views, so slice assignment writes into the matrix.
    train_centroids = np.zeros((len(clean_train_reviews), num_clusters), dtype="float32")
    for row, wordlist in zip(train_centroids, clean_train_reviews):
        row[:] = create_bag_of_centroids(wordlist, word_centroid_map)

    test_centroids = np.zeros((len(clean_test_reviews), num_clusters), dtype="float32")
    for row, wordlist in zip(test_centroids, clean_test_reviews):
        row[:] = create_bag_of_centroids(wordlist, word_centroid_map)

    # ---- Train a 100-tree random forest and predict test sentiment ----------
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    print("Fitting a random forest to labeled training data...")
    forest.fit(train_centroids, train["sentiment"])
    result = forest.predict(test_centroids)

    # ---- Write the submission file ------------------------------------------
    output = pd.DataFrame({"id": test["id"], "sentiment": result})
    output.to_csv("/kaggle/working/BagOfCentroids.csv", index=False, quoting=3)
    print("Wrote /kaggle/working/BagOfCentroids.csv")


Running K means
Time taken for K Means clustering: 574.2157273292542 seconds.

Cluster 0
['whereas', 'glory', 'undoubtedly', 'alongside', 'saga', 'imitation', 'arguably', 'outing', 'contribution', 'demille', 'comeback', 'foremost', 'pairing', 'blockbusters', 'greats', 'biopic', 'forties', 'filmography', 'earliest', 'creations', 'romances', 'nineties', 'twenties', 'vaudeville', 'achievements', 'warners', 'icons', 'output', 'talkie', 'offerings', 'schindler', 'talkies', 'twentieth', 'seller', 'successes', 'heyday', 'operas', 'disappointments', 'viii', 'ealing', 'noirs', 'outings', 'silents', 'swashbuckler', 'singin', 'melodramas', 'collaborations']

Cluster 1
['directly', 'blank', 'visible', 'randomly', 'boom', 'rear']

Cluster 2
['idiots', 'losers', 'fools', 'morons']

Cluster 3
['dates', 'cons', 'rivals', 'geeks', 'professionals', 'pros', 'queens', 'arguing', 'competing', 'participants', 'somethings', 'clowns', 'amateurs', 'troupe', 'unknowns', 'sessions', 'giants', 'misfits', 'premise