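The datasets used in this notebook (labeledTrainData.tsv, testData.tsv and unlabeledTrainData.tsv) can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial and are read from the `datasets/` directory.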

In [5]:
import pandas as pd
import numpy as np

# Read data from files
# All three files are read with the same options: first line is the header,
# fields are tab-separated, and quoting=3 (csv.QUOTE_NONE) keeps double
# quotes inside the review text as literal characters.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv("datasets/labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("datasets/testData.tsv", **tsv_options)
unlabeled_train = pd.read_csv("datasets/unlabeledTrainData.tsv", **tsv_options)

# Verify the number of reviews that were read (100,000 in total)
review_counts = tuple(
    frame["review"].size for frame in (train, test, unlabeled_train)
)
print(
    "Read %d labeled train reviews, %d labeled test reviews, "
    "and %d unlabeled reviews\n" % review_counts
)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [6]:
# Import various modules for string cleaning
import re
import warnings
from nltk.corpus import stopwords
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

# Suppress BeautifulSoup's MarkupResemblesLocatorWarning for the rest of the
# session so it does not flood the cell output while reviews are parsed.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
# NLTK's English stop-word list, stored as a set so membership tests are cheap.
stops = set(stopwords.words("english"))


def review_to_wordlist(review, remove_stopwords=False):
    """Convert a raw review (or a single sentence) into a list of words.

    Args:
        review: Raw text, possibly containing HTML markup.
        remove_stopwords: When True, drop every word that appears in the
            module-level ``stops`` set. Defaults to False, which keeps all
            words (the behaviour every existing caller relies on).

    Returns:
        A list of lowercase tokens consisting only of ASCII letters and
        digits, in their original order.
    """
    # 1. Remove HTML. The parser is named explicitly: without it BeautifulSoup
    #    picks whichever parser happens to be installed, so the extracted text
    #    could differ from one machine to another.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    #
    # 2. Replace every character that is not an ASCII letter or digit
    #    with a space
    review_text = re.sub("[^a-zA-Z0-9]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words
    if remove_stopwords:
        words = [word for word in words if word not in stops]
    #
    # 5. Return a list of words
    return words

In [7]:
import nltk.data

# Presumably a one-time setup step: run nltk.download() if the punkt
# tokenizer data is not installed yet.
# nltk.download()

# Load the English punkt sentence tokenizer from the NLTK data path
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")


# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer):
    """Split a review into sentences and turn each sentence into words.

    Args:
        review: Raw review text.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences found in ``text``.

    Returns:
        A list of sentences, where each sentence is the list of words
        produced by ``review_to_wordlist``. Empty raw sentences are skipped.
    """
    # Trim surrounding whitespace, then let the tokenizer find the sentences
    raw_sentences = tokenizer.tokenize(review.strip())

    # Clean every non-empty sentence; the result is a list of word lists
    return [
        review_to_wordlist(raw_sentence)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

In [8]:
# Collect the tokenized sentences of both corpora in one flat list
sentences = []

# Each entry pairs the progress message with the frame whose reviews are parsed
corpora = (
    ("Parsing sentences from training set", train),
    ("Parsing sentences from unlabeled set", unlabeled_train),
)

for message, frame in corpora:
    print(message)
    for review in frame["review"]:
        sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set
Parsing sentences from unlabeled set


In [None]:
# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages
import logging

# Show INFO-level log records, prefixed with timestamp and severity
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s : %(levelname)s : %(message)s"
)

# Hyperparameters handed to Word2Vec below
num_features = 1000  # Word vector dimensionality
min_word_count = 20  # Minimum word count
num_workers = 4  # Number of threads to run in parallel
context = 15  # Context window size
downsampling = 1e-3  # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec

print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    vector_size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Persist the trained model under a name that records its main settings;
# it can be restored later with Word2Vec.load()
model_name = "models/1000features_20minwords_15context"
model.save(model_name)

2024-10-04 08:50:26,630 : INFO : collecting all words and their counts
2024-10-04 08:50:26,630 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-10-04 08:50:26,656 : INFO : PROGRESS: at sentence #10000, processed 227101 words, keeping 18037 word types
2024-10-04 08:50:26,684 : INFO : PROGRESS: at sentence #20000, processed 454422 words, keeping 25321 word types
2024-10-04 08:50:26,715 : INFO : PROGRESS: at sentence #30000, processed 674816 words, keeping 30471 word types
2024-10-04 08:50:26,751 : INFO : PROGRESS: at sentence #40000, processed 902035 words, keeping 34850 word types
2024-10-04 08:50:26,787 : INFO : PROGRESS: at sentence #50000, processed 1122616 words, keeping 38319 word types
2024-10-04 08:50:26,825 : INFO : PROGRESS: at sentence #60000, processed 1345401 words, keeping 41326 word types


Training model...


2024-10-04 08:50:26,865 : INFO : PROGRESS: at sentence #70000, processed 1569458 words, keeping 43964 word types
2024-10-04 08:50:26,898 : INFO : PROGRESS: at sentence #80000, processed 1789870 words, keeping 46393 word types
2024-10-04 08:50:26,929 : INFO : PROGRESS: at sentence #90000, processed 2015433 words, keeping 48855 word types
2024-10-04 08:50:26,959 : INFO : PROGRESS: at sentence #100000, processed 2238378 words, keeping 50962 word types
2024-10-04 08:50:26,989 : INFO : PROGRESS: at sentence #110000, processed 2458629 words, keeping 52867 word types
2024-10-04 08:50:27,027 : INFO : PROGRESS: at sentence #120000, processed 2682003 words, keeping 54946 word types
2024-10-04 08:50:27,057 : INFO : PROGRESS: at sentence #130000, processed 2909250 words, keeping 56731 word types
2024-10-04 08:50:27,086 : INFO : PROGRESS: at sentence #140000, processed 3123021 words, keeping 58266 word types
2024-10-04 08:50:27,123 : INFO : PROGRESS: at sentence #150000, processed 3349974 words, ke

In [17]:
from gensim.models import Word2Vec
from concurrent.futures import ThreadPoolExecutor

# Load the Word2Vec model from the same path the training cell saves it to.
# NOTE(review): gensim model files are pickle-based, so only load files from
# a trusted source.
model = Word2Vec.load("models/1000features_20minwords_15context")


def parallelize_reviews(reviews):
    """Clean every review with ``review_to_wordlist`` on a thread pool.

    Args:
        reviews: Iterable of raw review strings.

    Returns:
        A list of word lists, in the same order as ``reviews``.
    """
    with ThreadPoolExecutor() as pool:
        cleaned = pool.map(review_to_wordlist, reviews)
        # Drain the lazy iterator while the pool is still open
        return list(cleaned)

# Convert the reviews of the train and test frames into lists of words
clean_train_reviews = parallelize_reviews(train["review"])
clean_test_reviews = parallelize_reviews(test["review"])


In [8]:
from sklearn.cluster import KMeans
import time

start = time.time()  # Start time

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
word_vectors = model.wv.vectors
num_clusters = len(word_vectors) // 5


# Initialize a k-means object and use it to extract centroids.
# random_state is fixed so that centroid initialisation, and therefore the
# cluster id assigned to every word, is the same on every run; without it
# the downstream bag-of-centroids features change each time this cell runs.
kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=0)
# idx[i] is the cluster label assigned to row i of word_vectors
idx = kmeans_clustering.fit_predict(word_vectors)

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

Time taken for K Means clustering:  52.82303833961487 seconds.


In [12]:
# Build a word -> cluster-number dictionary by pairing each vocabulary word
# with the corresponding entry of idx (the k-means cluster labels).
# NOTE(review): assumes model.wv.index_to_key is in the same order as the
# rows of model.wv.vectors that were clustered — confirm.
word_centroid_map = dict(zip(model.wv.index_to_key, idx))

In [18]:
def create_bag_of_centroids(wordlist, word_centroid_map, num_centroids=None):
    """Count how many words of a review fall into each cluster.

    Args:
        wordlist: Iterable of words from one review.
        word_centroid_map: Mapping from vocabulary word to cluster index.
        num_centroids: Length of the returned vector. When None (the
            default) it is inferred as the highest cluster index in
            ``word_centroid_map`` plus one, or 0 for an empty map. Callers
            that process many reviews can pass the known cluster count to
            skip that scan of the whole map on every call and to guarantee
            a fixed vector length.

    Returns:
        A 1-D float32 NumPy array where position ``i`` holds the number of
        words in ``wordlist`` that map to cluster ``i``. Words missing from
        ``word_centroid_map`` are ignored.
    """
    if num_centroids is None:
        # default=-1 makes an empty map yield a zero-length vector instead
        # of raising ValueError from max()
        num_centroids = max(word_centroid_map.values(), default=-1) + 1
    #
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    #
    # Loop over the words in the review. If the word is in the vocabulary,
    # find which cluster it belongs to, and increment that cluster count
    # by one. A single .get() replaces the separate membership test + lookup.
    for word in wordlist:
        index = word_centroid_map.get(word)
        if index is not None:
            bag_of_centroids[index] += 1
    #
    # Return the "bag of centroids"
    return bag_of_centroids

In [19]:
# One row per training review, one column per cluster (allocated up front
# for speed)
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")

# Fill each row with the bag of centroids of the matching training review
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

# Same layout and procedure for the test reviews
test_centroids = np.zeros((test["review"].size, num_clusters), dtype="float32")

for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest and extract predictions.
# random_state is fixed so the bootstrap samples and feature splits, and
# therefore the submitted predictions, are reproducible across runs.
forest = RandomForestClassifier(n_estimators=100, random_state=0)

# Fitting the forest may take a few minutes
print("Fitting a random forest to labeled training data...")
# fit() trains the estimator in place, so no rebinding of `forest` is needed
forest.fit(train_centroids, train["sentiment"])
result = forest.predict(test_centroids)

# Write the test results: one row per test review with its id and the
# predicted sentiment. quoting=3 is csv.QUOTE_NONE, so values are written
# without added quote characters.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("submissions/BagOfCentroids3.csv", index=False, quoting=3)