# Using Word2Vec for creating clusters for text data.

In [23]:
## Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import text
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import os
import re
import warnings
warnings.filterwarnings("ignore")
os.chdir("C:/Users/p_adi/OneDrive/Desktop/")
import random
import string
import nltk
from gensim.models import Word2Vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Fetch the NLTK resources this notebook relies on: the English stopword list
# and the "punkt" data (presumably what word_tokenize needs — confirm).
for nltk_resource in ("stopwords", "punkt"):
    nltk.download(nltk_resource)

# Seed the random number generators so repeated runs are comparable.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here likely only affects child processes — confirm.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\p_adi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\p_adi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
# Read the listings workbook; the relative path is resolved against the
# current working directory.
train_df = pd.read_excel(io="new_data1.xlsx")
train_df.head(5)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home DÃ©cor/Home DÃ©cor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [25]:
# Keep only the two columns used below and restrict to the first 10,000 rows.
train_df1 = train_df.loc[:, ["shipping", "item_description"]].iloc[:10000]
train_df1.head(5)

Unnamed: 0,shipping,item_description
0,1,No description yet
1,0,This keyboard is in great condition and works ...
2,1,Adorable top with a hint of lace and a key hol...
3,1,New with tags. Leather horses. Retail for [rm]...
4,0,Complete with certificate of authenticity


In [26]:
## Data Preprocessing

def clean_text(text, tokenizer, stopwords):
    """Normalise a raw string and split it into filtered tokens.

    Args:
        text: Value to clean; it is coerced with ``str()`` first.
        tokenizer: Callable that turns the cleaned string into tokens.
        stopwords: Container of tokens to discard (tested with ``in``).

    Returns:
        List of tokens that are not stopwords, are not made up solely of
        digits, and are longer than one character.
    """
    # Rewrites applied in order to the lowercased text.
    substitutions = (
        (r"\[(.*?)\]", ""),  # drop bracketed fragments such as "[rm]"
        (r"\s+", " "),  # collapse runs of whitespace
        (r"\w+…|…", ""),  # drop an ellipsis and the word it truncates
        (r"(?<=\w)-(?=\w)", " "),  # split words joined by a dash
        (f"[{re.escape(string.punctuation)}]", ""),  # strip punctuation
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # NOTE(review): punctuation is stripped before the stopword check, so a
    # stopword that contains punctuation (e.g. "don't") cannot match "dont".
    kept = []
    for token in tokenizer(cleaned):
        if token in stopwords:
            continue
        if token.isdigit() or len(token) <= 1:
            continue
        kept.append(token)
    return kept

In [27]:
## Extra hand-picked stopwords applied on top of NLTK's English list.
## NOTE(review): many entries (politician names, Twitter handles, random
## strings) appear to come from a different corpus — confirm they are wanted.
## Fix: a missing comma after 'endsars' used to fuse it with 'gcmfr' into the
## single entry 'endsarsgcmfr', so neither word was actually filtered.

s_words = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'also', 'am', 'an', 'and',
              'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
              'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'com', 'could', "couldn't", 'did',
              "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'else', 'ever',
              'few', 'for', 'from', 'further', 'get', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having',
              'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how',
              "how's", 'however', 'http', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
              "it's", 'its', 'itself', 'just', 'k', "let's", 'like', 'me', 'more', 'most', "mustn't", 'my', 'myself',
              'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'otherwise', 'ought', 'our', 'ours',
              'ourselves', 'out', 'over', 'own', 'r', 'same', 'shall', "shan't", 'she', "she'd", "she'll", "she's",
              'should', "shouldn't", 'since', 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs',
              'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
              "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't",
              'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
              "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't",
              'www', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves','something',
              'biden',"https",'about','obama'," ' ", 'today','years','history','trumptards','trump','nsauagv','president',
             'everyone','someone','somebody','gonna','wanna','nprobably','calling','vsdvx','first','right','early','think',
             'donald','applicati','country','people','watch','start','still','dance','thank','pennsylvania',
             'applications','doesn','coming','going','hillary','getting','nblame','government','aaaaaarrrrrrrrrggggghhhh',
              'aaand','aable','aaderwdjcw','aaron','aasohahvzm','aatmjzokf','aaufysks','aavyeeq','abandon','abandoned',
              'abeyance','abiding','abilities','ability','abisv','abjrnaobrl','ablaz','ablcqv','ableist','abmst',
              'abnormal','abolish','aborted','aborti','aborting','media','gkzmbxwljn','glffq','ntarifyg','gkrqf',
              'gksaxfrsrv','gktmbs','glcrknprol','applica','rudygiuliani','realdonaldtrump','steveschmidtses',
              'projectlincoln','guttenberg','unroll','called','mentioned','office','marine','party','anything','thing',
              'never','jbyqgrdyt','kvmj','ucvx','nnsf','kvcvhmjxur','jgegxuj','gumgbvxxa','tupoqzxsfb','yqomrt','will',
             'oann','hkrassenstein','trying','cthe','made','gtconway','zzzzzzzzzzzzz','ryannmcenany','ghiorth','giannocaldwell'
             ,'giancarloc','ghtsfkcidm','brianclowdus','senatemajldr','mcconnell','caslernoel','zzjkevi','california','vxmeiljljf'
              ,'youve','gflf','gina','mariabartiromo','chuckgrassley','zzfjzcfk','gigagrouch','itsjefftiedrich','covid',
             'coylvukyhg','tachaprays','josprotests','josprotes','naija','endinsecurity','queernigerianlivesmatter','endsars',
             'gcmfr','merisanjeevni','agrasen','rhviypiswo','arup','silchar','selombang','gbvyexasah','australianfederalgovernment',
              'ausvotes','noahlindquist','singerinkansas','lindquist','jacinda','couvrefeu','drericding','dlqsqmjyln',
              'lmkdasjhqw','ckyor','gdvqnhagee','gcmaf','gdzie','gcpmfoukpt','gcpiuhcd','etwfo','durgapuja','gbvyexasah',
              'gbxsbkokwi','gbvfy','qmart']


In [28]:
# Stopword set: NLTK's English list plus the hand-picked extras in s_words.
custom_stopwords = set(stopwords.words("english") + s_words)
text_columns = ["item_description"]

df = train_df1.copy()
# Missing values become empty strings; every text column is forced to str.
df[text_columns] = df[text_columns].fillna("").astype(str)

# Build one "text" field by joining the configured text columns.
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, custom_stopwords))

# Drop rows whose token list repeats an earlier row (first occurrence wins).
# Lists are unhashable, so they are compared as tuples. Unlike np.unique on an
# object array, this keeps the rows in their original order.
is_duplicate = df["tokens"].map(tuple).duplicated(keep="first")

# Drop rows that ended up with no tokens and keep only the relevant columns.
has_tokens = df["tokens"].map(len) > 0
df = df.loc[~is_duplicate & has_tokens, ["text", "tokens"]]

docs = df["text"].values
tokenized_docs = df["tokens"].values

print(f"Original dataframe: {train_df1.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (60614, 2)
Pre-processed dataframe: (53846, 2)


In [29]:
# Train 100-dimensional word embeddings on the tokenized descriptions.
# workers=1 (was 3): gensim documents that `seed` only yields a reproducible
# model when training runs on a single worker thread.
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=SEED)

In [36]:
def vectorize(list_of_docs, model):
    """Generate one vector per document by averaging its word embeddings.

    Args:
        list_of_docs: Iterable of documents, each an iterable of tokens.
        model: Word-embedding model exposing ``wv`` (supports ``in`` and
            ``[]`` lookup by token) and ``vector_size``.

    Returns:
        List with one 1-D array per document: the mean of the vectors of the
        tokens found in ``model.wv``, or a zero vector of length
        ``model.vector_size`` when none of the tokens is known.
    """
    word_vectors = model.wv
    features = []

    for tokens in list_of_docs:
        # The membership test already guards the lookup, so no KeyError
        # handling is needed around it.
        vectors = [word_vectors[token] for token in tokens if token in word_vectors]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # A fresh array per document, so results never share memory.
            features.append(np.zeros(model.vector_size))
    return features
    
# One averaged embedding per pre-processed document.
vectorized_docs = vectorize(list_of_docs=tokenized_docs, model=model)
# Display (number of documents, embedding dimensionality) as a sanity check.
(len(vectorized_docs), len(vectorized_docs[0]))

(53846, 100)

In [37]:

def mbkmeans_clusters(X, k, mb, print_silhouette_values, random_state=None):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Optional seed forwarded to MiniBatchKMeans. The default
            (None) leaves the estimator's own default behaviour unchanged.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb, random_state=random_state).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")

    # The overall silhouette coefficient is the mean of the per-sample values,
    # so the expensive pairwise computation runs once and is reused below.
    sample_silhouette_values = silhouette_samples(X, labels)
    print(f"Silhouette coefficient: {sample_silhouette_values.mean():0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.shape[0] == 0:
                # A cluster with no members has no statistics to report, and
                # min()/max() would raise on an empty array.
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest average silhouette) first.
        silhouette_values.sort(key=lambda tup: tup[2], reverse=True)
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, labels

In [38]:
# Fit 10 clusters on the document vectors and print the silhouette breakdown.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=10,
    mb=500,
    print_silhouette_values=True,
)

# One row per document: raw text, space-joined tokens, assigned cluster id.
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": list(map(" ".join, tokenized_docs)),
        "cluster": cluster_labels,
    }
)

For n_clusters = 10
Silhouette coefficient: 0.09
Inertia:423080.8666753484
Silhouette values:
    Cluster 8: Size:1887 | Avg:0.14 | Min:-0.03 | Max: 0.35
    Cluster 1: Size:5417 | Avg:0.12 | Min:-0.05 | Max: 0.31
    Cluster 2: Size:8061 | Avg:0.12 | Min:0.01 | Max: 0.29
    Cluster 7: Size:5910 | Avg:0.11 | Min:-0.05 | Max: 0.32
    Cluster 4: Size:6149 | Avg:0.09 | Min:-0.04 | Max: 0.28
    Cluster 5: Size:7043 | Avg:0.07 | Min:-0.10 | Max: 0.29
    Cluster 9: Size:6156 | Avg:0.06 | Min:-0.06 | Max: 0.22
    Cluster 0: Size:4822 | Avg:0.06 | Min:-0.07 | Max: 0.23
    Cluster 6: Size:4705 | Avg:0.05 | Min:-0.07 | Max: 0.25
    Cluster 3: Size:3696 | Avg:0.05 | Min:-0.12 | Max: 0.28


In [41]:
print("Most representative terms per cluster (based on centroids):")
# Iterate over the fitted centroids rather than a hard-coded range(10), so
# this cell stays correct when the number of clusters changes.
for i, centroid in enumerate(clustering.cluster_centers_):
    # The 5 vocabulary words most similar to the centroid.
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    tokens_per_cluster = "".join(f"{term} " for term, _score in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: striped plaid stripe maroon sweater 
Cluster 1: hoola paaarty bareminerals foundations younique 
Cluster 2: jewels jeweled tulle rows stacked 
Cluster 3: deadstock newnever thiss yeah og 
Cluster 4: disk disks dslr functionality paperwork 
Cluster 5: preowned wrinkled preloved flaws piling 
Cluster 6: juniors xxl oversized xsmall runs 
Cluster 7: lowballs pls stated offered resale 
Cluster 8: pet freepet wrinkly wornwashed worthington 
Cluster 9: xsmall med xhilaration nwot xss 


In [42]:
## Documents closest to the centroid of one chosen cluster
test_cluster = 1
# Euclidean distance from every document vector to that centroid, sorted
# ascending. All documents are ranked, not only members of the cluster.
most_representative_docs = np.linalg.norm(
    vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1
).argsort()
for doc_idx in most_representative_docs[:3]:
    print(docs[doc_idx], "-------------", sep="\n")

BNIB, never used Color: Medium Includes: - deluxe deep dive cleansing gel - deluxe Rainforest of the Seaâ„¢ 4-in-1 setting mist - deluxe drink of H2O hydrating boost moisturizer - deluxe BB tinted treatment primer SPF 30 in Light - deluxe lights, camera, lashesâ„¢ 4-in-1 mascara - deluxe Amazonian clay 12-hour blush in thrilled (dusty mauve) SOLD OUT ONLINE âœˆï¸FREE SHIPPING â€¼ï¸Firm price â­ï¸Bundle & save Thanks for lookingâ—ï¸
-------------
12 beauty items plus metallic gold cosmetics bag. All products are new and unused. Combination of sample size and full size. Broad range of mid-high end brands. Includes: Trust Fund Beauty nail polish ([rm] latte), NYX lip butter (lifeguard), GRLPWR liquid lipstick, TheBalm Bahama Mama bronzer, Lancome Bi-facial, Tokyomilk Dark perfume, Clean perfume (cotton t-shirt), Kerastase Cristal hair mask, Sephora waterproof eye makeup remover, Sephora cheek ink gel, and CoverFX custom enhancer drops. Plus i'll throw in some extra perfume samples. 