In [None]:
!pip install plotly --upgrade

# Word2Vec (skip-gram) word embeddings for an Indonesian corpus

In [None]:
import logging
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
import lzma
import shutil
import re
import string
import emoji
import linecache
import plotly.graph_objs as go
import random

from gensim.models import Word2Vec, KeyedVectors
from gensim.utils import simple_preprocess
from sklearn.decomposition import PCA, KernelPCA, IncrementalPCA
from sklearn.manifold import TSNE

from plotly.offline import init_notebook_mode, iplot, plot

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Emit timestamped log records at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Sastrawi factories used to build the stemmer and stop-word remover below.
stemmer_factory = StemmerFactory()
stopword_factory = StopWordRemoverFactory()

# NOTE(review): neither object is referenced by the cells visible in this notebook — confirm they are still needed.
stemmer = stemmer_factory.create_stemmer()
stopword_remover = stopword_factory.create_stop_word_remover()

# Working directory for the notebook and location of the training corpus.
# NOTE(review): both are absolute paths on a local Windows drive; adjust them before running elsewhere.
workspace_dir = 'D:/Ricky/Coding/Python/skripsi_ricky/'
corpus_path = 'D:/dataset_all_uncased_blankline.txt'

In [None]:
# Make the workspace the process working directory, then display the resolved path.
os.chdir(workspace_dir)
cwd = os.getcwd()
cwd

'D:\\Ricky\\Coding\\Python\\skripsi_ricky'

In [None]:
def text_cleaning(line):
  """Normalize one line of raw text into space-separated lowercase words.

  Processing order:
    1. lowercase the line;
    2. drop every whitespace-delimited token that starts at 'http';
    3. replace ASCII punctuation and digits with spaces;
    4. remove emoji;
    5. collapse runs of whitespace and trim both ends.

  Args:
    line: The raw text line (str).

  Returns:
    The cleaned line as a str; empty if nothing survives the cleaning.
  """
  lowercased = line.lower()

  # URLs must be stripped BEFORE punctuation: once '://', '.' and '/' have
  # been turned into spaces the pattern can no longer match the address and
  # its fragments ('http', 'example', 'com', ...) would leak into the corpus.
  url_removed = re.sub(r'http\S+', '', lowercased)

  exclusion_list = string.punctuation + string.digits
  mapping_table = str.maketrans(exclusion_list, ' ' * len(exclusion_list))
  number_punctuation_removed = url_removed.translate(mapping_table)

  emoji_removed = emoji.replace_emoji(number_punctuation_removed)

  # split()/join collapses internal whitespace and trims the ends in one pass.
  return ' '.join(emoji_removed.split())

class Corpus(object):
  """Re-iterable stream of cleaned, tokenized lines read from a text file.

  Every call to iter() re-opens the file and starts again from the first
  line, so the same object can be traversed multiple times.
  """

  def __init__(self, filename):
    # Path of the UTF-8 text file to stream; each line becomes one token list.
    self.filename = filename

  def __iter__(self):
    """Yield one list of tokens per line of the file (an empty list for blank lines)."""
    # Read from this instance's own file (not a module-level path) and use a
    # context manager so the handle is closed once iteration finishes.
    with open(self.filename, encoding='utf-8') as corpus_file:
      for line in corpus_file:
        yield text_cleaning(line).split()

# Streaming corpus over the configured corpus file; lines are cleaned lazily during iteration.
corpus = Corpus(corpus_path)

In [None]:
# Train Word2Vec on the streamed corpus: 256-dimensional vectors, sg=1 (skip-gram per the gensim docs),
# context window of 5 tokens, 10 worker threads.
model = Word2Vec(corpus, vector_size=256, sg = 1, window = 5, workers=10)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2022-11-21 08:48:27,379 : INFO : EPOCH 4 - PROGRESS: at 50.73% examples, 238977 words/s, in_qsize 0, out_qsize 1
2022-11-21 08:48:28,628 : INFO : EPOCH 4 - PROGRESS: at 50.74% examples, 238964 words/s, in_qsize 11, out_qsize 0
2022-11-21 08:48:29,638 : INFO : EPOCH 4 - PROGRESS: at 50.76% examples, 238974 words/s, in_qsize 0, out_qsize 0
2022-11-21 08:48:30,672 : INFO : EPOCH 4 - PROGRESS: at 50.76% examples, 238966 words/s, in_qsize 0, out_qsize 0
2022-11-21 08:48:31,774 : INFO : EPOCH 4 - PROGRESS: at 50.77% examples, 238957 words/s, in_qsize 4, out_qsize 0
2022-11-21 08:48:32,780 : INFO : EPOCH 4 - PROGRESS: at 50.78% examples, 238948 words/s, in_qsize 15, out_qsize 0
2022-11-21 08:48:33,946 : INFO : EPOCH 4 - PROGRESS: at 50.78% examples, 238935 words/s, in_qsize 10, out_qsize 9
2022-11-21 08:48:34,951 : INFO : EPOCH 4 - PROGRESS: at 50.80% examples, 238949 words/s, in_qsize 0, out_qsize 0
2022-11-21 08:48:35,956 : IN

In [None]:
# Reuse the model already stored on disk when it exists; otherwise persist the
# model held in `model` so later sessions can load it. This replaces toggling a
# commented-out save() by hand, which made a fresh run fail when the file was missing.
model_path = 'D:/Ricky/Coding/Python/skripsi_ricky/model/word2vec_indo4b2.model'
if os.path.exists(model_path):
  model = Word2Vec.load(model_path)
else:
  os.makedirs(os.path.dirname(model_path), exist_ok=True)
  model.save(model_path)

2022-11-21 11:12:05,212 : INFO : loading Word2Vec object from D:/Ricky/Coding/Python/skripsi_ricky/model/word2vec_indo4b2.model
2022-11-21 11:12:06,046 : INFO : loading wv recursively from D:/Ricky/Coding/Python/skripsi_ricky/model/word2vec_indo4b2.model.wv.* with mmap=None
2022-11-21 11:12:06,046 : INFO : loading vectors from D:/Ricky/Coding/Python/skripsi_ricky/model/word2vec_indo4b2.model.wv.vectors.npy with mmap=None
2022-11-21 11:12:08,089 : INFO : loading syn1neg from D:/Ricky/Coding/Python/skripsi_ricky/model/word2vec_indo4b2.model.syn1neg.npy with mmap=None
2022-11-21 11:12:10,427 : INFO : setting ignored attribute cum_table to None
2022-11-21 11:12:23,397 : INFO : Word2Vec lifecycle event {'fname': 'D:/Ricky/Coding/Python/skripsi_ricky/model/word2vec_indo4b2.model', 'datetime': '2022-11-21T11:12:23.397963', 'gensim': '4.2.0', 'python': '3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'loaded'}

In [None]:
# Print every vocabulary entry together with its index; this emits one output line per word in the model.
total_index = len(model.wv.index_to_key)
for index, word in enumerate(model.wv.index_to_key):
  print(f"word #{index}/{total_index} is {word}")

In [None]:
# Spot-check the embeddings: the 20 entries most similar to 'mengerikan', as (word, score) pairs.
sims = model.wv.most_similar('mengerikan', topn=20)  # get other similar words
sims

[('menyeramkan', 0.8226692080497742),
 ('menakutkan', 0.8147406578063965),
 ('menyedihkan', 0.7458503842353821),
 ('menjijikkan', 0.7258630990982056),
 ('mengerika', 0.7147645354270935),
 ('memilukan', 0.7010138630867004),
 ('menggelikan', 0.6809848546981812),
 ('menakjubkan', 0.6771619915962219),
 ('mengerikannya', 0.6762601733207703),
 ('aneh', 0.6739831566810608),
 ('memalukan', 0.6675752401351929),
 ('kejam', 0.6639183759689331),
 ('menggerikan', 0.6569114327430725),
 ('mengerihkan', 0.6458063125610352),
 ('tragis', 0.6438093185424805),
 ('menjijikan', 0.641643762588501),
 ('kengerian', 0.6390420198440552),
 ('memuakkan', 0.6389889121055603),
 ('konyol', 0.6365180611610413),
 ('meyeramkan', 0.6361666917800903)]

In [None]:
# Save only the model's `wv` object (the word vectors) to its own file, separate from the full model.
word_vectors = model.wv
word_vectors.save("D:/Ricky/Coding/Python/skripsi_ricky/model/skipgram_word2vec_indo4b2.wordvectors")

2022-11-21 11:19:09,402 : INFO : KeyedVectors lifecycle event {'fname_or_handle': 'D:/Ricky/Coding/Python/skripsi_ricky/model/skipgram_word2vec_indo4b2.wordvectors', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-11-21T11:19:09.402862', 'gensim': '4.2.0', 'python': '3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'saving'}
2022-11-21 11:19:09,402 : INFO : storing np array 'vectors' to D:/Ricky/Coding/Python/skripsi_ricky/model/skipgram_word2vec_indo4b2.wordvectors.vectors.npy
2022-11-21 11:19:12,712 : INFO : saved D:/Ricky/Coding/Python/skripsi_ricky/model/skipgram_word2vec_indo4b2.wordvectors


In [None]:
def reduce_dimensions(model, max_words=None):
    """Project a model's word vectors to 2-D with t-SNE.

    Args:
        model: Object exposing ``wv.vectors`` (one row per word) and
            ``wv.index_to_key`` (the words, in the same order as the rows).
        max_words: Optional cap on how many leading vocabulary entries are
            projected. ``None`` (the default) projects every vector, which
            can be very slow for a large vocabulary.

    Returns:
        A tuple ``(x_vals, y_vals, labels)``: two lists with the projected
        coordinates and a numpy array with the corresponding words.
    """
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    embeddings = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    if max_words is not None:
        # Rows and labels share the same ordering, so slice both identically.
        embeddings = embeddings[:max_words]
        labels = labels[:max_words]

    # reduce using t-SNE. `init` and `learning_rate` are pinned to the values
    # that were the implicit defaults before scikit-learn 1.2, so the result
    # does not silently change with the library version and the
    # FutureWarnings about those defaults are no longer emitted.
    tsne = TSNE(
        n_components=num_dimensions,
        random_state=0,
        init='random',
        learning_rate=200.0,
    )
    projected = tsne.fit_transform(embeddings)

    x_vals = list(projected[:, 0])
    y_vals = list(projected[:, 1])
    return x_vals, y_vals, labels


# Project the vocabulary to 2-D; as called here, t-SNE runs over every vector in the model.
x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the labelled points as a plotly text scatter.

    Args:
        x_vals: X coordinates of the points.
        y_vals: Y coordinates of the points.
        labels: Text shown at each point.
        plot_in_notebook: When true, render inline via ``iplot``; otherwise
            write the figure out through ``plot``.
    """
    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels, num_labels=25):
    """Scatter-plot the points with matplotlib and label a random subset.

    Args:
        x_vals: X coordinates of the points.
        y_vals: Y coordinates of the points.
        labels: Text for each point, indexed like ``x_vals`` / ``y_vals``.
        num_labels: Maximum number of randomly chosen points to annotate
            (default 25). Fewer are annotated when fewer points exist.
    """
    # Fixed seed so the same points are labelled on every run.
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label a random subsample of the data points.
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of available points.
    #
    sample_size = min(num_labels, len(labels))
    selected_indices = random.sample(range(len(labels)), sample_size)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Choose the plotting backend: plotly when get_ipython() can be called
# (i.e. running under IPython/Jupyter), matplotlib otherwise.
try:
    get_ipython()
    running_in_ipython = True
except Exception:
    running_in_ipython = False

plot_function = plot_with_plotly if running_in_ipython else plot_with_matplotlib

plot_function(x_vals, y_vals, labels)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



KeyboardInterrupt: ignored