In [1]:
import pandas as pd
import numpy as np
from time import time
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import logging
# Configure the root logger at INFO level with a timestamped format, so that
# library progress messages (e.g. gensim's) show up in the cell outputs.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load the preprocessed news CSV; this frame is only used for the null check below.
news_df = pd.read_csv('bing_news_preprocessed.csv')

In [3]:
# Count missing values per column; the tokenization step expects every cell to be a string.
news_df.isnull().sum()

name           0
description    0
dtype: int64

In [4]:
# Tokenize the 'name' and 'description' columns into one flat token list per
# row, then join both lists so each row becomes a single training sentence.

# Define the path to the input CSV file
input_file = 'bing_news_preprocessed.csv'

# Read the CSV file into a DataFrame
data = pd.read_csv(input_file)


def tokenize_text(text):
    """Return every word token of ``text`` as a single flat list.

    The text is first split into sentences, each sentence is word-tokenized,
    and the tokens of all sentences are concatenated in their original order.
    An empty text yields an empty list.
    """
    return [token
            for sentence in sent_tokenize(text)
            for token in word_tokenize(sentence)]


# Tokenize and flatten row by row. The previous version called `Series.sum()`
# to flatten, which concatenated the sentence lists of *all* rows into one
# list (quadratic cost) and assigned it back as a column; that only lined up
# with the rows when every text contained exactly one sentence, and raised a
# length-mismatch error otherwise. Flattening per row gives the same result
# for single-sentence rows and stays correct for any number of sentences.
data['name'] = data['name'].apply(tokenize_text)
data['description'] = data['description'].apply(tokenize_text)

# Element-wise list concatenation: name tokens followed by description tokens
result = data['name'] + data['description']
# Print the tokenized data
print(result[:3])

0    [modi, u, elon, musk, say, tesla, come, india,...
1    [musk, say, tesla, considering, india, investm...
2    [tesla, secret, configuration, allows, select,...
dtype: object


In [5]:
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [6]:
# Leave one core free for the rest of the system, but never request fewer
# than one worker thread: on a single-core machine `cores - 1` would be 0.
cores = multiprocessing.cpu_count()
n_workers = max(1, cores - 1)

# Create the (still untrained, empty-vocabulary) Word2Vec model.
w2v_model = Word2Vec(min_count=20,       # ignore words seen fewer than 20 times
                     window=2,           # context window on each side of the target word
                     vector_size=500,    # dimensionality of the word vectors
                     sample=6e-5,        # down-sampling threshold for frequent words
                     alpha=0.03,         # initial learning rate
                     min_alpha=0.0007,   # learning rate decays linearly down to this value
                     negative=20,        # number of negative samples per positive example
                     workers=n_workers)

2023-06-21 18:35:42,211 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=500, alpha=0.03>', 'datetime': '2023-06-21T18:35:42.211855', 'gensim': '4.3.1', 'python': '3.10.11 (main, May 17 2023, 14:30:36) [Clang 14.0.6 ]', 'platform': 'macOS-13.4-arm64-arm-64bit', 'event': 'created'}


In [7]:
# Scan the tokenized corpus once to build the model's vocabulary and report
# the wall-clock time spent, in minutes.
t = time()
w2v_model.build_vocab(result, progress_per=10000)
elapsed_mins = round((time() - t) / 60, 2)
print(f'Time to build vocab: {elapsed_mins} mins')

2023-06-21 18:35:42,216 : INFO : collecting all words and their counts
2023-06-21 18:35:42,216 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-06-21 18:35:42,249 : INFO : PROGRESS: at sentence #10000, processed 289013 words, keeping 2509 word types
2023-06-21 18:35:42,283 : INFO : PROGRESS: at sentence #20000, processed 586694 words, keeping 2509 word types
2023-06-21 18:35:42,317 : INFO : PROGRESS: at sentence #30000, processed 884446 words, keeping 2509 word types
2023-06-21 18:35:42,352 : INFO : PROGRESS: at sentence #40000, processed 1182083 words, keeping 2510 word types
2023-06-21 18:35:42,358 : INFO : collected 2510 word types from a corpus of 1229838 raw words and 41604 sentences
2023-06-21 18:35:42,358 : INFO : Creating a fresh vocabulary
2023-06-21 18:35:42,361 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 1589 unique words (63.31% of original 2510, drops 921)', 'datetime': '2023-06-21T18:35:42.361947', 'gensim': '4.

Time to build vocab: 0.0 mins


In [8]:
# Train the model for 30 passes over the corpus and report the wall-clock
# time spent, in minutes. `corpus_count` was set by the build_vocab step.
t = time()
w2v_model.train(
    result,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_mins = round((time() - t) / 60, 2)
print(f'Time to train the model: {elapsed_mins} mins')

2023-06-21 18:35:42,383 : INFO : Word2Vec lifecycle event {'msg': 'training model with 7 workers on 1589 vocabulary and 500 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-06-21T18:35:42.383201', 'gensim': '4.3.1', 'python': '3.10.11 (main, May 17 2023, 14:30:36) [Clang 14.0.6 ]', 'platform': 'macOS-13.4-arm64-arm-64bit', 'event': 'train'}
2023-06-21 18:35:42,999 : INFO : EPOCH 0: training on 1229838 raw words (354735 effective words) took 0.6s, 579643 effective words/s
2023-06-21 18:35:43,439 : INFO : EPOCH 1: training on 1229838 raw words (354357 effective words) took 0.4s, 812752 effective words/s
2023-06-21 18:35:43,865 : INFO : EPOCH 2: training on 1229838 raw words (354870 effective words) took 0.4s, 843285 effective words/s
2023-06-21 18:35:44,266 : INFO : EPOCH 3: training on 1229838 raw words (354297 effective words) took 0.4s, 893178 effective words/s
2023-06-21 18:35:44,706 : INFO : EPOCH 4: training on 1229838 raw words (3

Time to train the model: 0.2 mins


In [9]:
# Persist only the trained word vectors (not the full trainable model) to disk.
w2v_model.wv.save_word2vec_format('tesla_news_w2v')  # save the model


2023-06-21 18:35:54,106 : INFO : storing 1589x500 projection weights into tesla_news_w2v


In [10]:
# Sanity check: list the words whose vectors are closest to "tesla".
w2v_model.wv.most_similar(positive=["tesla"])


[('motor', 0.20390723645687103),
 ('ap', 0.18950140476226807),
 ('vehicle', 0.18415725231170654),
 ('electric', 0.17801252007484436),
 ('gm', 0.17209164798259735),
 ('owner', 0.16553319990634918),
 ('ford', 0.16414546966552734),
 ('detroit', 0.16003084182739258),
 ('general', 0.14704257249832153),
 ('far', 0.14576536417007446)]

In [11]:
# Sanity check: list the words whose vectors are closest to "gigafactory".
w2v_model.wv.most_similar(positive=["gigafactory"])


[('journal', 0.6870701909065247),
 ('interview', 0.620179295539856),
 ('clip', 0.5673475861549377),
 ('sixth', 0.5604254603385925),
 ('seem', 0.5484067797660828),
 ('street', 0.5451411008834839),
 ('shared', 0.5358864665031433),
 ('wall', 0.5186137557029724),
 ('casting', 0.5008113980293274),
 ('meant', 0.48182642459869385)]

In [14]:
# Sanity check: list the words whose vectors are closest to "autopilot".
w2v_model.wv.most_similar(positive=["autopilot"])


[('crash', 0.6024054288864136),
 ('escape', 0.5572729706764221),
 ('involved', 0.5564514398574829),
 ('mode', 0.5528668761253357),
 ('defense', 0.5435383319854736),
 ('claim', 0.536834180355072),
 ('whopping', 0.4865502119064331),
 ('wildfire', 0.4781320095062256),
 ('washington', 0.44768744707107544),
 ('bare', 0.44048401713371277)]