# Library

In [None]:
!pip install gensim



In [None]:
import pandas as pd
import gensim
from gensim.models import word2vec
import time
import multiprocessing
from datetime import timedelta

# Load Data

In [None]:
# Attach Google Drive to the Colab filesystem; the dataset is read from there.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Load the Simpsons script lines from the mounted Drive, then show (rows, columns).
dataset_csv = '/content/drive/MyDrive/dataset/simpsons_script_lines.csv'
df = pd.read_csv(dataset_csv)
df.shape

(158271, 2)

In [None]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,raw_character_text,normalized_text
0,Miss Hoover,no actually it was a little of both sometimes ...
1,Lisa Simpson,wheres mr bergstrom
2,Miss Hoover,i dont know although id sure like to talk to h...
3,Lisa Simpson,that life is worth living
4,Edna Krabappel-Flanders,the polls will be open from now until the end ...


# Data Cleaning

In [None]:
# Count the missing values in each column.
df.isna().sum()

raw_character_text    17522
normalized_text       26184
dtype: int64

In [None]:
# Remove every row that has a missing value in any column and renumber the index,
# then recount the missing values to confirm that none are left.
df = df.dropna(axis=0, how='any').reset_index(drop=True)
df.isna().sum()

raw_character_text    0
normalized_text       0
dtype: int64

# Build the Corpus


In [None]:
# Join all dialogue lines into a single string, one script line per text line.
corpus_text = "\n".join(df['normalized_text'])

# Save the corpus to the file 'corpus.txt'.
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# locale default; gensim's LineSentence reader decodes its input as UTF-8, so a
# different write encoding would corrupt any non-ASCII characters at training time.
corpus_path = 'corpus.txt'
with open(corpus_path, 'w', encoding='utf-8') as f:
    f.write(corpus_text)

# Training Model

In [None]:
# Train a Word2Vec model with 300-dimensional vectors on the corpus file, save it
# to disk, and report how long the whole step took.
start_time = time.time()
print('Training Word2Vec Model...')
# LineSentence iterates over the corpus file, treating each text line as one sentence.
sentences = word2vec.LineSentence(corpus_path)
# Use one worker thread per available CPU core.
# NOTE(review): per the gensim docs, training with more than one worker is not
# bit-for-bit reproducible, so the similarity scores shown below may differ
# slightly between runs; use workers=1 if exact reproducibility is needed.
w2v_model = word2vec.Word2Vec(sentences, vector_size=300, workers=multiprocessing.cpu_count())
w2v_model.save('model_word2vec_300_model')
finish_time = time.time()
# The timestamps were previously captured but never used; show the elapsed time
# as H:MM:SS so the cost of retraining is visible in the notebook output.
print('Training finished in {}'.format(timedelta(seconds=finish_time - start_time)))

Training Word2Vec Model...


# Test

In [None]:
# Cosine similarity between the learned vectors of the two words.
w2v_model.wv.similarity(w1='woman', w2='man')

0.759478

In [None]:
# Ten nearest neighbours of 'chicken' in the embedding space.
w2v_model.wv.most_similar(positive=['chicken'], topn=10)

[('bread', 0.8689873218536377),
 ('potato', 0.8666888475418091),
 ('coconut', 0.8649487495422363),
 ('hairless', 0.8649431467056274),
 ('strupo', 0.864066481590271),
 ('shrimp', 0.8630320429801941),
 ('expert', 0.8626425862312317),
 ('peanut', 0.8605995178222656),
 ('angels', 0.8568980097770691),
 ('maker', 0.8567236661911011)]

In [None]:
# Ten nearest neighbours of 'homer' in the embedding space.
query_words = ['homer']
w2v_model.wv.most_similar(positive=query_words, topn=10)

[('bart', 0.8418713212013245),
 ('marge', 0.8109707832336426),
 ('lisa', 0.792163074016571),
 ('abe', 0.7372915148735046),
 ('grampa', 0.6845441460609436),
 ('mrs', 0.6533976793289185),
 ('milhouse', 0.6465161442756653),
 ('son', 0.646304726600647),
 ('mr', 0.6280929446220398),
 ('moe', 0.617709755897522)]

In [None]:
# Ten nearest neighbours of 'marge' in the embedding space.
query_words = ['marge']
w2v_model.wv.most_similar(positive=query_words, topn=10)

[('lisa', 0.8247048258781433),
 ('homer', 0.8109707832336426),
 ('bart', 0.7789226174354553),
 ('honey', 0.7306946516036987),
 ('son', 0.7018356919288635),
 ('homie', 0.7016165256500244),
 ('dad', 0.6785864233970642),
 ('maggie', 0.6664016246795654),
 ('milhouse', 0.6655166149139404),
 ('moe', 0.6501174569129944)]

In [None]:
# Cosine similarity between the learned vectors of 'maggie' and 'baby'.
w2v_model.wv.similarity(w1='maggie', w2='baby')

0.6033324

In [None]:
# Analogy query: woman + homer - marge, keeping the three closest words.
w2v_model.wv.most_similar(
    positive=['woman', 'homer'],
    negative=['marge'],
    topn=3,
)

[('man', 0.7022916674613953),
 ('bear', 0.6674923300743103),
 ('person', 0.639843225479126)]

In [None]:
# Analogy query: woman + king - homer, keeping the ten closest words.
w2v_model.wv.most_similar(
    positive=['woman', 'king'],
    negative=['homer'],
    topn=10,
)

[('famous', 0.7310924530029297),
 ('american', 0.7130327820777893),
 ('mans', 0.7128820419311523),
 ('star', 0.7061775326728821),
 ('evil', 0.6927492022514343),
 ('tale', 0.6865885853767395),
 ('role', 0.6826967597007751),
 ('mystery', 0.6813787817955017),
 ('liberty', 0.6703658103942871),
 ('average', 0.6698994636535645)]

In [None]:
# Ask the model which word fits least well with the others in the group.
candidate_words = 'homer drink milk sleep angry'.split()
w2v_model.wv.doesnt_match(candidate_words)

'angry'