In [8]:
import io
import re
import string
from tqdm import tqdm
import json
import numpy as np

from gensim.models import Word2Vec
import gensim.downloader as api

from tensorflow.keras.preprocessing.text import Tokenizer

In [21]:
### download a word2vec model
wv = api.load('word2vec-google-news-300')

# save the model
wv.save('word2vec-google-news-300.kv')

In [22]:
wv.save('word2vec-google-news-300.model')

In [12]:
paths = [
    'data/sarcasm_headlines/Sarcasm_Headlines_Dataset.json',
    'data/sarcasm_headlines/Sarcasm_Headlines_Dataset_v2.json'
]

class MySentences(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in open(self.filename, 'r'):
            data = json.loads(line)
            headline = data['headline']
            # Basic preprocessing
            headline = re.sub(r'\W+', ' ', headline).lower().split()
            yield headline

# Initialize the generator
sentences = MySentences(paths[0])  # Path to your dataset

In [10]:
### load the datasets
# save the sentences in here
sentences = []

# the paths to the datasets
paths = [
    'data/sarcasm_headlines/Sarcasm_Headlines_Dataset.json',
    'data/sarcasm_headlines/Sarcasm_Headlines_Dataset_v2.json'
]

for path in paths:
    with open(path, 'r') as file:
        for line in file:
            # load the json
            json_line = json.loads(line)
            headline = json_line['headline']
            tokens = headline.split(' ')

            # only use sarcastic headlines
            # if not json_line['is_sarcastic']:
            #     continue
            
            # append the headline to the sentences
            sentences.append(tokens)

print(f'Loaded {len(sentences)} sentences')

Loaded 55328 sentences


In [13]:
# train the model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

# Save the model
model.save("word2vec.model")

In [18]:
model = Word2Vec.load("word2vec.model")

# get all keys of the model
keys = list(model.wv.key_to_index.keys())

In [19]:
### similarity checker
N_REPETITIONS = 10

for i in range(N_REPETITIONS):
    # get two random words
    word1 = np.random.choice(keys)
    word2 = np.random.choice(keys)

    # print the similarity
    print(f'Similarity between {word1} and {word2}: {model.wv.similarity(word1, word2)}')

Similarity between platinum and spraying: 0.9082143902778625
Similarity between devour and conscious: 0.9656654000282288
Similarity between commits and availability: 0.32744503021240234
Similarity between creek and blankets: 0.9295600056648254
Similarity between inappropriate and prequel: 0.8712742924690247
Similarity between song and hairstyles: 0.9204217195510864
Similarity between bandmates and val: 0.91707843542099
Similarity between coats and shot: 0.9589670300483704
Similarity between darkness and adequately: 0.8801407217979431
Similarity between unveiled and arby: 0.8504656553268433


In [32]:
words = [
    'king', 'queen', 'man', 'woman', 'paris', 'berlin', 'car', 'bicycle', 'pizza', 'pasta',
    'dog', 'cat', 'apple', 'orange', 'happy', 'sad', 'city', 'village', 'mountain', 'river',
    'sun', 'moon', 'star', 'planet', 'ocean', 'lake', 'coffee', 'tea', 'book', 'newspaper',
    'rain', 'snow', 'summer', 'winter', 'morning', 'night', 'music', 'silence', 'computer', 'smartphone',
    'science', 'art', 'mathematics', 'literature', 'football', 'basketball', 'guitar', 'piano', 'eagle', 'sparrow',
    'rose', 'tulip', 'tree', 'grass', 'gold', 'silver', 'milk', 'water', 'butterfly', 'bee',
    'love', 'hate', 'peace', 'war', 'rich', 'poor', 'health', 'disease', 'strength', 'weakness',
    'magic', 'science', 'fantasy', 'reality', 'dream', 'nightmare', 'hero', 'villain', 'comedy', 'tragedy',
    'fire', 'ice', 'light', 'darkness', 'truth', 'lie', 'history', 'future', 'friend', 'enemy',
    'north', 'south', 'east', 'west', 'earth', 'mars', 'jupiter', 'saturn', 'universe', 'galaxy'
]

In [36]:
def print_result(word1, word2, word3, result):
    for r in result:
        print(f"Result of {word1} - {word3} + {word2}: {r[0]} ({r[1]})")

In [90]:
base = 'meow'
subtract = 'cat'
add = 'dog'

result = wv.most_similar(positive=[base, add], negative=[subtract], topn=1)
print_result(base, add, subtract, result)

Result of meow - cat + dog: woof_woof (0.5396372079849243)


In [33]:
N_REPETITIONS = 50

for i in range(N_REPETITIONS):

    word1 = word2 = word3 = None

    # get three different words
    while word1 == word2 or word1 == word3 or word2 == word3:
        # get three random words
        word1 = np.random.choice(words)
        word2 = np.random.choice(words)
        word3 = np.random.choice(words)
    
    result = wv.most_similar(positive=[word1, word2], negative=[word3], topn=1)
    print(f"Result of {word1} - {word3} + {word2}: {result[0][0]} ({result[0][1]})")

Result of science - car + west: east (0.51700758934021)
Result of truth - tea + night: tonight (0.4257323443889618)
Result of ocean - rain + dog: dogs (0.528801679611206)
Result of eagle - paris + nightmare: bogey (0.42562130093574524)
Result of water - tree + tulip: freshwater (0.4894673228263855)
Result of friend - poor + villain: pal (0.5515633225440979)
Result of king - disease + poor: kings (0.4762413501739502)
Result of moon - tree + sun: waning_gibbous (0.5550332069396973)
Result of winter - music + sad: winters (0.4633391499519348)
Result of paris - darkness + man: woman (0.44301486015319824)
Result of cat - mountain + hero: puppy (0.4813304543495178)
Result of fantasy - smartphone + rain: downpour (0.4814264476299286)
Result of health - happy + pizza: heath (0.5098319053649902)
Result of weakness - silver + planet: Earth (0.4377327263355255)
Result of friend - happy + bicycle: bike (0.5242830514907837)
Result of love - milk + paris: luv (0.4688273072242737)
Result of lake - sp

In [31]:
result = wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(f"Result of 'king' - 'man' + 'woman': {result[0][0]} ({result[0][1]})")

Result of 'king' - 'man' + 'woman': queen (0.7118193507194519)
