In [1]:
import json, nltk, re
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import gensim.downloader as api
import nlpaug.augmenter.char as nac
from tqdm import tqdm

In [2]:
df = pd.read_csv('./clean_dataset.csv')
df['code'] = df['code'].apply(lambda code: code.replace(" ", ""))

In [13]:
def word_augmentation(df, n):
    aug = nac.KeyboardAug()
    aug_df = pd.DataFrame(columns=['input','target','code'])
    
    for i in tqdm(range(len(df))):
        words = df.iloc[i]
        for j in range(n):
            augmented_data = aug.augment(words["input"])
            aug_df = aug_df.append({ "input": augmented_data, "target": words["target"], "code": words["code"] }, ignore_index=True)
    
    return df.append(aug_df)

In [14]:
# Data augmentation
print(len(df))
df = word_augmentation(df, n=3)
print(len(df))

  0%|          | 0/360 [00:00<?, ?it/s]

360


100%|██████████| 360/360 [02:00<00:00,  2.99it/s]

1440





In [15]:
sentences = []
for index, row in tqdm(df.iterrows()):
    words = nltk.word_tokenize(row['input']) + nltk.word_tokenize(row['target']) + nltk.word_tokenize(row['code'])
    words = [re.sub("[^A-Za-z']+", ' ', str(word)).lower() for word in words]
    sentences.append(words)

1440it [00:00, 3426.14it/s]


### Create a Word2Vec vector space

In [16]:
# define the model
w2v = Word2Vec(size = 300, window=5, min_count = 1, workers = 2)
w2v.build_vocab(sentences)

# summarize the loaded model
# print(w2v)

# summarize vocabulary
# words = list(model.wv.vocab)
# print(words)

# save model
# w2v.save('w2v.bin')

In [17]:
w2v.intersect_word2vec_format(
    './GoogleNews-vectors-negative300.bin.gz',
    lockf=1.0,
    binary=True)

In [18]:
# train the model on the dataset
w2v.train(sentences,
          total_examples=w2v.corpus_count,
          epochs=300,
          report_delay=1)

(1578927, 1842000)

In [19]:
w2v.most_similar(positive=["brest"])

  """Entry point for launching an IPython kernel.


[('frbes', 0.9355872869491577),
 ('bgest', 0.9262894988059998),
 ('f ', 0.8307290077209473),
 ('brect', 0.8271465301513672),
 ('brezt', 0.8172892332077026),
 ('nrest', 0.7881510853767395),
 ('brext', 0.7849330902099609),
 ('bres ', 0.7788300514221191),
 ('brwst', 0.7677716016769409),
 ('francw', 0.755815863609314)]

In [21]:
# new_sentences = [["petersbourg","stpetersburg","petersburg"]]
new_sentences = [["petersbourg"]]
w2v.build_vocab(new_sentences, update=True)
w2v.train(new_sentences, total_examples=w2v.corpus_count, epochs=200)

(7, 200)

In [22]:
w2v.most_similar(positive=["petersbourg"])

  """Entry point for launching an IPython kernel.


[('rout', 0.08573538064956665),
 ('liver ooo', 0.07342198491096497),
 ('leter', 0.07035710662603378),
 ('lkgerpool', 0.0669819787144661),
 ('liverpool', 0.059646494686603546),
 ('grpir', 0.05801961570978165),
 (' ilhury', 0.05700842663645744),
 ('police', 0.05463198944926262),
 ('poloce', 0.05395359918475151),
 (' ilvury', 0.053714342415332794)]

In [27]:
df.iloc[:10]

Unnamed: 0,input,target,code
0,BREST,BREST,FRBES
1,RADE DE BREST,BREST,FRBES
2,LA TURBALLE,LA TURBALLE,FRTBE
3,DOUARNENEZ,DOUARNENEZ,FRDRZ
4,FELIXSTOWE,FELIXSTOWE,GBFXT
5,CONCARNEAU,CONCARNEAU,FRCOC
6,PORT SAID,PORT SAID,EGPSD
7,LE GUILVINEC,GUILVINEC,FRGVC
8,TANGIER,TANGIER,MATNG
9,SANTANDER,SANTANDER,ESSDR
