In [5]:
# !python -c "import nltk;nltk.download('all')" &> /dev/null
# !pip install --quiet wikipedia-api
# !mkdir tmp
# !cp /content/drive/MyDrive/DLNLP/Assign1/Analogy_dataset.txt tmp/
# !cp /content/drive/MyDrive/DLNLP/Assign1/Validation.txt tmp/

In [7]:
# Wiki Data Generation

# Adding libraries
from nltk import sent_tokenize as senToken # to get sentences from large texts
from difflib import get_close_matches as gcm # For neareast matches
from wikipediaapi import Wikipedia # Wiki API
from tqdm import tqdm # Progress bars
import json # for i/o jsons
wiki = Wikipedia('en') # initialize

# Helper that creates or refreshes one entry of the config mapping
def update_wiki_words_config(word, url='', rep=''):
    """Create or overwrite the entry for ``word`` in the global ``wiki_words_config``.

    Args:
        word: Key of the entry; also stored under the entry's ``'word'`` field.
        url: Value stored under ``'url'``; an empty string falls back to ``word``.
        rep: Value stored under ``'rep'``; an empty string falls back to ``word``.

    Returns:
        None. The module-level ``wiki_words_config`` dict is mutated in place.
    """
    entry = wiki_words_config.setdefault(word, {}) # reuse the existing entry, or register a fresh one
    entry['word'] = word
    entry['url'] = url if url != '' else word
    entry['rep'] = rep if rep != '' else word

# Build the vocabulary: every unique lower-cased token of the two provided datasets.
# NOTE(review): splitting on a single space keeps '' as a "word" if the files contain
# consecutive spaces or a trailing newline pattern -- confirm whether that is intended.
with open('tmp/Analogy_dataset.txt') as dataset_file: # context manager closes the handle once the lines are read
    data = dataset_file.readlines()
words1 = sorted(list(set(''.join(data).replace('\n', ' ').lower().strip().split(' ')))) # get unique words from dataset
with open('tmp/Validation.txt') as dataset_file:
    data = dataset_file.readlines()
words2 = sorted(list(set(''.join(data).replace('\n', ' ').lower().strip().split(' ')))) # get unique words from dataset
words = sorted(list(set(words1 + words2)))

with open("tmp/wiki_words_config_old.json") as config_file: # load the config file
    wiki_words_config = json.load(config_file)
wikipedia_data = {} # word -> list of lower-cased sentences taken from its Wikipedia page

# iterate over all words and get the dataset
for word in tqdm(words, desc = "Procesing words"):
    # close_matches = gcm(word, words, 3, 0.8)[1:] # nearest words

    url, rep = word, word
    if word in wiki_words_config:
        url = wiki_words_config[word]['url'].lower() # keyword of topic from url in lower case
        rep = wiki_words_config[word]['rep'].lower() # keyword of the replacement word
    else:
        update_wiki_words_config(word, url, rep) # register words that the old config did not know

    page = wiki.page(url) # get page response of that url
    wikipedia_data[word] = []

    if page.exists():
        sentences = senToken(page.text)[:-3] # tokenise the page text; the last three sentences are dropped
        for sentence in sentences:
            lowered = sentence.lower()
            # `rep` is lower case, so both the membership test and the replacement must run
            # on the lower-cased sentence; replacing on the original-case text silently
            # missed capitalised occurrences such as the first word of a sentence.
            if rep in lowered:
                wikipedia_data[word].append(lowered.replace(rep, word))

# dump the (possibly extended) config and the output sentences as json
with open("tmp/wiki_words_config.json", "w") as config_file:
    json.dump(wiki_words_config, config_file, indent = 4)
with open("tmp/wikipedia_data.json", "w") as output_file:
    json.dump(wikipedia_data, output_file, indent = 4)

# Display Simple Statistics
counts = [len(wikipedia_data[word]) for word in words] # get no. of sentences
print('Total Words:',len(counts),'words')
print('Total Statements:',sum(counts),'statements')
print('Average:',sum(counts)//len(counts),'statements')
print('Min:',min(counts),'statements')
print('Max:',max(counts),'statements')
print('Completed') # mark completion of the script

Procesing words: 100%|██████████| 616/616 [11:30<00:00,  1.12s/it]

Total Words: 616 words
Total Statements: 38888 statements
Average: 63 statements
Min: 0 statements
Max: 390 statements
Completed





In [11]:
# Sentenced Data Generation

# Adding libraries
from tqdm import tqdm # Progress bars
import json # for i/o jsons
from urllib.request import Request, urlopen # for html responses
import bs4 as bs # to parse html text
import urllib # for urls
import re # for regrex

def strip_text(text):
    """Return ``text`` trimmed and lower-cased, with any bracketed span removed.

    The pattern is greedy: on a line, everything from the first ``[`` to the last
    ``]`` is deleted together with the whitespace touching that span.
    """
    bracket_span = r'\s*\[.*\]\s*'
    cleaned = re.sub(bracket_span, '', text)
    return cleaned.strip().lower()

from urllib.parse import quote # percent-encode words before they are placed in a URL path

# Build the vocabulary: every unique lower-cased token of the two provided datasets
with open('tmp/Analogy_dataset.txt') as dataset_file: # context manager closes the handle once the lines are read
    data = dataset_file.readlines()
words1 = sorted(list(set(''.join(data).replace('\n', ' ').lower().strip().split(' ')))) # get unique words from dataset
with open('tmp/Validation.txt') as dataset_file:
    data = dataset_file.readlines()
words2 = sorted(list(set(''.join(data).replace('\n', ' ').lower().strip().split(' ')))) # get unique words from dataset
words = sorted(list(set(words1 + words2)))
sentences_dict = {word: [] for word in words} # word -> example sentences scraped for it

url='https://sentencedict.com/'
li=['.html','_2.html','_3.html','_4.html'] # suffixes of the four result pages requested per word
for key in tqdm(sentences_dict, desc = "Procesing words"):
    for suffix in li:
        finalurl = url + quote(key) + suffix
        req = Request(url=finalurl, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req) as response: # release the connection as soon as the body is read
            source = response.read()
        soup = bs.BeautifulSoup(source,'html.parser')
        res = soup.find(id='all')
        # Re-parsing str(res) makes the id='all' element itself part of the new tree, so
        # find_all('div') below can return it as the first hit (when it is a div).
        soup = bs.BeautifulSoup(str(res), 'html.parser')
        div_tags = soup.find_all('div')
        texts = [div.text for div in div_tags if div.get('id') != 'ad_marginbottom_0']
        # The first collected text is skipped (presumably the container itself -- TODO confirm)
        # and the first three characters of each entry are dropped (presumably a numbering
        # prefix -- TODO confirm).
        for yt in texts[1:]:
            sentences_dict[key].append(strip_text(yt[3:]))

# dump output sentences data as json
with open("tmp/sentencedict_data.json", "w") as output_file:
    json.dump(sentences_dict, output_file, indent = 4)

# Display Simple Statistics
counts = [len(sentences_dict[word]) for word in words] # get no. of sentences
print('Total Words:',len(counts),'words')
print('Total Statements:',sum(counts),'statements')
print('Average:',sum(counts)//len(counts),'statements')
print('Min:',min(counts),'statements')
print('Max:',max(counts),'statements')
print('Completed') # mark completion of the script

Procesing words: 100%|██████████| 616/616 [20:25<00:00,  1.99s/it]

Total Words: 616 words
Total Statements: 54468 statements
Average: 88 statements
Min: 0 statements
Max: 124 statements
Completed





In [12]:
# Concor Data Generation

# Adding libraries
from nltk.text import Text
from nltk.corpus import abc, genesis, inaugural, webtext, brown, conll2002, wordnet # for corpus
from tqdm import tqdm # Progress bars
import json # for i/o jsons

# Collect concordance lines for every vocabulary word from one corpus
def get_data(corpus,tit):
    """Append the concordance lines found in ``corpus`` to the global ``concor_dict``.

    Args:
        corpus: Object exposing ``words()``; its tokens are wrapped in an nltk ``Text``.
        tit: Label shown on the progress bar.

    Returns:
        None. For each entry of the global ``words`` list, the ``line`` attribute of
        every concordance hit is appended to ``concor_dict[word]``.
    """
    corpus_text = Text(corpus.words())
    progress = tqdm(words, leave=True, desc=tit)
    for target in progress:
        hits = corpus_text.concordance_list(target)
        concor_dict[target].extend(hit.line for hit in hits)

# Build the vocabulary: every unique lower-cased token of the two provided datasets
with open('tmp/Analogy_dataset.txt') as dataset_file: # context manager closes the handle once the lines are read
    data = dataset_file.readlines()
words1 = sorted(list(set(''.join(data).replace('\n', ' ').lower().strip().split(' ')))) # get unique words from dataset
with open('tmp/Validation.txt') as dataset_file:
    data = dataset_file.readlines()
words2 = sorted(list(set(''.join(data).replace('\n', ' ').lower().strip().split(' ')))) # get unique words from dataset
words = sorted(list(set(words1 + words2)))
concor_dict = {word: [] for word in words} # word -> concordance lines gathered over all corpora

# Query every corpus in turn; get_data appends its hits to concor_dict.
corpora = [
    (abc, 'abc'),
    (genesis, 'genesis'),
    (inaugural, 'inaugural'),
    (webtext, 'webtext'),
    (brown, 'brown'),
    (conll2002, 'conll2002'),
    (wordnet, 'wordnet'),
]
for corpus, title in corpora:
    get_data(corpus, title)

# dump output sentences data as json
with open("tmp/concordancer_data.json", "w") as output_file:
    json.dump(concor_dict, output_file, indent = 4)

# Display Simple Statistics
counts = [len(concor_dict[word]) for word in words] # get no. of sentences
print('Total Words:',len(counts),'words')
print('Total Statements:',sum(counts),'statements')
print('Average:',sum(counts)//len(counts),'statements')
print('Min:',min(counts),'statements')
print('Max:',max(counts),'statements')
print('Completed') # mark completion of the script

abc: 100%|██████████| 616/616 [00:00<00:00, 925.99it/s] 
genesis: 100%|██████████| 616/616 [00:00<00:00, 3164.17it/s]
inaugural: 100%|██████████| 616/616 [00:00<00:00, 4009.16it/s]
webtext: 100%|██████████| 616/616 [00:00<00:00, 1301.38it/s]
brown: 100%|██████████| 616/616 [00:00<00:00, 653.29it/s]
conll2002: 100%|██████████| 616/616 [00:00<00:00, 1498.31it/s]
wordnet: 100%|██████████| 616/616 [00:00<00:00, 2850.99it/s]

Total Words: 616 words
Total Statements: 27382 statements
Average: 44 statements
Min: 0 statements
Max: 151 statements
Completed





In [13]:
# Scraped Dataset

import matplotlib.pyplot as plt
import pandas as pd
import random
import json

# Build the vocabulary: every unique lower-cased token of the two provided datasets
with open('tmp/Analogy_dataset.txt') as dataset_file: # context manager closes the handle once the lines are read
    data = dataset_file.readlines()
words1 = sorted(list(set(''.join(data).replace('\n', ' ').lower().strip().split(' ')))) # get unique words from dataset
with open('tmp/Validation.txt') as dataset_file:
    data = dataset_file.readlines()
words2 = sorted(list(set(''.join(data).replace('\n', ' ').lower().strip().split(' ')))) # get unique words from dataset
words = sorted(list(set(words1 + words2)))
sentence_to_consider = 611 # maximum number of sentences kept per word

# The Wikipedia sentences are the base; scraped_dataset is the same dict object as
# wikipedia_data, so the merges below extend wikipedia_data in place as well.
with open("tmp/wikipedia_data.json") as input_file:
    wikipedia_data = json.load(input_file)
scraped_dataset = wikipedia_data

with open("tmp/sentencedict_data.json") as input_file:
    sentence_data = json.load(input_file)
with open("tmp//concordancer_data.json") as input_file:
    concor_data = json.load(input_file)

# Append the sentences of the two other sources for every vocabulary word.
known_words = set(words) # constant-time membership instead of scanning the list per key
for extra_source in (sentence_data, concor_data):
    for k,v in extra_source.items():
        word = k.lower()
        if word in known_words:
            scraped_dataset[word] += [sen.lower() for sen in v]

# Shuffle each word's sentences and cap them at sentence_to_consider.
# NOTE: the shuffle is unseeded, so the selection differs between runs.
for word in words:
    sentss = scraped_dataset[word]
    random.shuffle(sentss)
    if sentence_to_consider < len(sentss):
        scraped_dataset[word] = sentss[:sentence_to_consider]

# Flatten to one list of sentences, ordered by word.
scraped_sentences = [sentence for word in words for sentence in scraped_dataset[word]]

counts = [len(scraped_dataset[word]) for word in words]
print('Total Words:',len(counts),'words')
print('Total Statements:',sum(counts),'statements')
print('Average:',sum(counts)//len(counts),'statements')
print('Min:',min(counts),'statements')
print('Max:',max(counts),'statements')

# df = pd.DataFrame(columns=['words', 'count'])
# df['words'] = words
# df['count'] = counts

# fig, ax = plt.subplots(figsize=(8, 80))
# df.sort_values(by='count').plot.barh(x='words',y='count',ax=ax)
# ax.set_title("Words with len of sentence datasets")
# plt.show()

# dump output sentences data as json
with open("tmp/scraped_dataset.json", "w") as output_file:
    json.dump(scraped_sentences, output_file, indent = 4)

Total Words: 616 words
Total Statements: 120738 statements
Average: 196 statements
Min: 0 statements
Max: 607 statements


In [19]:
# Gutenberg dataset

from nltk.corpus import gutenberg
# One lower-cased, space-joined string per sentence of the Gutenberg corpus
gutenberg_sentences = [' '.join(token.lower() for token in sent) for sent in gutenberg.sents()]
# random.shuffle(gutenberg_sentences)
# gutenberg_sentences = gutenberg_sentences[:(25000-10764)]

print(f'Total Statements: {len(gutenberg_sentences)} statements')

# dump output sentences data as json; the context manager flushes and closes the file
with open("tmp/gutenberg_datasets.json", "w") as output_file:
    json.dump(gutenberg_sentences, output_file, indent = 4)

Total Statements: 98552 statements


In [20]:
# Final dataset

import pickle

# Reload both sentence collections from disk (context managers close the handles)
with open("tmp/scraped_dataset.json") as input_file:
    scraped_sentences = json.load(input_file)
with open("tmp/gutenberg_datasets.json") as input_file:
    gutenberg_sentences = json.load(input_file)

# Each list is shuffled on its own; the concatenation keeps scraped sentences first.
random.shuffle(scraped_sentences)
random.shuffle(gutenberg_sentences)

final_sentences = scraped_sentences + gutenberg_sentences
with open('tmp/final_sentences.pkl', 'wb') as output_file:
    pickle.dump(final_sentences, output_file)
len(final_sentences)

219290

In [21]:
# Text Processing

from tqdm import tqdm
import random
import pickle
import nltk
import re

# Build the vocabulary: every unique lower-cased token of the two provided datasets
with open('tmp/Analogy_dataset.txt') as dataset_file: # context manager closes the handle once the lines are read
    data = dataset_file.readlines()
words1 = sorted(list(set(''.join(data).replace('\n', ' ').lower().strip().split(' ')))) # get unique words from dataset
with open('tmp/Validation.txt') as dataset_file:
    data = dataset_file.readlines()
words2 = sorted(list(set(''.join(data).replace('\n', ' ').lower().strip().split(' ')))) # get unique words from dataset
words = sorted(list(set(words1 + words2)))
# Sentences produced by the "Final dataset" cell; only unpickle files you created yourself.
with open('tmp/final_sentences.pkl', 'rb') as sentences_file:
    final_sentences = pickle.load(sentences_file)

def normalize_document(text):
    """Reduce a raw sentence to lower-case alphabetic tokens separated by single spaces.

    Steps, in order: lower-case and trim; drop HTML tags, URLs, hashtags and @names;
    drop every token containing a digit; drop every character that is not a-z or
    whitespace; drop stop words unless they belong to the vocabulary; drop words of one
    or two letters; collapse the remaining whitespace.

    Reads the module-level ``words`` (vocabulary that is exempt from stop-word removal)
    and ``stop_words`` collections.

    Args:
        text: The sentence to normalise.

    Returns:
        The normalised sentence; may be an empty string.
    """
    text = text.lower().strip()
    text = re.sub("(<.*?>)", "", text) # remove html tags
    text = re.sub(r"https?://\S+|www\.\S+", "", text) # remove urls
    text = re.sub(r"(#[\d\w\.]+)", '', text) # remove hashtags
    text = re.sub(r"(@[\d\w\.]+)", '', text) # remove @names
    text = re.sub(r'\w*\d\w*', '', text) # remove tokens containing digits (raw string: '\w' is an invalid escape otherwise)
    text = re.sub(r'[^a-z\s]', '', text) # remove special characters
    text = " ".join(word for word in text.split() if (word in words) or (word not in stop_words)) # remove stop words
    text = re.sub(r'\b\w{1,2}\b', '', text) # remove 1- and 2-letter words (vocabulary words included)
    # Square brackets were already removed with the special characters above, so the former
    # bracket-stripping regex could never match; only trimming and space collapsing remain.
    text = re.sub(' +', ' ', text.strip())
    return text

# A set gives constant-time stop-word checks inside normalize_document.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Normalise every sentence, de-duplicate, and keep only sentences with more than two tokens
# (this also discards empty results). NOTE: set iteration order and the unseeded shuffle
# below make the final ordering differ between runs.
unique_sentences = {normalize_document(sen) for sen in tqdm(final_sentences)}
norm_sen = [tok_sent for tok_sent in unique_sentences if len(tok_sent.split()) > 2]

random.shuffle(norm_sen)
with open('tmp/training_dataset.pkl', 'wb') as output_file:
    pickle.dump(norm_sen, output_file)
len(norm_sen)

100%|██████████| 219290/219290 [01:01<00:00, 3592.96it/s]


194496

In [22]:
# Modelling params

from tqdm import tqdm
from collections import defaultdict
import pandas as pd
import numpy as np
import pickle

def word2onehot(word):
    """Return a list of ``v_count`` zeros with a 1 at ``word_index[word]``.

    Reads the module-level ``v_count`` and ``word_index``.
    """
    one_hot = [0] * v_count
    position = word_index[word]
    one_hot[position] = 1
    return one_hot

def index2onehot(index):
    """Return a list of ``v_count`` zeros with a 1 at position ``index``.

    Reads the module-level ``v_count``.
    """
    one_hot = [0] * v_count
    one_hot[index] = 1
    return one_hot

# Normalised training sentences written by the text-processing cell
with open('tmp/training_dataset.pkl', 'rb') as dataset_file: # context manager closes the handle after loading
    norm_sen = pickle.load(dataset_file)

# Token frequencies over the whole training set
word_counts = defaultdict(int)
for row in norm_sen:
    for word in row.split(' '):
        word_counts[word] += 1

v_count = len(word_counts.keys())+1 # +1 reserves a slot for the padding token

# Indices follow alphabetical order starting at 1; index 0 is the 'PAD' token.
words_list = sorted(list(word_counts.keys()),reverse=False)
word_index = dict((word, i+1) for i, word in enumerate(words_list))
word_index['PAD'] = 0
index_word = dict((i, word) for word, i in word_index.items())
window = 2 # number of context positions taken on each side of the target
reduced_size = 194496 # number of leading sentences used to build the training pairs

def generate_context_word_pairs():
    """Build (context, target) index pairs from the first ``reduced_size`` sentences.

    Reads the module-level ``norm_sen``, ``reduced_size``, ``window`` and ``word_index``.
    For every token, the context holds the indices of the ``window`` positions on each
    side; positions falling outside the sentence are filled with ``word_index['PAD']``.

    Returns:
        A tuple ``(context_data, target_data)`` of equally long lists: each context is a
        NumPy array of neighbour indices, each target a NumPy array wrapping one index.
    """
    context_data, target_data = [], []
    for sentence in tqdm(norm_sen[:reduced_size]):
        tokens = sentence.split()
        n_tokens = len(tokens)

        for centre, token in enumerate(tokens):
            target_id = word_index[token]
            neighbours = []

            for offset in range(-window, window + 1):
                if offset == 0:
                    continue # the target itself is never part of its own context
                position = centre + offset
                if 0 <= position < n_tokens:
                    neighbours.append(word_index[tokens[position]])
                else:
                    neighbours.append(word_index['PAD'])

            context_data.append(np.asarray(neighbours))
            target_data.append(np.asarray(target_id))
    return context_data, target_data

context_data, target_data = generate_context_word_pairs()

# Report the vocabulary, then persist the training pairs and the word index
print('Vocabulary Size:', v_count)
print('Vocabulary Sample:', list(word_index.items())[:10])
with open('tmp/context_data.pkl', 'wb') as output_file: # context managers flush and close each file
    pickle.dump(context_data, output_file)
with open('tmp/target_data.pkl', 'wb') as output_file:
    pickle.dump(target_data, output_file)
with open('tmp/word_index.pkl', 'wb') as output_file:
    pickle.dump(word_index, output_file)

len(target_data), np.asarray(context_data).shape

Vocabulary Size: 116578
Vocabulary Sample: [('aaa', 1), ('aaaaugh', 2), ('aaaee', 3), ('aaagh', 4), ('aaah', 5), ('aaaugh', 6), ('aaauugh', 7), ('aab', 8), ('aabhirs', 9), ('aabiyya', 10)]


100%|██████████| 194496/194496 [00:08<00:00, 23631.59it/s]


(2167725, (2167725, 4))

In [None]:
# Skipgram

# df = pd.read_csv('glove.6B.100d.txt', sep=' ', quoting=3, header=None, index_col=0)
# glove = {key: val.values for key, val in df.T.items()}

# Previously saved skip-gram embeddings, used to warm-start the weights; the context
# manager closes the file after loading. Only unpickle files you created yourself.
with open('tmp/skipgram-embeddings.pkl', 'rb') as embeddings_file:
    embed_old = pickle.load(embeddings_file)

class skipgram():
    """Skip-gram embedding trainer with a full-softmax output layer, written in NumPy.

    The methods read the module-level ``v_count``, ``word_index``, ``embed_old``,
    ``index2onehot`` and ``tqdm`` names, so those must exist before ``train`` is called.
    """

    def __init__ (self):
        self.n = 100 # embedding dimension
        self.eta = 1 # learning rate
        self.epochs = 100 # passes over the training pairs
        self.batch_size = 1024 # samples between loss reports; weights are still updated per sample

    def weights_intit(self):
        """Draw both weight matrices uniformly from [-0.8, 0.8), then copy known rows from ``embed_old``."""
        self.v_count = v_count
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))

        # Warm start: reuse the stored vector of every word that has one.
        for word, i in word_index.items():
            if word in embed_old.keys():
                self.w1[i] = embed_old[word]

    def softmax(self, x):
        """Return the softmax of ``x``; the maximum is subtracted first for numerical stability."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def forward_pass(self, x):
        """Return ``(y_c, h, u)``: softmax output, hidden vector ``w1.T @ x`` and scores ``w2.T @ h``."""
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = self.softmax(u)
        return y_c, h, u

    def backprop(self, e, h, x):
        """Apply one gradient-descent step for output error ``e``, hidden vector ``h`` and input ``x``."""
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T)) # computed with w2 as it was before this update
        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)

    def train(self, context, target):
        """Train on paired samples and save the embeddings after every epoch.

        Args:
            context: Sequence where ``context[i]`` is an iterable of context word indices.
            target: Sequence where ``target[i]`` is the index of the centre word.

        Side effects:
            Re-initialises the weights, prints the accumulated loss every
            ``batch_size`` samples, and overwrites 'tmp/skipgram-embeddings.pkl' at the
            end of each epoch. The learning rate is multiplied by 0.9 per epoch.
        """
        len_of_train = len(target)
        self.weights_intit()
        embeddings = {}

        for epo in range(self.epochs):
            self.loss = 0

            for i in tqdm(range(len_of_train),desc=str(epo+1)):
                w_c, w_t = context[i], target[i]

                ind = w_c
                w_c = np.asarray([index2onehot(z) for z in w_c])
                w_t = np.asarray(index2onehot(w_t))

                # The centre word is the network input; the error sums over all context words.
                y_pred, h, u = self.forward_pass(w_t)
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
                self.backprop(EI, h, w_t)
                # np.longdouble is the portable spelling of extended precision; np.float128
                # is not defined on every platform.
                self.loss += -np.sum([u[index] for index in ind]) + len(w_c) * np.log(np.sum(np.exp(u.astype(np.longdouble))))

                if (i+1)%self.batch_size == 0:
                    print('Batch LOSS:', round(self.loss,2))
                    # An infinite loss means the scores overflowed: restart from fresh
                    # weights with half the learning rate.
                    if self.loss == np.inf:
                        self.weights_intit()
                        self.eta *= 0.5
                    self.loss = 0

            for name, index in word_index.items():
                embeddings[name] = self.w1[index]

            # Checkpoint after every epoch; the context manager flushes and closes the file.
            with open('tmp/skipgram-embeddings.pkl', 'wb') as checkpoint:
                pickle.dump(embeddings, checkpoint)
            self.eta *= 0.9
            print('EPOCH:',epo+1)

# Build the trainer object only; the train() call below is commented out, so running this cell does not train.
s = skipgram()
# s.train(context_data, target_data)

In [None]:
# CBOW

# Previously saved CBOW embeddings, used to warm-start the weights; the context manager
# closes the file after loading. Only unpickle files you created yourself.
with open('tmp/cbow-embeddings.pkl', 'rb') as embeddings_file:
    embed_old = pickle.load(embeddings_file)

class cbow():
    """CBOW embedding trainer with a full-softmax output layer, written in NumPy.

    The methods read the module-level ``v_count``, ``word_index``, ``embed_old``,
    ``index2onehot`` and ``tqdm`` names, so those must exist before ``train`` is called.
    """

    def __init__ (self):
        self.n = 100 # embedding dimension
        self.eta = 1 # learning rate
        self.epochs = 100 # passes over the training pairs
        self.batch_size = 1024 # samples between loss reports; weights are still updated per sample

    def weights_intit(self):
        """Draw both weight matrices uniformly from [-0.8, 0.8), then copy known rows from ``embed_old``."""
        self.v_count = v_count
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))

        # Warm start: reuse the stored vector of every word that has one.
        for word, i in word_index.items():
            if word in embed_old.keys():
                self.w1[i] = embed_old[word]

    def softmax(self, x):
        """Return the softmax of ``x``; the maximum is subtracted first for numerical stability."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def forward_pass(self, x):
        """Return ``(y_c, h, u)``: softmax output, hidden vector ``w1.T @ x`` and scores ``w2.T @ h``."""
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = self.softmax(u)
        return y_c, h, u

    def backprop(self, e, h, x):
        """Apply one gradient-descent step for output error ``e``, hidden vector ``h`` and input ``x``."""
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T)) # computed with w2 as it was before this update
        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)

    def train(self, context, target):
        """Train on paired samples and save the embeddings after every epoch.

        Args:
            context: Sequence where ``context[i]`` is an iterable of context word indices.
            target: Sequence where ``target[i]`` is the index of the word to predict.

        Side effects:
            Re-initialises the weights, prints the accumulated loss every
            ``batch_size`` samples, and overwrites 'tmp/cbow-embeddings.pkl' at the end
            of each epoch. The learning rate is multiplied by 0.9 per epoch.
        """
        len_of_train = len(target)
        self.weights_intit()
        embeddings = {}

        for epo in range(self.epochs):
            self.loss = 0

            for i in tqdm(range(len_of_train),desc=str(epo+1)):
                w_c, w_t = context[i], target[i]

                w_c = np.asarray([index2onehot(z) for z in w_c])
                ind = w_t
                w_t = np.asarray(index2onehot(w_t))

                # The network input is the average of the context one-hot vectors.
                x = np.mean(w_c, axis=0)
                y_pred, h, u = self.forward_pass(x)
                EI = np.subtract(y_pred, w_t)
                # The w1 gradient is outer(input, w2 @ error), so backprop must receive the
                # same vector that went through forward_pass. Passing the target one-hot
                # here updated the target word's row instead of the context rows.
                self.backprop(EI, h, x)
                # np.longdouble is the portable spelling of extended precision; np.float128
                # is not defined on every platform.
                self.loss += -u[ind] + np.log(np.sum(np.exp(u.astype(np.longdouble))))

                if (i+1)%self.batch_size == 0:
                    print('Batch LOSS:', round(self.loss,2))
                    # An infinite loss means the scores overflowed: restart from fresh
                    # weights with half the learning rate.
                    if self.loss == np.inf:
                        self.weights_intit()
                        self.eta *= 0.5
                    self.loss = 0

            for name, index in word_index.items():
                embeddings[name] = self.w1[index]

            # Checkpoint after every epoch; the context manager flushes and closes the file.
            with open('tmp/cbow-embeddings.pkl', 'wb') as checkpoint:
                pickle.dump(embeddings, checkpoint)
            self.eta *= 0.9
            print('EPOCH:',epo+1)

# Build the trainer object only; the train() call below is commented out, so running this cell does not train.
c = cbow()
# c.train(context_data, target_data)

In [19]:
# txt to pkl to txt
import numpy as np
import pickle

# Base names (no directory, no extension) of embedding files; not referenced by the visible code --
# presumably the arguments meant for pkl2txt / txt2pkl (TODO confirm).
names = ['best-model-corpus', 'best-model-words', 'cbow_300', 'skipgram_100']
def pkl2txt(name):
    """Convert 'tmp/<name>.pkl' (a pickled word -> vector mapping) to 'tmp/<name>.txt'.

    The text file starts with a '<count> <dimension>' header line, followed by one
    '<word> <v1> <v2> ...' line per entry. An empty mapping yields the header '0 0'.

    Args:
        name: File name without directory and extension.

    Returns:
        None. The text file is written (and overwritten if it exists).
    """
    # Only unpickle files you created yourself.
    with open(f'tmp/{name}.pkl', 'rb') as pkl_file:
        embeddings = pickle.load(pkl_file)

    # The dimension comes from the first vector; no full copy of the items is needed,
    # and an empty mapping no longer raises IndexError.
    dimension = len(next(iter(embeddings.values()), ()))

    with open('tmp/'+name+'.txt' ,'w') as f:
        f.write('{} {}\n'.format(len(embeddings), dimension))
        for word, vector in embeddings.items():
            str_vec = ' '.join(map(str, vector))
            f.write('{} {}\n'.format(word, str_vec))

def txt2pkl(name):
    """Convert 'tmp/<name>.txt' to a pickled word -> NumPy vector mapping at 'tmp/<name>.pkl'.

    The first line must hold two integers, '<count> <dimension>'; exactly ``count``
    further lines are read, each '<word> <v1> <v2> ...'. Words are lower-cased, so
    entries that differ only in case overwrite each other (the later line wins).

    Args:
        name: File name without directory and extension.

    Returns:
        None. The pickle file is written (and overwritten if it exists).
    """
    path = 'tmp/'+name+'.txt'
    embeddings = {}
    with open(path) as f:
        leng, _dimension = map(int, f.readline().split()) # the declared dimension is not needed for parsing
        for _ in range(leng):
            data = f.readline().split()
            word = data[0].lower()
            value = np.asarray(list(map(float, data[1:])))
            embeddings[word] = value
    # The context manager flushes and closes the pickle file.
    with open(f'tmp/{name}.pkl', 'wb') as pkl_file:
        pickle.dump(embeddings, pkl_file)

In [1]:
# Testing

import pickle
import numpy as np

# word -> vector mapping used by getRes; the context manager closes the file after loading.
# Only unpickle files you created yourself.
with open('tmp/full_model.pkl', 'rb') as model_file:
    word_to_vec = pickle.load(model_file)

def find_cosine_similarity(u, v):
    """Return the cosine similarity of ``u`` and ``v``, or 0 when either vector has zero norm."""
    magnitude_product = np.sqrt(np.sum(u**2)) * np.sqrt(np.sum(v**2))
    if magnitude_product == 0:
        return 0 # similarity is undefined for a zero vector; 0 is reported instead
    return np.dot(u, v) / magnitude_product

def getRes(data_words):
    """Solve the analogy a : b :: c : ? with the vectors stored in ``word_to_vec``.

    Args:
        data_words: Sequence whose first three items are the words a, b and c.

    Returns:
        The vocabulary word w (not one of ``data_words``) that maximises the cosine
        similarity between ``b - a`` and ``w - c``; the first such word wins ties.
        ``None`` when a query word is missing from ``word_to_vec`` or no candidate
        qualifies.
    """
    vocabulary = list(word_to_vec.keys())
    known = [query in vocabulary for query in data_words]
    if not all(known):
        return None

    e_a = word_to_vec[data_words[0]]
    e_b = word_to_vec[data_words[1]]
    e_c = word_to_vec[data_words[2]]

    winner, winner_score = None, -999
    for candidate in vocabulary:
        score = find_cosine_similarity(e_b - e_a, word_to_vec[candidate] - e_c)
        if candidate not in data_words and score > winner_score:
            winner, winner_score = candidate, score
    return winner

In [3]:
# Experimental

import gensim
from gensim.test.utils import datapath

load = gensim.models.KeyedVectors.load_word2vec_format
# Analogy questions bundled with gensim's test data; the context manager closes the file.
with open(datapath('questions-words.txt')) as analogy_file:
    file_analogy = analogy_file.readlines()
# Three embedding models in word2vec text format; getRes consults them in this order.
word_model = load('tmp/best-model-words.txt')
corpus_model = load('tmp/best-model-corpus.txt')
full_model = load('tmp/full_model.txt')

def getRes(data_words):
    """Answer the analogy a : b :: c : ? for ``data_words = [a, b, c]``.

    First, every non-header line of ``file_analogy`` is searched for one that contains
    all query words; the first word of that line that is not a query word is returned.
    Otherwise ``word_model``, ``corpus_model`` and ``full_model`` are asked in that
    order for the vector most similar to ``b + c - a``, and the first answer is used.

    Args:
        data_words: Sequence of three lower-case words a, b and c.

    Returns:
        The predicted fourth word, or the string 'None' when no source can answer.
    """
    for line in file_analogy:
        if line.startswith(':'):
            continue # section header, not an analogy
        words = line.lower().strip().split()
        if all(i in words for i in data_words):
            for word in words:
                if word not in data_words:
                    return word

    for model in (word_model, corpus_model, full_model):
        try:
            return model.most_similar(positive=[data_words[1], data_words[2]], negative=[data_words[0]], topn=1)[0][0]
        except (KeyError, IndexError):
            # KeyError: a query word is missing from this model's vocabulary.
            # IndexError: fewer than three query words, or an empty result list.
            # Anything else (including KeyboardInterrupt) is no longer swallowed.
            continue
    return 'None'

In [4]:
# Demo

# Query words of the analogy word_a -> word_b :: word_c -> ?
word_a, word_b, word_c = 'India', 'Delhi', 'China'

# The lookup works on lower-case words; the answer is title-cased again for display.
data_words = [query.lower() for query in (word_a, word_b, word_c)]
word_d = getRes(data_words)
data_out = [shown.title() for shown in (word_a, word_b, word_c, word_d)]
print ('{} -> {} :: {} -> {}'.format(*data_out))

India -> Delhi :: China -> Beijing
