<a href="https://colab.research.google.com/github/Daniel-Yao-Chengdu/NLP-project/blob/master/Load_static_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pretrained vectors here
* https://developer.syn.co.in/tutorial/bot/oscova/pretrained-vectors.html
* we can download different vectors from the link above

# Fasttext embedding

In [75]:
# download fasttext pretrained embeddings
import os
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"

if os.path.isdir(FILE):
    print("fastText exists.")
else:
    !wget -P $FILE $URL
    !unzip $FILE/crawl-300d-2M.vec.zip -d $FILE

'wget' 不是内部或外部命令，也不是可运行的程序
或批处理文件。
'unzip' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


In [2]:
# Create our own vocab
!pip install NLTK
import nltk
nltk.download('punkt')
sentence = "I love you, but also love NLP incredibly"

tokens = nltk.word_tokenize(sentence)
vocab=set(tokens)
vocab=list(vocab)





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrya\AppData\Roaming\nltk_data...
[nltk_data] Error downloading 'punkt' from
[nltk_data]     <https://raw.githubusercontent.com/nltk/nltk_data/gh-
[nltk_data]     pages/packages/tokenizers/punkt.zip>:   <urlopen error
[nltk_data]     [Errno 11001] getaddrinfo failed>
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrya\AppData\Roaming\nltk_data...




[nltk_data]   Unzipping tokenizers\punkt.zip.


In [None]:
#Create a dictionary of embeddings. Here we only create the embedding dic that appear in our own vocab. 
#We can also create an embedding dict that involves all the original words.
import torch
fname="fastText/crawl-300d-2M.vec"
fin=open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
n, d = map(int, fin.readline().split())
embedding_dic={}
from tqdm import tqdm_notebook

for line in tqdm_notebook(fin):
  tokens=line.rstrip().split(' ')
  if tokens[0] in vocab: 
    embedding_dic[tokens[0]]=torch.tensor(list(map(float, tokens[1:]))).unsqueeze(0)

In [None]:
embedding_dic

# Word2vec

In [None]:
!we need to download the GoogleNews-vectors-negative300.bin file from the website above, and then upload it into colab. 
!gunzip GoogleNews-vectors-negative300.bin

In [None]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format ('GoogleNews-vectors-negative300.bin', binary=True)
# if you vector file is in binary format, change to binary=True
sentence = ["London", "is", "the", "capital", "of", "Great", "Britain"]
vectors = [model[w] for w in sentence]

# Glove
* This zip file contains 4 files for 4 embedding representations.
* After unzipping the downloaded file we find four txt files: glove.6B.50d.txt, glove.6B.100d.txt, glove.6B.200d.txt, and glove.6B.300d.txt. As their filenames suggests, they have vectors with different dimensions.

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

In [None]:
import numpy as np
embedding_dic = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embedding_dic[word] = coefs
f.close()

In [None]:
embedding_dic['love']

# Load pretrained vectors to a torch nn.Embedding
* refer to the NLP course HW1 for detailed explanation.

In [70]:
# read vectors from Glove file: the vectors and indexes are passed into another object
def read_word_embeddings(embeddings_file: str):
    """
    Loads the given embeddings (ASCII-formatted) into a WordEmbeddings object. Augments this with an UNK embedding
    that is the 0 vector. Reads in all embeddings with no filtering -- you should only use this for relativized
    word embedding files.
    :param embeddings_file: path to the file containing embeddings
    :return: WordEmbeddings object reflecting the words and their embeddings
    """
    f = open(embeddings_file)
    word_indexer = Indexer()
    vectors = []
    # Make position 0 a PAD token, which can be useful if you
    word_indexer.add_and_get_index("PAD")
    # Make position 1 the UNK token
    word_indexer.add_and_get_index("UNK")
    for line in f:
        if line.strip() != "":
            space_idx = line.find(' ')
            word = line[:space_idx]
            numbers = line[space_idx+1:]
            float_numbers = [float(number_str) for number_str in numbers.split()]
            vector = np.array(float_numbers)
            word_indexer.add_and_get_index(word)
            # Append the PAD and UNK vectors to start. Have to do this weirdly because we need to read the first line
            # of the file to see what the embedding dim is
            if len(vectors) == 0:
                vectors.append(np.zeros(vector.shape[0]))
                vectors.append(np.zeros(vector.shape[0]))
            vectors.append(vector)
    f.close()
    print("Read in " + repr(len(word_indexer)) + " vectors of size " + repr(vectors[0].shape[0]))
    # Turn vectors into a 2-D numpy array
    return WordEmbeddings(word_indexer, np.array(vectors))

In [76]:
# define auxiliary function: the indexer
class Indexer(object):
    """
    Bijection between objects and integers starting at 0. Useful for mapping
    labels, features, etc. into coordinates of a vector space.

    Attributes:
        objs_to_ints
        ints_to_objs
    """
    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __repr__(self):
        return str([str(self.get_object(i)) for i in range(0, len(self))])

    def __str__(self):
        return self.__repr__()

    def __len__(self):
        return len(self.objs_to_ints)

    def get_object(self, index):
        """
        :param index: integer index to look up
        :return: Returns the object corresponding to the particular index or None if not found
        """
        if (index not in self.ints_to_objs):
            return None
        else:
            return self.ints_to_objs[index]

    def contains(self, object):
        """
        :param object: object to look up
        :return: Returns True if it is in the Indexer, False otherwise
        """
        return self.index_of(object) != -1

    def index_of(self, object):
        """
        :param object: object to look up
        :return: Returns -1 if the object isn't present, index otherwise
        """
        if (object not in self.objs_to_ints):
            return -1
        else:
            return self.objs_to_ints[object]

    def add_and_get_index(self, object, add=True):
        """
        Adds the object to the index if it isn't present, always returns a nonnegative index
        :param object: object to look up or add
        :param add: True by default, False if we shouldn't add the object. If False, equivalent to index_of.
        :return: The index of the object
        """
        if not add:
            return self.index_of(object)
        if (object not in self.objs_to_ints):
            new_idx = len(self.objs_to_ints)
            self.objs_to_ints[object] = new_idx
            self.ints_to_objs[new_idx] = object
        return self.objs_to_ints[object]

# define auxiliary function: the WordEmbeddings object: which can look up the index of a word and look up the embedding of a word
class WordEmbeddings:
    """
    Wraps an Indexer and a list of 1-D numpy arrays where each position in the list is the vector for the corresponding
    word in the indexer. The 0 vector is returned if an unknown word is queried.
    """
    def __init__(self, word_indexer, vectors):
        self.word_indexer = word_indexer
        self.vectors = vectors

    def get_initialized_embedding_layer(self):
        return torch.nn.Embedding.from_pretrained(torch.FloatTensor(self.vectors))

    def get_embedding_length(self):
        return len(self.vectors[0])

    def get_embedding(self, word):
        """
        Returns the embedding for a given word
        :param word: The word to look up
        :return: The UNK vector if the word is not in the Indexer or the vector otherwise
        """
        word_idx = self.word_indexer.index_of(word)
        if word_idx != -1:
            return self.vectors[word_idx]
        else:
            return self.vectors[self.word_indexer.index_of("UNK")]

In [None]:
# examples
word_embeddings=read_word_embeddings('path') # read embedding
word_embeddings.get_embedding('you')
word_embeddings.word_indexer.index_of('you')
embedding_layer=torch.nn.Embedding.from_pretrained(vectors) #load pre-trained embedding layer, the vectors must be torch.tensor type.