In [None]:
import logging
import math
import os
import re
import zipfile
from collections import Counter
from contextlib import contextmanager
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

log = logging.getLogger(__name__)


def maybe_download(url, filename=None, work_directory=".", expected_bytes=None):
    """Download a file if it is not already downloaded.

    Args:
        url (str): URL of the file to download.
        filename (str): File name. Defaults to the last "/"-separated segment of `url`.
        work_directory (str): Working directory; created if it does not exist.
        expected_bytes (int): Expected file size in bytes. When given, the size of the
            file on disk is checked, whether it was just downloaded or already present.

    Returns:
        str: File path of the file downloaded.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error status.
        IOError: If the file size differs from `expected_bytes` (the file is removed first).
    """
    if filename is None:
        filename = url.split("/")[-1]
    os.makedirs(work_directory, exist_ok=True)
    filepath = os.path.join(work_directory, filename)
    if not os.path.exists(filepath):
        block_size = 1024
        # Stream into a side file and move it into place only after a complete
        # download, so an interrupted transfer is never mistaken for a finished
        # file by the `os.path.exists` check on the next call.
        partial_path = filepath + ".part"
        try:
            # The `with` block releases the connection even if writing fails.
            with requests.get(url, stream=True) as r:
                # Without this, a 404/500 error page would be stored as the data file.
                r.raise_for_status()
                total_size = int(r.headers.get("content-length", 0))
                num_iterables = math.ceil(total_size / block_size)

                with open(partial_path, "wb") as file:
                    for data in tqdm(
                        r.iter_content(block_size),
                        total=num_iterables,
                        unit="KB",
                        unit_scale=True,
                    ):
                        file.write(data)
            os.replace(partial_path, filepath)
        finally:
            # Only present here if the download did not complete.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    else:
        log.info("File {} already downloaded".format(filepath))
    if expected_bytes is not None:
        statinfo = os.stat(filepath)
        if statinfo.st_size != expected_bytes:
            # Drop the bad file so that a later call downloads it again.
            os.remove(filepath)
            raise IOError("Failed to verify {}".format(filepath))

    return filepath



@contextmanager
def download_path(path=None):
    """Context manager that provides a directory to download data into.

    If `path=None`, a temporary directory is created, yielded, and deleted when the
    `with` block exits (also when the block raises). Otherwise the real path of the
    input is yielded and nothing is deleted.

    Args:
        path (str): Path to download data.

    Returns:
        str: Real path where the data is stored.

    Examples:
        >>> with download_path() as path:
        ...     maybe_download(url="http://example.com/file.zip", work_directory=path)
    """
    if path is None:
        # TemporaryDirectory's own context manager yields the directory name and
        # removes the directory on exit.
        with TemporaryDirectory() as tmp_dir:
            yield tmp_dir
    else:
        yield os.path.realpath(path)


def unzip_file(zip_src, dst_dir, clean_zip_file=True):
    """Unzip a file.

    Args:
        zip_src (str): Zip file.
        dst_dir (str): Destination folder.
        clean_zip_file (bool): Whether or not to delete the zip file after extraction.
    """
    # Close the archive before it is possibly deleted below; an open handle is a
    # resource leak and prevents removal of the file on some platforms.
    with zipfile.ZipFile(zip_src, "r") as archive:
        archive.extractall(dst_dir)
    if clean_zip_file:
        os.remove(zip_src)


In [None]:
def download_and_extract_globe(dest_path):
    """Download the GloVe 6B archive into `dest_path` and extract it.

    Args:
        dest_path (str): Folder that receives the downloaded zip file.

    Returns:
        str: Path of the "glove" sub-folder of `dest_path` holding the extracted files.
    """
    archive_path = maybe_download(
        work_directory=dest_path,
        url="http://nlp.stanford.edu/data/glove.6B.zip",
    )
    extract_dir = os.path.join(dest_path, "glove")
    # The zip file is kept after extraction.
    unzip_file(archive_path, extract_dir, clean_zip_file=False)
    return extract_dir

In [None]:
def load_glove_matrix(path_emb, word_dict, word_embedding_dim):
    """Load the pretrained GloVe vectors of the words in `word_dict`.

    Args:
        path_emb (str): Folder containing the file `glove.6B.{word_embedding_dim}d.txt`.
        word_dict (dict): Maps each word to the row index it occupies in the matrix.
        word_embedding_dim (int): Dimension of the word embedding vectors.

    Returns:
        numpy.ndarray, list: Embedding matrix with `len(word_dict) + 1` rows and
        `word_embedding_dim` columns (rows of words missing from the GloVe file stay
        zero), and the list of words of `word_dict` that were found in the GloVe file.
    """
    embedding_matrix = np.zeros((len(word_dict) + 1, word_embedding_dim))
    exist_word = []

    glove_file = os.path.join(path_emb, f"glove.6B.{word_embedding_dim}d.txt")
    with open(glove_file, "rb") as f:
        for line in tqdm(f):
            fields = line.split()
            if not fields:
                # Blank line: there is no token to read, so skip it before indexing.
                continue
            word = fields[0].decode()
            if word in word_dict:
                # float() accepts the raw bytes tokens directly.
                embedding_matrix[word_dict[word]] = np.array([float(x) for x in fields[1:]])
                exist_word.append(word)

    return embedding_matrix, exist_word

In [None]:
# `download_mind` is defined outside this view; presumably it returns the paths of
# the train and validation zip files — TODO confirm.
train_zip, valid_zip = download_mind(size=mind_type, dest_path=data_path)

# Unpack each archive into its own sub-folder of `data_path`; with unzip_file's
# default `clean_zip_file=True` the zip files are deleted afterwards.
for split_name, split_zip in (('train', train_zip), ('valid', valid_zip)):
    unzip_file(split_zip, os.path.join(data_path, split_name))

# Folder that receives the files generated by this notebook.
output_path = os.path.join(data_path, 'utils')
os.makedirs(output_path, exist_ok=True)

In [None]:
word_cnt = Counter()
word_cnt_all = Counter()  # not used in this cell; kept in case other cells read it
news = pd.read_table(os.path.join(data_path, 'train', 'news.tsv'),
                     names=['newid', 'vertical', 'subvertical', 'title',
                            'abstract', 'url', 'entities in title', 'entities in abstract'],
                     usecols = ['vertical', 'subvertical', 'title', 'abstract'])

# Count the words of every title. Counter.update() on a raw string would count
# single characters, so each title is split into tokens first (runs of word
# characters, plus a few punctuation marks as tokens of their own). Titles are
# lower-cased because the glove.6B vectors are published as uncased.
# NOTE(review): any code that later maps titles to indices through `word_dict`
# has to tokenise the same way — confirm against the consumers of `word_dict`.
title_token_pattern = re.compile(r"[\w]+|[.,!?;|]")
for title in tqdm(news['title']):
    # Missing titles are read as NaN (a float) and carry no words.
    if isinstance(title, str):
        word_cnt.update(title_token_pattern.findall(title.lower()))

# Indices start at 1; row 0 of the embedding matrix is therefore never assigned to
# a word (presumably reserved for padding — TODO confirm).
word_dict = {word: index for index, word in enumerate(word_cnt, start=1)}

# NOTE(review): this rebinds `data_path` to a fresh temporary folder, while
# `output_path` still points below the previous location — confirm this is intended.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name
glove_path = download_and_extract_globe(data_path)
embedding_matrix, exist_word = load_glove_matrix(glove_path, word_dict, word_embedding_dim)

In [None]:
# Store the embedding matrix in the output folder.
embedding_file = os.path.join(output_path, 'embedding.npy')
np.save(embedding_file, embedding_matrix)