In [2]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [6]:
import zipfile
# Read the CSV member straight out of the archive. Context managers make sure
# both the zip handle and the member stream are closed once parsing is done.
with zipfile.ZipFile("../data/merged_finance.zip") as zf:
    with zf.open('merged_finance.csv') as csv_member:
        df = pd.read_csv(csv_member)

In [7]:
# Quick look at the freshly loaded frame (pandas truncates the printed view).
print(df)

             DATE                                            ARTICLE  \
0      2005-12-09  ['Guy Quaden: The National Bank of Belgium - a...   
1      2009-12-07  ['Guy Quaden: A changing IMF and World Bank\n'...   
2      2018-12-20  ['1.\n', '\n', 'Central Banks and money: an ev...   
3      2002-05-28  ['Guy Quaden: The Euro - a milestone on the pa...   
4      2002-05-28  ['Guy Quaden: The Euro - a milestone on the pa...   
...           ...                                                ...   
17023  2013-08-27  ['Zeti Akhtar Aziz: Grow your business – acces...   
17024  2010-01-28  ['Mohd Razif bin Abd Kadir: Islamic finance an...   
17025  2017-05-26  ['Encik Abdul Rasheed Ghaffour: Revolutionisin...   
17026  2014-05-07  ['Zeti Akhtar Aziz: Nurturing young talent in ...   
17027  2005-09-23  ['Datuk Zamani Abdul Ghani: Role of developmen...   

       Diff_VIX_1d  Diff_VIX_1w  Diff_VIX_2w   OPEN   HIGH    LOW  CLOSE  
0            11.69          NaN          NaN  11.91  12.20  

In [4]:
# NOTE(review): DataFrame.drop returns a new frame and the result is not
# assigned here, so this cell only *displays* the reduced view -- `df` itself
# still carries every column afterwards. If the columns are meant to be
# removed for good, assign the result back. Confirm intent before changing.
df.drop(columns = ["DATE", "Diff_VIX_1w", "Diff_VIX_2w", "OPEN", "HIGH", "LOW", "CLOSE"])

Unnamed: 0,ARTICLE,Diff_VIX_1d
0,['Guy Quaden: The National Bank of Belgium - a...,11.69
1,['Guy Quaden: A changing IMF and World Bank\n'...,22.10
2,"['1.\n', '\n', 'Central Banks and money: an ev...",28.38
3,['Guy Quaden: The Euro - a milestone on the pa...,20.31
4,['Guy Quaden: The Euro - a milestone on the pa...,20.31
...,...,...
17023,['Zeti Akhtar Aziz: Grow your business – acces...,16.77
17024,['Mohd Razif bin Abd Kadir: Islamic finance an...,23.73
17025,['Encik Abdul Rasheed Ghaffour: Revolutionisin...,9.81
17026,['Zeti Akhtar Aziz: Nurturing young talent in ...,13.40


In [5]:
# Give the one-day VIX column a shorter name; rebind instead of mutating in place.
df = df.rename(columns={"Diff_VIX_1d": "VIX_1day"})

In [6]:
# Remove tab / newline / carriage-return markers from the article text, both
# the escaped two-character forms (backslash + letter) and the real control
# characters.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `df["ARTICLE"]` is a chained in-place operation that warns on recent
# pandas and leaves `df` unchanged once Copy-on-Write is active.
df["ARTICLE"] = df["ARTICLE"].replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True)

In [7]:
def clean_text(x):
    """
    Normalise punctuation in a piece of text.

    The input is converted with ``str()`` first. Then, character by character:
    ``/``, ``-`` and ``'`` become a single space, ``&`` is padded with a space
    on each side, and the remaining listed punctuation (including the curly
    quotes) is removed. Every other character is kept unchanged.

    :param x: value to clean (anything accepted by ``str()``)
    :return: cleaned string
    """
    # Start with the characters that are deleted outright ...
    table = {ord(ch): None for ch in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'}
    # ... then let the "turn into a space" rule win for the characters that
    # appear in both lists.
    table.update({ord(ch): ' ' for ch in "/-'"})
    table[ord('&')] = ' & '
    return str(x).translate(table)

In [8]:
# Clean every article (with a progress bar), then split each one on whitespace.
df["ARTICLE"] = df["ARTICLE"].progress_apply(clean_text)
sentences = df["ARTICLE"].apply(str.split)

100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:06<00:00, 2623.48it/s]


In [9]:
# Inspect the article text after cleaning.
print(df["ARTICLE"])

0         Guy Quaden The National Bank of Belgium   a c...
1         Guy Quaden A changing IMF and World Bank   Sp...
2         1      Central Banks and money an everchangin...
3         Guy Quaden The Euro   a milestone on the path...
4         Guy Quaden The Euro   a milestone on the path...
                               ...                        
17023     Zeti Akhtar Aziz Grow your business – access ...
17024     Mohd Razif bin Abd Kadir Islamic finance and ...
17025     Encik Abdul Rasheed Ghaffour Revolutionising ...
17026     Zeti Akhtar Aziz Nurturing young talent in Ma...
17027     Datuk Zamani Abdul Ghani Role of development ...
Name: ARTICLE, Length: 17028, dtype: object


In [10]:
def build_vocab(sentences, verbose =  True):
    """
    Count how often each word occurs across all sentences.

    :param sentences: list of list of words
    :param verbose: when False the tqdm progress bar is disabled
    :return: dictionary of words and their count
    """
    counts = {}
    for tokens in tqdm(sentences, disable = (not verbose)):
        for token in tokens:
            # First sighting starts from zero; later ones increment.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [11]:
# Tokenise the cleaned articles, count every token, and peek at the first
# five vocabulary entries.
sentences = df["ARTICLE"].progress_apply(str.split).values
vocab = build_vocab(sentences)
print({word: count for word, count in list(vocab.items())[:5]})

100%|███████████████████████████████████████████████████████████████████████████| 17028/17028 [00:56<00:00, 300.60it/s]
100%|███████████████████████████████████████████████████████████████████████████| 17028/17028 [00:20<00:00, 848.18it/s]

{'Guy': 263, 'Quaden': 86, 'The': 314511, 'National': 10496, 'Bank': 114702}





In [12]:
from gensim.models import KeyedVectors

# Path to the pretrained vectors (presumably the 300-dimensional GoogleNews
# word2vec model, judging by the file name -- confirm).
news_path = '../Embeddings/GoogleNews-vectors-negative300.bin'
# Load the vectors from the binary word2vec format into a gensim KeyedVectors object.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [13]:
import operator

def check_coverage(vocab,embeddings_index):
    """
    Report how much of a vocabulary is covered by an embedding index.

    Prints two figures: the share of distinct words for which
    ``embeddings_index[word]`` succeeds, and the share of all word
    occurrences (weighted by the counts in ``vocab``) that those words
    account for. Both are reported as 0.00% when ``vocab`` is empty.

    :param vocab: dictionary mapping word -> occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]`` that
        raises ``KeyError`` for unknown words
    :return: list of ``(word, count)`` tuples for the words without an
        embedding, most frequent first
    """
    oov = {}
    known_words = 0
    known_occurrences = 0
    oov_occurrences = 0
    for word in tqdm(vocab):
        try:
            # Only the success of the lookup matters; the vector is not kept.
            _ = embeddings_index[word]
        except KeyError:
            # Catch the "missing key" case only, so real failures and
            # KeyboardInterrupt still surface instead of being counted as OOV.
            oov[word] = vocab[word]
            oov_occurrences += vocab[word]
        else:
            known_words += 1
            known_occurrences += vocab[word]

    total_words = len(vocab)
    total_occurrences = known_occurrences + oov_occurrences
    # Guard the ratios so an empty vocabulary does not raise ZeroDivisionError.
    vocab_share = known_words / total_words if total_words else 0.0
    text_share = known_occurrences / total_occurrences if total_occurrences else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    # Ascending stable sort followed by a reversal: descending by count, with
    # ties in reverse insertion order (kept so the ordering does not change).
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [14]:
# Measure embedding coverage of the vocabulary and keep the uncovered words.
oov = check_coverage(vocab,embeddings_index)

100%|████████████████████████████████████████████████████████████████████████| 201429/201429 [00:27<00:00, 7394.42it/s]

Found embeddings for 47.94% of vocab
Found embeddings for  86.20% of all text





In [15]:
# First ten out-of-vocabulary words (the list is ordered by descending count).
oov[:10]

[('of', 1794135),
 ('to', 1419839),
 ('and', 1373294),
 ('a', 782094),
 ('–', 99518),
 ('10', 24330),
 ('x0c', 20470),
 ('2008', 20051),
 ('labour', 19186),
 ('2010', 18939)]

In [16]:
# Re-tokenise the articles, drop the listed frequent words, and rebuild the
# vocabulary from what is left.
sentences = df["ARTICLE"].progress_apply(str.split)
to_remove = ['a','to','of','and']
sentences = [
    [token for token in tokens if token not in to_remove]
    for tokens in tqdm(sentences)
]
vocab = build_vocab(sentences)

100%|███████████████████████████████████████████████████████████████████████████| 17028/17028 [02:07<00:00, 133.63it/s]
100%|███████████████████████████████████████████████████████████████████████████| 17028/17028 [00:42<00:00, 403.94it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:14<00:00, 1178.90it/s]


In [17]:
# Re-measure embedding coverage against the current vocabulary.
oov = check_coverage(vocab,embeddings_index)

100%|███████████████████████████████████████████████████████████████████████| 201425/201425 [00:11<00:00, 18301.56it/s]


Found embeddings for 47.94% of vocab
Found embeddings for  96.78% of all text


In [18]:
# First twenty out-of-vocabulary words (the list is ordered by descending count).
oov[:20]

[('–', 99518),
 ('10', 24330),
 ('x0c', 20470),
 ('2008', 20051),
 ('labour', 19186),
 ('2010', 18939),
 ('2009', 18588),
 ('2007', 17258),
 ('20', 15507),
 ('15', 15048),
 ('12', 14483),
 ('2000', 13604),
 ('11', 13389),
 ('2006', 12803),
 ('2013', 12568),
 ('2014', 12317),
 ('2015', 11983),
 ('2011', 11693),
 ('2012', 11653),
 ('2001', 11554)]