In [1]:
import pandas as pd
import re

In [2]:
from tqdm import tqdm
tqdm.pandas()

In [3]:
import zipfile
zf = zipfile.ZipFile("../data/merged_finance.zip")
df = pd.read_csv(zf.open('merged_finance.csv'))

In [4]:
print(df)

             DATE                                            ARTICLE  \
0      2005-12-09  ['Guy Quaden: The National Bank of Belgium - a...   
1      2009-12-07  ['Guy Quaden: A changing IMF and World Bank\n'...   
2      2018-12-20  ['1.\n', '\n', 'Central Banks and money: an ev...   
3      2002-05-28  ['Guy Quaden: The Euro - a milestone on the pa...   
4      2002-05-28  ['Guy Quaden: The Euro - a milestone on the pa...   
...           ...                                                ...   
17023  2013-08-27  ['Zeti Akhtar Aziz: Grow your business – acces...   
17024  2010-01-28  ['Mohd Razif bin Abd Kadir: Islamic finance an...   
17025  2017-05-26  ['Encik Abdul Rasheed Ghaffour: Revolutionisin...   
17026  2014-05-07  ['Zeti Akhtar Aziz: Nurturing young talent in ...   
17027  2005-09-23  ['Datuk Zamani Abdul Ghani: Role of developmen...   

       Diff_VIX_1d  Diff_VIX_1w  Diff_VIX_2w   OPEN   HIGH    LOW  CLOSE  
0            11.69          NaN          NaN  11.91  12.20  

In [5]:
df.drop(columns = ["DATE", "Diff_VIX_1w", "Diff_VIX_2w", "OPEN", "HIGH", "LOW", "CLOSE"])

Unnamed: 0,ARTICLE,Diff_VIX_1d
0,['Guy Quaden: The National Bank of Belgium - a...,11.69
1,['Guy Quaden: A changing IMF and World Bank\n'...,22.10
2,"['1.\n', '\n', 'Central Banks and money: an ev...",28.38
3,['Guy Quaden: The Euro - a milestone on the pa...,20.31
4,['Guy Quaden: The Euro - a milestone on the pa...,20.31
...,...,...
17023,['Zeti Akhtar Aziz: Grow your business – acces...,16.77
17024,['Mohd Razif bin Abd Kadir: Islamic finance an...,23.73
17025,['Encik Abdul Rasheed Ghaffour: Revolutionisin...,9.81
17026,['Zeti Akhtar Aziz: Nurturing young talent in ...,13.40


In [6]:
df.rename(columns={"Diff_VIX_1d": "VIX_1day"}, inplace=True)

In [7]:
df["ARTICLE"].replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)

In [9]:
def clean_text(x):

    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in 'â€–?!.,â€™"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

In [10]:
df["ARTICLE"] = df["ARTICLE"].progress_apply(lambda x: clean_text(x))
sentences = df["ARTICLE"].apply(lambda x: x.split())

100%|███████████████████████████████████████████████████████████████████████████| 17028/17028 [00:17<00:00, 973.19it/s]


In [11]:
# define regular expressions for numbers and links
num_regex = r'\d+' # matches one or more digits
link_regex = r'https?://\S+' # matches http or https followed by any non-space characters

# remove numbers and links from the text column
df['ARTICLE'] = df['ARTICLE'].apply(lambda x: re.sub(num_regex, '', x)) 
df['ARTICLE'] = df['ARTICLE'].apply(lambda x: re.sub(link_regex, '', x)) 

In [12]:
if 'http' not in df['ARTICLE'].values and 'https' not in df['ARTICLE'].values:
    print('URLs have been successfully removed')
else:
    print('URLs have not been removed')
print(df["ARTICLE"])

URLs have been successfully removed
0         Guy Quaden The National Bank of Belgium   a c...
1         Guy Quaden A changing IMF and World Bank   Sp...
2               Central Banks and money an everchanging...
3         Guy Quaden The Euro   a milestone on the path...
4         Guy Quaden The Euro   a milestone on the path...
                               ...                        
17023     Zeti Akhtar Aziz Grow your business  access t...
17024     Mohd Razif bin Abd Kadir Islamic finance and ...
17025     Encik Abdul Rasheed Ghaffour Revolutionising ...
17026     Zeti Akhtar Aziz Nurturing young talent in Ma...
17027     Datuk Zamani Abdul Ghani Role of development ...
Name: ARTICLE, Length: 17028, dtype: object


In [13]:
def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [14]:
sentences = df["ARTICLE"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:10<00:00, 1597.99it/s]
100%|███████████████████████████████████████████████████████████████████████████| 17028/17028 [00:18<00:00, 931.89it/s]


{'Guy': 263, 'Quaden': 86, 'The': 314640, 'National': 10497, 'Bank': 114748}


In [15]:
from gensim.models import KeyedVectors

news_path = '../../../Embeddings/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [16]:
import operator

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    
    return sorted_x

In [17]:
oov = check_coverage(vocab,embeddings_index)

100%|██████████████████████████████████████████████████████████████████████| 156872/156872 [00:00<00:00, 246143.95it/s]

Found embeddings for 61.24% of vocab
Found embeddings for  87.95% of all text





In [18]:
oov[:10]

[('of', 1794157),
 ('to', 1419879),
 ('and', 1373341),
 ('a', 783292),
 ('labour', 19189),
 ('programme', 7113),
 ('behaviour', 5859),
 ('ufb', 5425),
 ('favourable', 4868),
 ('xcThe', 4136)]

In [19]:
sentences = df["ARTICLE"].progress_apply(lambda x: x.split())
to_remove = ['a','to','of','and']
sentences = [[word for word in sentence if not word in to_remove] for sentence in tqdm(sentences)]
vocab = build_vocab(sentences)

100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:16<00:00, 1059.94it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:11<00:00, 1493.64it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:16<00:00, 1019.55it/s]


In [20]:
oov = check_coverage(vocab,embeddings_index)

100%|██████████████████████████████████████████████████████████████████████| 156868/156868 [00:00<00:00, 264193.43it/s]


Found embeddings for 61.24% of vocab
Found embeddings for  99.06% of all text


In [21]:
oov[:20]

[('labour', 19189),
 ('programme', 7113),
 ('behaviour', 5859),
 ('ufb', 5425),
 ('favourable', 4868),
 ('xcThe', 4136),
 ('globalisation', 4110),
 ('centre', 3197),
 ('wwwbankofenglandcouk', 3184),
 ('programmes', 2845),
 ('utilisation', 2689),
 ('COVID', 2445),
 ('Brexit', 2334),
 ('liberalisation', 2266),
 ('analyses', 2250),
 ('QQE', 2134),
 ('Riksbanks', 2054),
 ('recognised', 2010),
 ('stabilisation', 1988),
 ('recognise', 1977)]