In [1]:
import pandas as pd
import nltk
nltk.download('words')
from tqdm import tqdm
tqdm.pandas()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Keerthan\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
import zipfile
zf = zipfile.ZipFile("../data/merged_finance.zip")
df = pd.read_csv(zf.open('merged_finance.csv'))

In [3]:
print(df)

             DATE                                            ARTICLE  \
0      2005-12-09  ['Guy Quaden: The National Bank of Belgium - a...   
1      2009-12-07  ['Guy Quaden: A changing IMF and World Bank\n'...   
2      2018-12-20  ['1.\n', '\n', 'Central Banks and money: an ev...   
3      2002-05-28  ['Guy Quaden: The Euro - a milestone on the pa...   
4      2002-05-28  ['Guy Quaden: The Euro - a milestone on the pa...   
...           ...                                                ...   
17023  2013-08-27  ['Zeti Akhtar Aziz: Grow your business – acces...   
17024  2010-01-28  ['Mohd Razif bin Abd Kadir: Islamic finance an...   
17025  2017-05-26  ['Encik Abdul Rasheed Ghaffour: Revolutionisin...   
17026  2014-05-07  ['Zeti Akhtar Aziz: Nurturing young talent in ...   
17027  2005-09-23  ['Datuk Zamani Abdul Ghani: Role of developmen...   

       Diff_VIX_1d  Diff_VIX_1w  Diff_VIX_2w   OPEN   HIGH    LOW  CLOSE  
0            11.69          NaN          NaN  11.91  12.20  

In [5]:
df.drop(columns = ["DATE", "Diff_VIX_1w", "Diff_VIX_2w", "OPEN", "HIGH", "LOW", "CLOSE"])

Unnamed: 0,ARTICLE,Diff_VIX_1d
0,['Guy Quaden: The National Bank of Belgium - a...,11.69
1,['Guy Quaden: A changing IMF and World Bank\n'...,22.10
2,"['1.\n', '\n', 'Central Banks and money: an ev...",28.38
3,['Guy Quaden: The Euro - a milestone on the pa...,20.31
4,['Guy Quaden: The Euro - a milestone on the pa...,20.31
...,...,...
17023,['Zeti Akhtar Aziz: Grow your business – acces...,16.77
17024,['Mohd Razif bin Abd Kadir: Islamic finance an...,23.73
17025,['Encik Abdul Rasheed Ghaffour: Revolutionisin...,9.81
17026,['Zeti Akhtar Aziz: Nurturing young talent in ...,13.40


In [6]:
df.rename(columns={"Diff_VIX_1d": "VIX_1day"}, inplace=True)

In [7]:
df["ARTICLE"].replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)

In [22]:
def clean_text(x):

    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in 'â€–?!.,â€™"#$%Ã©\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

In [10]:
df["ARTICLE"] = df["ARTICLE"].progress_apply(lambda x: clean_text(x))
sentences = df["ARTICLE"].apply(lambda x: x.split())

100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:06<00:00, 2623.77it/s]


In [11]:
# define regular expressions for numbers and links
num_regex = r'\d+' # matches one or more digits
link_regex = r'https?://\S+' # matches http or https followed by any non-space characters

# remove numbers and links from the text column
df['ARTICLE'] = df['ARTICLE'].apply(lambda x: re.sub(num_regex, '', x)) 
df['ARTICLE'] = df['ARTICLE'].apply(lambda x: re.sub(link_regex, '', x)) 

In [12]:
if 'http' not in df['ARTICLE'].values and 'https' not in df['ARTICLE'].values:
    print('URLs have been successfully removed')
else:
    print('URLs have not been removed')
print(df["ARTICLE"])

URLs have been successfully removed
0         Guy Quaden The National Bank of Belgium   a c...
1         Guy Quaden A changing IMF and World Bank   Sp...
2               Central Banks and money an everchanging...
3         Guy Quaden The Euro   a milestone on the path...
4         Guy Quaden The Euro   a milestone on the path...
                               ...                        
17023     Zeti Akhtar Aziz Grow your business  access t...
17024     Mohd Razif bin Abd Kadir Islamic finance and ...
17025     Encik Abdul Rasheed Ghaffour Revolutionising ...
17026     Zeti Akhtar Aziz Nurturing young talent in Ma...
17027     Datuk Zamani Abdul Ghani Role of development ...
Name: ARTICLE, Length: 17028, dtype: object


In [10]:
import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    # remove special characters and digits
    text = re.sub("(\\d|\\W)+"," ",text)
    # remove all url links
    text = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", text)
    text = re.sub('http://\S+|https://\S+', '', text)
    text = re.sub('http[s]?://\S+', '', text)
    text = re.sub(r"http\S+", "", text)
    
    return text

In [11]:
df["ARTICLE"] = [pre_process(i) for i in df["ARTICLE"]]

In [12]:
print(df["ARTICLE"])

0         guy quaden the national bank of belgium a cen...
1         guy quaden a changing imf and world bank spee...
2         central banks and money an everchanging inter...
3         guy quaden the euro a milestone on the path o...
4         guy quaden the euro a milestone on the path o...
                               ...                        
17023     zeti akhtar aziz grow your business access to...
17024     mohd razif bin abd kadir islamic finance and ...
17025     encik abdul rasheed ghaffour revolutionising ...
17026     zeti akhtar aziz nurturing young talent in ma...
17027     datuk zamani abdul ghani role of development ...
Name: ARTICLE, Length: 17028, dtype: object


In [13]:
words = set(nltk.corpus.words.words())

def clean_sent(sent):
    return " ".join(w for w in nltk.wordpunct_tokenize(sent) \
     if w.lower() in words or not w.isalpha())

df['ARTICLE'] = df['ARTICLE'].apply(clean_sent)

In [14]:
def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [15]:
sentences = df["ARTICLE"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:08<00:00, 1959.43it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:10<00:00, 1573.29it/s]


{'guy': 278, 'the': 3440085, 'national': 36097, 'bank': 187831, 'of': 1801753}


In [16]:
from gensim.models import KeyedVectors

news_path = '../../Embeddings/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [17]:
import operator

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    
    return sorted_x

In [18]:
oov = check_coverage(vocab,embeddings_index)

100%|██████████████████████████████████████████████████████████████████████████| 25773/25773 [00:04<00:00, 6034.35it/s]

Found embeddings for 94.27% of vocab
Found embeddings for  86.32% of all text





In [19]:
# top 10 out of vocabulary words with their frequency
oov[:10]

[('of', 1801753),
 ('to', 1442017),
 ('and', 1405911),
 ('a', 828324),
 ('labour', 20570),
 ('behaviour', 5956),
 ('covid', 2996),
 ('analyses', 2329),
 ('analyse', 1064),
 ('doesnt', 1008)]

In [20]:
sentences = df["ARTICLE"].progress_apply(lambda x: x.split())
to_remove = ['a','to','of','and']
sentences = [[word for word in sentence if not word in to_remove] for sentence in tqdm(sentences)]
vocab = build_vocab(sentences)

100%|███████████████████████████████████████████████████████████████████████████| 17028/17028 [00:25<00:00, 671.82it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:08<00:00, 1996.86it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:08<00:00, 1988.04it/s]


In [21]:
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 25769/25769 [00:00<00:00, 35977.32it/s]

Found embeddings for 94.29% of vocab
Found embeddings for  99.86% of all text





In [22]:
oov[:20]

[('labour', 20570),
 ('behaviour', 5956),
 ('covid', 2996),
 ('analyses', 2329),
 ('analyse', 1064),
 ('doesnt', 1008),
 ('defence', 959),
 ('learnt', 918),
 ('cheque', 622),
 ('didnt', 599),
 ('macao', 555),
 ('mimeo', 442),
 ('channelled', 348),
 ('shouldnt', 347),
 ('wasnt', 291),
 ('channelling', 256),
 ('sabine', 251),
 ('elb', 232),
 ('resolvability', 232),
 ('kiley', 224)]

In [23]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'harbour': 'harbor',
                'flavour': 'flavor',
                'wasnt': 'was not',
                'labour': 'labor',
                'learnt':'learn',
                'defence': 'defense',
                'analyses': 'analysis',
                'behaviour': 'behavior',
                'travelled': 'traveled',
                'channelled': 'channeled',
                'channelling': 'channeling',
                'labour': 'labor',
                'analyse': 'analize',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'
               }

mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

In [24]:
df["ARTICLE"] = df["ARTICLE"].progress_apply(lambda x: replace_typical_misspell(x))
sentences = df["ARTICLE"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

100%|███████████████████████████████████████████████████████████████████████████| 17028/17028 [00:27<00:00, 618.98it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:06<00:00, 2582.26it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:09<00:00, 1724.48it/s]

{'guy': 278, 'the': 3440085, 'national': 36097, 'bank': 187831, 'of': 1801753}





In [25]:
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 25757/25757 [00:01<00:00, 17189.31it/s]

Found embeddings for 94.33% of vocab
Found embeddings for  86.41% of all text





In [26]:
oov[:20]

[('of', 1801753),
 ('to', 1442017),
 ('and', 1405911),
 ('a', 828324),
 ('covid', 2996),
 ('cheque', 622),
 ('macao', 555),
 ('mimeo', 442),
 ('sabine', 251),
 ('elb', 232),
 ('resolvability', 232),
 ('kiley', 224),
 ('grey', 173),
 ('exter', 143),
 ('aluminium', 131),
 ('rix', 125),
 ('berne', 124),
 ('saron', 123),
 ('paolo', 116),
 ('pank', 108)]

In [27]:
df["ARTICLE"] = df["ARTICLE"].progress_apply(lambda x: replace_typical_misspell(x))
sentences = df["ARTICLE"].progress_apply(lambda x: x.split())
to_remove = ['a','to','of','and']
sentences = [[word for word in sentence if not word in to_remove] for sentence in tqdm(sentences)]
vocab = build_vocab(sentences)

100%|███████████████████████████████████████████████████████████████████████████| 17028/17028 [00:23<00:00, 734.45it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:04<00:00, 3531.03it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:06<00:00, 2588.63it/s]
100%|██████████████████████████████████████████████████████████████████████████| 17028/17028 [00:07<00:00, 2141.40it/s]


In [28]:
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 25753/25753 [00:01<00:00, 24891.27it/s]

Found embeddings for 94.35% of vocab
Found embeddings for  99.96% of all text





In [29]:
# top 20 out of vocabulary words with their frequency
oov[:20]

[('covid', 2996),
 ('cheque', 622),
 ('macao', 555),
 ('mimeo', 442),
 ('sabine', 251),
 ('elb', 232),
 ('resolvability', 232),
 ('kiley', 224),
 ('grey', 173),
 ('exter', 143),
 ('aluminium', 131),
 ('rix', 125),
 ('berne', 124),
 ('saron', 123),
 ('paolo', 116),
 ('pank', 108),
 ('enquiry', 99),
 ('whiteside', 98),
 ('faust', 93),
 ('schnabel', 86)]