# GloVe: Global Vectors for Word Representation
About the paper: GloVe is an unsupervised learning algorithm for obtaining vector representations of words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.

Used libraries: 
- nltk (the Natural Language Toolkit) - text/word manipulation; we use its tokenizers to split large texts into sentences and to tokenize words.
- Keras - an open-source deep-learning library; its Tokenizer makes a pass through the entire corpus to collect word statistics and index the words, and its layers are used to build the model.
- TensorFlow - a free and open-source machine-learning library that Keras runs on top of.



In [694]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time
from IPython.core.debugger import set_trace

import nltk
from nltk import sent_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize




# Preparing dataset
We took the data for this project from Wikipedia. The text from the first txt file is split into a list in which each item contains one sentence. The txt file with the list of sentences is then converted to a CSV file.

In [667]:
# Split the raw Wikipedia text into sentences and store one sentence per line.
# Context managers guarantee that both files are closed (the input handle was
# previously left open), and UTF-8 is requested explicitly so the result does
# not depend on the platform's default encoding.
with open("dataset/astana_wiki.txt", encoding="utf-8") as fp:
    data = fp.read()
a_list = nltk.tokenize.sent_tokenize(data)
len_a_list = len(a_list)
# NOTE(review): mode "a" appends, so re-running this cell adds the same
# sentences to the output file again -- confirm whether "w" was intended.
with open("dataset/astana_wiki_splitted.txt", "a", encoding="utf-8") as f:
    f.writelines(sentence + "\n" for sentence in a_list)

In [668]:
# Load the one-sentence-per-line file as a fixed-width table, tag every row
# with the constant label "1", and persist the result as CSV.
df = pd.read_fwf("dataset/astana_wiki_splitted.txt")
df["target"] = "1"
df.to_csv("dataset/astana_csv.csv")

# Read back the CSV that was just written, plus the separate wiki.csv file.
train = pd.read_csv("dataset/astana_csv.csv")
test = pd.read_csv("dataset/wiki.csv")

- Cleaning the text of all symbols the program doesn't recognize, changing capital letters to lowercase, and removing periods and commas.


In [670]:
import re


def remove_URL(text):
    """Return *text* with ``http(s)://`` and ``www.`` links stripped out.

    A link is taken to extend up to the next whitespace character.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)


def remove_html(text):
    """Return *text* with every ``<...>`` tag removed (shortest match, single line)."""
    return re.sub(r"<.*?>", "", text)
# Compiled once at import time.  The former catch-all range
# U+24C2..U+1F251 covered most of the Basic Multilingual Plane and therefore
# also deleted ordinary text such as CJK ideographs and Hangul; it is replaced
# by the narrow symbol ranges below.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator letters)
    "\U00002702-\U000027B0"  # dingbats
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U000024C2"             # circled latin capital letter M
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplements
    "]+"
)


def remove_emoji(string):
    """Return *string* with emoji and pictographic symbols removed.

    Only the code-point ranges listed in ``_EMOJI_PATTERN`` are deleted; all
    other characters, including non-Latin scripts, are left untouched.

    Args:
        string: Text to clean.

    Returns:
        A copy of *string* without the matched symbols.
    """
    return _EMOJI_PATTERN.sub("", string)
import string


def remove_punct(text):
    """Return *text* with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    delete_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(delete_punctuation)
# Run the cleaners over the text column one after another, in this order:
# URLs, HTML tags, emoji, then punctuation.
for _cleaner in (remove_URL, remove_html, remove_emoji, remove_punct):
    train["text"] = train["text"].map(_cleaner)

from nltk.corpus import stopwords

# NLTK's English stop-word list, held as a set; remove_stopwords tests each
# token for membership in it.
stop = set(stopwords.words("english"))


def remove_stopwords(text):
    """Lower-case *text* and drop every token that is found in ``stop``.

    Tokens are the whitespace-separated pieces of *text*; the survivors are
    re-joined with single spaces.
    """
    lowered_tokens = (token.lower() for token in text.split())
    return " ".join(token for token in lowered_tokens if token not in stop)
# Lower-case every row of the text column and strip the stop words from it.
train["text"] = train["text"].map(remove_stopwords)




def get_clean_text(x):
    """Reduce a piece of text to lower-case ASCII letters and spaces.

    Steps, in order: remove e-mail addresses, remove http/https/ftp URLs,
    remove stand-alone ``RT`` retweet markers, lower-case the text, and
    finally delete every character that is not an ASCII letter or a space.

    Args:
        x: Value to clean.

    Returns:
        The cleaned string when *x* is a ``str``; any other value (for
        example ``None`` or a float) is returned unchanged.
    """
    if not isinstance(x, str):
        return x
    # Remove e-mail addresses.
    x = re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x)
    # Remove URLs.  IGNORECASE keeps upper-case schemes such as "HTTP://"
    # matching now that lower-casing happens further down.
    x = re.sub(
        r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
        '',
        x,
        flags=re.IGNORECASE,
    )
    # Remove the 'RT' retweet tag.  This has to run before lower-casing:
    # searching for upper-case 'RT' in already lower-cased text never matches.
    # Word boundaries keep words that merely contain "RT" intact.
    x = re.sub(r'\bRT\b', '', x)
    x = x.lower()
    # Keep ASCII letters and spaces only.
    x = re.sub('[^A-Z a-z]+', '', x)
    return x
    
 

# Final normalisation pass over the text column, then display the column.
train["text"] = train["text"].map(get_clean_text)
train.text


0                      nursultan capital city kazakhstan
1      known   astana akmola renamed nursultan march ...
2      city lies banks ishim river northcentral part ...
3       official estimate reported population  within...
4      akmola became capital kazakhstan  since grown ...
                             ...                        
621                                                     
622     nursultan attracted three trillion tenge us b...
623    growth achieved due large number construction ...
624    tourism becomes one factors drive economic gro...
625    nursultan among top ten attractive tourist cit...
Name: text, Length: 626, dtype: object

# Tokenizer
Every unique word in the text is assigned a number. The text is then represented as an array of those numbers.

In [671]:
from nltk.tokenize import word_tokenize
import nltk


def create_corpus_tk(df):
    """Tokenise the ``text`` column of *df* into lists of lower-cased words.

    Args:
        df: Table with a ``text`` column of strings.

    Returns:
        One list of tokens per row of *df*, in row order.
    """
    # Read from the argument: the previous version ignored ``df`` and always
    # iterated over the global ``train`` frame.
    return [
        [word.lower() for word in word_tokenize(text)]
        for text in df["text"]
    ]
corpus = create_corpus_tk(train)
# NOTE: despite its name, this is the number of entries in ``corpus`` (one
# token list per text row), not a count of distinct words.
num_words = len(corpus)
print(num_words)

626


In [672]:
# Show the token list of the first row.
corpus[0]

['nursultan', 'capital', 'city', 'kazakhstan']

In [673]:
# 80/20 split by row position: the leading rows are used for fitting, the
# remaining rows are held out.
train_size = int(len(train) * 0.8)

train_sentences, test_sentences = train.text[:train_size], train.text[train_size:]
train_labels, test_labels = train.target[:train_size], train.target[train_size:]

In [674]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Length passed as ``maxlen`` to every pad_sequences call.
max_len = 10
# NOTE(review): num_words was assigned len(corpus) (a row count) earlier, yet
# the Tokenizer takes it as its vocabulary-size limit -- confirm this is
# intended.
tokenizer = Tokenizer(num_words=num_words)
# The vocabulary is fitted on the training split only.
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_sequences

[[1, 3, 2, 4],
 [79, 11, 7, 5, 1, 10, 80, 81, 6, 16, 1, 17],
 [2, 82, 83, 84, 85, 86, 87, 4, 34, 7, 35, 88, 89, 2, 19, 36, 90, 91, 35],
 [92, 93, 94, 95, 34, 2, 96, 97, 98, 2, 99, 42, 43, 3],
 [7, 18, 3, 4, 44, 100, 101, 102, 45, 37, 20, 46, 47],
 [10, 38, 103, 104, 105, 39, 2, 5, 1, 106, 4, 16, 1, 17],
 [37, 1, 40, 2, 38, 107, 40, 108],
 [18, 3, 4, 2, 109, 110, 111],
 [12, 112, 113, 114, 115, 116, 117],
 [118, 41, 4, 1, 119, 39, 120, 121, 122, 123, 124, 125, 126, 48, 41, 127, 128],
 [129, 130, 131, 132, 133, 134, 135],
 [136, 137, 75, 138, 76],
 [21, 139, 49, 50, 140, 141, 51, 142, 143, 144, 2, 145, 146, 147, 148],
 [149, 150, 36, 5, 151],
 [152, 153, 10, 5, 52, 33, 154],
 [2, 53, 54, 155, 156, 157, 158, 22, 53, 54, 159],
 [6, 160, 52, 161, 162, 163, 21, 7],
 [164, 7, 165, 42, 3, 4],
 [166, 5, 11, 51, 3, 2, 6],
 [10, 3, 5, 11, 167, 21, 1, 168, 169, 6, 16, 1, 17, 170, 171],
 [23, 55, 172, 173, 13, 14, 174, 175, 15],
 [12, 13, 14, 176, 177, 178, 179, 180, 181, 182, 56],
 [11, 57, 24, 22

In [675]:
# Bring every training sequence to max_len entries; both padding and
# truncation are applied at the end ("post") of the sequence.
train_padded = pad_sequences(
    train_sequences, maxlen=max_len, truncating="post", padding="post"
)
train_padded

array([[  1,   3,   2, ...,   0,   0,   0],
       [ 79,  11,   7, ...,  81,   6,  16],
       [  2,  82,  83, ...,   4,  34,   7],
       ...,
       [ 21, 139,  49, ..., 142, 143, 144],
       [149, 150,  36, ...,   0,   0,   0],
       [152, 153,  10, ...,   0,   0,   0]])

In [676]:
# Encode the held-out sentences with the tokenizer fitted on the training
# split, then pad/truncate them with the same settings as the training data.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(
    test_sequences, maxlen=max_len, padding="post", truncating="post"
)
test_padded

array([[  2,  53,  54, ...,  22,  53,  54],
       [  6, 160,  52, ...,   7,   0,   0],
       [164,   7, 165, ...,   0,   0,   0],
       ...,
       [ 25, 287,  68, ...,   0,   0,   0],
       [289, 290,  45, ...,   2,   0,   0],
       [  1, 293,  72, ..., 297, 298, 299]])

In [677]:
# Show a raw sentence next to its integer encoding.
print(train.text[0])
print(train_sequences[0])
# NOTE(review): test.text comes from dataset/wiki.csv, whereas test_sequences
# was built from test_sentences (the held-out rows of ``train``), so the two
# values printed below do not describe the same sentence.
print(test.text[0])
print(test_sequences[0])

nursultan capital city kazakhstan
[1, 3, 2, 4]
Kazakhstan officially the Republic of Kazakhstan is a transcontinental country mainly located in Central Asia with a smaller portion west of the Ural River in Eastern Europe
[2, 53, 54, 155, 156, 157, 158, 22, 53, 54, 159]


In [678]:
# Word -> integer id mapping built by the fitted tokenizer.
word_index = tokenizer.word_index
print("Number of unique words:", len(word_index))

Number of unique words: 310


In [679]:
# Display the full word -> id mapping.
word_index

{'nursultan': 1,
 'city': 2,
 'capital': 3,
 'kazakhstan': 4,
 'renamed': 5,
 'kazakh': 6,
 'akmola': 7,
 'economic': 8,
 'increased': 9,
 'march': 10,
 'astana': 11,
 'citys': 12,
 'industrial': 13,
 'production': 14,
 'construction': 15,
 'president': 16,
 'nazarbayev': 17,
 'became': 18,
 'special': 19,
 'cities': 20,
 'name': 21,
 'center': 22,
 'nursultans': 23,
 'financial': 24,
 'growth': 25,
 'times': 26,
 'gross': 27,
 'product': 28,
 'projects': 29,
 'small': 30,
 'mediumsized': 31,
 'number': 32,
 'russian': 33,
 'within': 34,
 'region': 35,
 'status': 36,
 'modern': 37,
 'following': 38,
 'parliament': 39,
 'planned': 40,
 'government': 41,
 'almaty': 42,
 'previous': 43,
 'since': 44,
 'one': 45,
 'central': 46,
 'asia': 47,
 'numerous': 48,
 'given': 49,
 'local': 50,
 'means': 51,
 'tselinograd': 52,
 'virgin': 53,
 'lands': 54,
 'economy': 55,
 'engineering': 56,
 'international': 57,
 'july': 58,
 'samrukkazyna': 59,
 'development': 60,
 'attracted': 61,
 'investors': 

# Glove vector dataset 
The GloVe Twitter dataset (27 billion tokens) is one of the pre-trained vector sets published by the authors of the paper. Each word is provided as an array of 25 numbers, i.e. a 25-dimensional vector. We need this large dataset to look up the word vectors for the words in our text.


In [654]:
# Parse the pre-trained GloVe file into a word -> vector dictionary.  Each
# line is split on whitespace: the first field is the word, the remaining
# fields are converted to a float32 array.
embedding_dict = {}
with open("glove.twitter.27B.25d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        embedding_dict[values[0]] = np.asarray(values[1:], "float32")

In [693]:
# Row i of the matrix receives the GloVe vector of the word whose tokenizer
# id is i; rows that are never assigned (words missing from GloVe) stay zero.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 25))

for word, i in word_index.items():
    if i >= num_words:
        continue
    emb_vec = embedding_dict.get(word)
    if emb_vec is None:
        continue
    embedding_matrix[i] = emb_vec

In [692]:
# Print the shapes of the features and labels of both splits.
for _array in (train_padded, train_labels, test_padded, test_labels):
    print(_array.shape)

(500, 10)
(500,)
(126, 10)
(126,)


# Training 
Using the Keras library, we build a model on top of the pre-trained word representations. The embedding layer's parameters are the vocabulary size (`num_words`), the word vectors (`embedding_matrix`) and the vector dimension (25). Training runs for 10 epochs (passes over the data).

In [688]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

# Frozen GloVe embedding -> LSTM -> single sigmoid unit.
model = Sequential(
    [
        Embedding(
            num_words,
            25,
            embeddings_initializer=Constant(embedding_matrix),
            input_length=max_len,
            trainable=False,
        ),
        LSTM(100, dropout=0.1),
        Dense(1, activation="sigmoid"),
    ]
)

optimizer = Adam(learning_rate=0.01)
model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

# Ten passes over the training split, validating on the held-out split.
history = model.fit(
    train_padded,
    train_labels,
    epochs=10,
    validation_data=(test_padded, test_labels),
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [689]:
# Encode and pad the sentences of ``test`` with the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(test.text)
padded = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")
# Raw model outputs; pred_int holds the same values rounded and cast to int.
pred = model.predict(padded)
pred_int = pred.round().astype("int")
pred



array([[0.99999845],
       [0.99999845],
       [0.99999857],
       [0.99999845],
       [0.9999985 ],
       [0.99999857],
       [0.99999857]], dtype=float32)

In [690]:
# Invert the tokenizer vocabulary so integer ids can be mapped back to words.
reverse_word_index = {index: word for word, index in word_index.items()}


def decode(text):
    """Turn a sequence of word ids back into a space-separated string.

    Ids that are missing from ``reverse_word_index`` are rendered as "?".
    """
    return " ".join(reverse_word_index.get(word_id, "?") for word_id in text)


# Print the decoded form of the encoded sequence behind each padded row.
for n, _row in enumerate(padded):
    print(decode(sequences[n]))

kazakhstan kazakhstan country mainly central asia river
also large part
kazakhstan
kazakhstan world country country world
population million one population world people per
since capital known astana
almaty country city


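- These are the words of each test sentence that the tokenizer recognises, i.e. the words that also occur in the vocabulary built from our text; all other words are dropped during encoding.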