In [1]:
import sys
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('axes', grid=True)

from nltk.stem import WordNetLemmatizer

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [2]:
root_dir = 'C:/Users/delst/OneDrive/Desktop/Code/Workspace/NLP_Disaster_Tweets'
sys.path.append(root_dir)

from A_Root_Dir.Configurations.setup_env import setup_environment
config = setup_environment(root_dir)

# File Paths
sdo_pkl = config.sdo_pkl
sdo_parq = config.sdo_parq

glove_input_dir = config.GloVe_input_dir
glove_output_dir = config.GloVe_output_dir

# Class Imports
# from Modularization.corpus_bow import TextProcessor
from Modularization.corpus_creation import load_corpus_bow

fig_size = config.fig_size_m

---

In [3]:
filename = 'E1_Feature_Extraction/train.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

df_train = pd.read_parquet(path_to_parq_store)

In [4]:
display(df_train.head())

Unnamed: 0,keyword,text,target,tokens,hashtags,mentions,questions,exclamations
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]","[#metal, #rt]",[],[],[]
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se...",[#africanbaze],[],[],[]
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]",[],[],[],[!]
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]",[],[],[],[]
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around...",[#mufc],[@phdsquares],[],[]


In [5]:
corpus_doc_1, corpus_word_1, bow_1, bow_fd_1 = load_corpus_bow('1')
corpus_doc_0, corpus_word_0, bow_0, bow_fd_0 = load_corpus_bow('0')
corpus_word_sw, bow_sw, bow_fd_sw = load_corpus_bow('sw')

In [6]:
df = df_train
df

Unnamed: 0,keyword,text,target,tokens,hashtags,mentions,questions,exclamations
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]","[#metal, #rt]",[],[],[]
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se...",[#africanbaze],[],[],[]
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]",[],[],[],[!]
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]",[],[],[],[]
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around...",[#mufc],[@phdsquares],[],[]
...,...,...,...,...,...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, wrecked]",[],"[@jt_ruff23, @cameronhacker]",[],[]
6063,wrecked,three days off from work and they have pretty ...,0,"[three, days, work, pretty, much, wrecked, hah...",[],[],[],[]
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word...","[#fx, #forex, #trading]",[],[],[]
6065,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, british, lion...",[],[@engineshed],[],[]


---

# **Lematisation**

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [8]:
def apply_lemmatizer(tokens):
    lemm_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemm_tokens
df['tokens_lemm'] = df['tokens'].apply(apply_lemmatizer)
df

Unnamed: 0,keyword,text,target,tokens,hashtags,mentions,questions,exclamations,tokens_lemm
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]","[#metal, #rt]",[],[],[],"[always, try, bring, heavy, #metal, #rt]"
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se...",[#africanbaze],[],[],[],"[#africanbaze, breaking, newsnigeria, flag, se..."
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]",[],[],[],[!],"[cry, !, set, ablaze]"
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]",[],[],[],[],"[plus, side, look, sky, last, night, ablaze]"
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around...",[#mufc],[@phdsquares],[],[],"[@phdsquares, #mufc, built, much, hype, around..."
...,...,...,...,...,...,...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, wrecked]",[],"[@jt_ruff23, @cameronhacker]",[],[],"[@jt_ruff23, @cameronhacker, wrecked]"
6063,wrecked,three days off from work and they have pretty ...,0,"[three, days, work, pretty, much, wrecked, hah...",[],[],[],[],"[three, day, work, pretty, much, wrecked, haha..."
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word...","[#fx, #forex, #trading]",[],[],[],"[#fx, #forex, #trading, cramer, igers, 3, word..."
6065,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, british, lion...",[],[@engineshed],[],[],"[@engineshed, great, atmosphere, british, lion..."


---

# **GloVe Embeddings**

In [9]:
glove_input_file = os.path.join(glove_input_dir, 'glove.twitter.27B.25d.txt')
word2vec_output_file = os.path.join(glove_output_dir, 'glove.twitter.27B.25d.txt.word2vec')
glove2word2vec(glove_input_file, word2vec_output_file)

  glove2word2vec(glove_input_file, word2vec_output_file)


(1193514, 25)

In [10]:
# Load these vectors into a new model
embed_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [11]:
# Define function to calculate average word vectors
# def average_word_vectors(words, embed_model, num_features):
def average_word_vectors(corpus, embed_model, num_features):

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    for word in corpus:
        if word in embed_model:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, embed_model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

# Define function to apply average word vectors to each document (list of tokens)
def averaged_word_vectorizer(corpus, embed_model, num_features):
    vocabulary = set(embed_model.index_to_key)
    features = [average_word_vectors(tokenized_sentence, embed_model, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [12]:
# Apply the function to our tokenized texts
glove_features_tokens = averaged_word_vectorizer(corpus=df['tokens'], embed_model=embed_model, num_features=25) # corpus=df['tokens_lemm']
glove_features_keyword = averaged_word_vectorizer(corpus=df['keyword'], embed_model=embed_model, num_features=25) # corpus=df['tokens_lemm']

# glove_features is a 2D numpy array where each row is the vector representation of a document

In [13]:
def transform_data_to_glove_features(df, embed_model, num_features):
    df['tokens'] = df['text'].apply(tokenize)
    glove_features = averaged_word_vectorizer(corpus=df['tokens'], embed_model=embed_model, num_features=num_features)
    return glove_features

---

In [14]:
# # Apply the function to our tokenized texts
# train_features = transform_data_to_glove_features(df, embed_model, 25)
# test_features = transform_data_to_glove_features(test_df, embed_model, 25)

---

In [15]:
glove_features_tokens_file_path = os.path.join(sdo_pkl, 'glove_features_tokens.pkl')
glove_features_keyword_file_path = os.path.join(sdo_pkl, 'glove_features_keyword.pkl')

In [16]:
with open(glove_features_tokens_file_path, 'wb') as file:
    pickle.dump(glove_features_tokens, file)

In [17]:
with open(glove_features_keyword_file_path, 'wb') as file:
    pickle.dump(glove_features_keyword, file)

---

In [18]:
with open(glove_features_tokens_file_path, 'rb') as file:
    glove_features_tokens = pickle.load(file)
glove_features_tokens

array([[-0.448208  ,  0.6966934 , -0.62950339, ...,  0.41236001,
         0.295904  , -0.29517521],
       [-0.28123199, -0.14992025,  0.41934039, ...,  0.33308421,
        -0.39357399, -0.1966228 ],
       [-0.56035998, -0.30126308,  0.53712749, ...,  0.3417015 ,
         0.28107351,  0.19530224],
       ...,
       [-0.44565713,  0.23500086, -0.14528557, ..., -0.01756999,
        -0.83696286, -0.48966571],
       [-0.77007711,  0.19694625,  0.10133375, ..., -0.1089725 ,
        -0.74576937, -0.00999312],
       [-0.38757999,  0.11865657,  0.220263  , ..., -0.06108142,
        -0.57167999, -0.59350286]])

In [19]:
with open(glove_features_keyword_file_path, 'rb') as file:
    glove_features_keyword = pickle.load(file)
glove_features_keyword

array([[0.53803501, 0.21774233, 0.05429499, ..., 0.41333501, 0.10137667,
        0.20029733],
       [0.53803501, 0.21774233, 0.05429499, ..., 0.41333501, 0.10137667,
        0.20029733],
       [0.53803501, 0.21774233, 0.05429499, ..., 0.41333501, 0.10137667,
        0.20029733],
       ...,
       [0.79055715, 0.21167286, 0.06580042, ..., 0.35233899, 0.51870144,
        0.22752613],
       [0.79055715, 0.21167286, 0.06580042, ..., 0.35233899, 0.51870144,
        0.22752613],
       [0.79055715, 0.21167286, 0.06580042, ..., 0.35233899, 0.51870144,
        0.22752613]])

In [20]:
df

Unnamed: 0,keyword,text,target,tokens,hashtags,mentions,questions,exclamations,tokens_lemm
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]","[#metal, #rt]",[],[],[],"[always, try, bring, heavy, #metal, #rt]"
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se...",[#africanbaze],[],[],[],"[#africanbaze, breaking, newsnigeria, flag, se..."
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]",[],[],[],[!],"[cry, !, set, ablaze]"
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]",[],[],[],[],"[plus, side, look, sky, last, night, ablaze]"
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around...",[#mufc],[@phdsquares],[],[],"[@phdsquares, #mufc, built, much, hype, around..."
...,...,...,...,...,...,...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, wrecked]",[],"[@jt_ruff23, @cameronhacker]",[],[],"[@jt_ruff23, @cameronhacker, wrecked]"
6063,wrecked,three days off from work and they have pretty ...,0,"[three, days, work, pretty, much, wrecked, hah...",[],[],[],[],"[three, day, work, pretty, much, wrecked, haha..."
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word...","[#fx, #forex, #trading]",[],[],[],"[#fx, #forex, #trading, cramer, igers, 3, word..."
6065,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, british, lion...",[],[@engineshed],[],[],"[@engineshed, great, atmosphere, british, lion..."


---

In [21]:
# SAVE - Comment out once saved

df_to_save = df
filename = 'F1_Preprocessing/train.parquet'

file_path = os.path.join(sdo_parq, filename)
df_to_save.to_parquet(file_path)

In [22]:
filename = 'F1_Preprocessing/train.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

df_train = pd.read_parquet(path_to_parq_store)
df_train

Unnamed: 0,keyword,text,target,tokens,hashtags,mentions,questions,exclamations,tokens_lemm
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]","[#metal, #rt]",[],[],[],"[always, try, bring, heavy, #metal, #rt]"
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se...",[#africanbaze],[],[],[],"[#africanbaze, breaking, newsnigeria, flag, se..."
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]",[],[],[],[!],"[cry, !, set, ablaze]"
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]",[],[],[],[],"[plus, side, look, sky, last, night, ablaze]"
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around...",[#mufc],[@phdsquares],[],[],"[@phdsquares, #mufc, built, much, hype, around..."
...,...,...,...,...,...,...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, wrecked]",[],"[@jt_ruff23, @cameronhacker]",[],[],"[@jt_ruff23, @cameronhacker, wrecked]"
6063,wrecked,three days off from work and they have pretty ...,0,"[three, days, work, pretty, much, wrecked, hah...",[],[],[],[],"[three, day, work, pretty, much, wrecked, haha..."
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word...","[#fx, #forex, #trading]",[],[],[],"[#fx, #forex, #trading, cramer, igers, 3, word..."
6065,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, british, lion...",[],[@engineshed],[],[],"[@engineshed, great, atmosphere, british, lion..."


---

In [23]:
from Modularization.preprocessing import Preprocessor
from Modularization.preprocessing import Embeddings 

In [24]:
glove_input_file = os.path.join(glove_input_dir, 'glove.twitter.27B.25d.txt')
word2vec_output_file = os.path.join(glove_output_dir, 'glove.twitter.27B.25d.txt.word2vec')
glove2word2vec(glove_input_file, word2vec_output_file)

embed_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

  glove2word2vec(glove_input_file, word2vec_output_file)


In [25]:
def preprocessing_pipeline(df):
    preprocessor = Preprocessor(df, 'tokens')    
    lemmatize_tokens = preprocessor.apply_lemmatizer()
    
    embedder = Embeddings(embed_model, 25)
    embed_tokens = embedder.averaged_word_vectorizer(df['tokens'])
    embed_keyword = embedder.averaged_word_vectorizer(df['keyword'])
    
    
    return embed_tokens, embed_keyword


# preprocessing_pipeline(df)

In [26]:
df

Unnamed: 0,keyword,text,target,tokens,hashtags,mentions,questions,exclamations,tokens_lemm
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]","[#metal, #rt]",[],[],[],"[always, try, bring, heavy, #metal, #rt]"
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se...",[#africanbaze],[],[],[],"[#africanbaze, breaking, newsnigeria, flag, se..."
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]",[],[],[],[!],"[cry, !, set, ablaze]"
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]",[],[],[],[],"[plus, side, look, sky, last, night, ablaze]"
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around...",[#mufc],[@phdsquares],[],[],"[@phdsquares, #mufc, built, much, hype, around..."
...,...,...,...,...,...,...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, wrecked]",[],"[@jt_ruff23, @cameronhacker]",[],[],"[@jt_ruff23, @cameronhacker, wrecked]"
6063,wrecked,three days off from work and they have pretty ...,0,"[three, days, work, pretty, much, wrecked, hah...",[],[],[],[],"[three, day, work, pretty, much, wrecked, haha..."
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word...","[#fx, #forex, #trading]",[],[],[],"[#fx, #forex, #trading, cramer, igers, 3, word..."
6065,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, british, lion...",[],[@engineshed],[],[],"[@engineshed, great, atmosphere, british, lion..."
