In [1]:
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('axes', grid=True)

from nltk.stem import WordNetLemmatizer

In [2]:
# Project root. Defaults to the original workstation path, but can be
# overridden with the NLP_DISASTER_TWEETS_ROOT environment variable so the
# notebook also runs on machines with a different directory layout.
root_dir = os.environ.get(
    'NLP_DISASTER_TWEETS_ROOT',
    'C:/Users/delst/OneDrive/Desktop/Code/Workspace/NLP_Disaster_Tweets',
)
# Guard the append so re-running this cell does not pile up duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Project-local import: must come after root_dir is on sys.path.
from A_Main.Configurations.setup_env import setup_environment
config = setup_environment(root_dir)

# File Paths
sdo_pkl = config.sdo_pkl
sdo_parq = config.sdo_parq

# Class Imports

# Figure Settings
fig_size = config.fig_size_m

---

# **Data Load**

In [3]:
# Build the path of the training parquet file inside the parquet store
# (`sdo_parq` is read from `config` in the setup cell).
filename = 'C2_Initial_Processing/train.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

# Read the file into the training DataFrame.
df_train = pd.read_parquet(path_to_parq_store)

In [4]:
# Build the path of the test parquet file inside the parquet store.
# Note: this rebinds `filename` and `path_to_parq_store` from the previous cell.
filename = 'C2_Initial_Processing/test.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

# Read the file into the test DataFrame.
df_test = pd.read_parquet(path_to_parq_store)

In [5]:
# Preview the first rows of both splits; display() renders its arguments
# one after another, in the order given.
display(df_train.head(), df_test.head())

Unnamed: 0,keyword,text,target,tokens
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]"
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se..."
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]"
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]"
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around..."


Unnamed: 0,id,keyword,text,tokens
15,46,ablaze,birmingham wholesale market is ablaze bbc news...,"[birmingham, wholesale, market, is, ablaze, bb..."
16,47,ablaze,@sunkxssedharry will you wear shorts for race ...,"[@sunkxssedharry, will, you, wear, shorts, for..."
17,51,ablaze,#previouslyondoyintv toke makinwaûªs marriage ...,"[#previouslyondoyintv, toke, makinwaûªs, marri..."
18,58,ablaze,check these out #nsfw,"[check, these, out, #nsfw]"
19,60,ablaze,psa iûªm splitting my personalities?? techies ...,"[psa, iûªm, splitting, my, personalities, ?, ?..."


In [6]:
# Work on an independent copy: a plain `df = df_train` only creates a second
# name for the same object, so the columns added to `df` further down would
# silently appear on `df_train` as well.
df = df_train.copy()
df

Unnamed: 0,keyword,text,target,tokens
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]"
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se..."
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]"
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]"
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around..."
...,...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, wrecked]"
6063,wrecked,three days off from work and they have pretty ...,0,"[three, days, work, pretty, much, wrecked, hah..."
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word..."
6065,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, british, lion..."


---

In [7]:
def extract_hashtags(tokens):
    """Collect the tokens that begin with ``#``, keeping their original order.

    Parameters
    ----------
    tokens : iterable of str
        String tokens to scan.

    Returns
    -------
    list of str
        Every token whose first character is ``#``; empty if none match.
    """
    hashtags = []
    for word in tokens:
        if word.startswith('#'):
            hashtags.append(word)
    return hashtags

def extract_mentions(tokens):
    """Collect the tokens that begin with ``@``, keeping their original order.

    Parameters
    ----------
    tokens : iterable of str
        String tokens to scan.

    Returns
    -------
    list of str
        Every token whose first character is ``@``; empty if none match.
    """
    mentions = []
    for word in tokens:
        if word.startswith('@'):
            mentions.append(word)
    return mentions

def extract_questions(tokens):
    """Collect every token that is exactly ``'?'``.

    Parameters
    ----------
    tokens : iterable
        Tokens to scan.

    Returns
    -------
    list
        One ``'?'`` entry per matching token, so the length of the result is
        the number of stand-alone question-mark tokens.
    """
    question_marks = []
    for item in tokens:
        if item == '?':
            question_marks.append(item)
    return question_marks

def extract_exclamations(tokens):
    """Collect every token that is exactly ``'!'``.

    Parameters
    ----------
    tokens : iterable
        Tokens to scan.

    Returns
    -------
    list
        One ``'!'`` entry per matching token, so the length of the result is
        the number of stand-alone exclamation-mark tokens.
    """
    exclamation_marks = []
    for item in tokens:
        if item == '!':
            exclamation_marks.append(item)
    return exclamation_marks

In [8]:
# Derive one list-valued column per token category from the 'tokens' column.
# The (column name, extractor) pairs are applied in the order listed.
for _column, _extractor in (
    ('hashtags', extract_hashtags),
    ('mentions', extract_mentions),
    ('questions', extract_questions),
    ('exclamations', extract_exclamations),
):
    df[_column] = df['tokens'].apply(_extractor)
df

Unnamed: 0,keyword,text,target,tokens,hashtags,mentions,questions,exclamations
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]","[#metal, #rt]",[],[],[]
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se...",[#africanbaze],[],[],[]
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]",[],[],[],[!]
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]",[],[],[],[]
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around...",[#mufc],[@phdsquares],[],[]
...,...,...,...,...,...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, wrecked]",[],"[@jt_ruff23, @cameronhacker]",[],[]
6063,wrecked,three days off from work and they have pretty ...,0,"[three, days, work, pretty, much, wrecked, hah...",[],[],[],[]
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word...","[#fx, #forex, #trading]",[],[],[]
6065,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, british, lion...",[],[@engineshed],[],[]


---

In [9]:
# SAVE - written once and skipped automatically when the file already exists,
# so the cell no longer has to be commented out by hand after the first run.
# Delete the parquet file to force a fresh save after changing the features.
df_to_save = df
filename = 'E1_Feature_Extraction/train.parquet'

file_path = os.path.join(sdo_parq, filename)
if not os.path.exists(file_path):
    # Make sure the stage folder exists before the first write.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    df_to_save.to_parquet(file_path)

In [10]:
# Build the path of the E1_Feature_Extraction training parquet file inside
# the parquet store.
filename = 'E1_Feature_Extraction/train.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

# Rebind `df_train` to the frame read from that file; this replaces the frame
# loaded earlier from C2_Initial_Processing.
df_train = pd.read_parquet(path_to_parq_store)
df_train

Unnamed: 0,keyword,text,target,tokens,hashtags,mentions,questions,exclamations
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]","[#metal, #rt]",[],[],[]
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se...",[#africanbaze],[],[],[]
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]",[],[],[],[!]
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]",[],[],[],[]
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around...",[#mufc],[@phdsquares],[],[]
...,...,...,...,...,...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, wrecked]",[],"[@jt_ruff23, @cameronhacker]",[],[]
6063,wrecked,three days off from work and they have pretty ...,0,"[three, days, work, pretty, much, wrecked, hah...",[],[],[],[]
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word...","[#fx, #forex, #trading]",[],[],[]
6065,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, british, lion...",[],[@engineshed],[],[]


---

In [11]:
# Project-local class; presumably resolved through the root_dir entry added
# to sys.path in the setup cell - TODO confirm.
from Modularization.feature_extraction import FeatureExtractor

# Construct the extractor from the reloaded training frame and the string
# 'tokens' (the name of the token-list column in df_train).
feature_extractor = FeatureExtractor(df_train, 'tokens')
# Rebinds `df` to whatever extract() returns.
# NOTE(review): FeatureExtractor's source is not visible here; it presumably
# produces the hashtags/mentions/questions/exclamations columns - confirm.
df = feature_extractor.extract()

In [12]:
# Show the frame returned by feature_extractor.extract() for visual inspection.
df

Unnamed: 0,keyword,text,target,tokens,hashtags,mentions,questions,exclamations
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]","[#metal, #rt]",[],[],[]
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se...",[#africanbaze],[],[],[]
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]",[],[],[],[!]
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]",[],[],[],[]
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around...",[#mufc],[@phdsquares],[],[]
...,...,...,...,...,...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, wrecked]",[],"[@jt_ruff23, @cameronhacker]",[],[]
6063,wrecked,three days off from work and they have pretty ...,0,"[three, days, work, pretty, much, wrecked, hah...",[],[],[],[]
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word...","[#fx, #forex, #trading]",[],[],[]
6065,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, british, lion...",[],[@engineshed],[],[]
