In [1]:
import sys
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('axes', grid=True)

import contractions

from nltk.tokenize import TweetTokenizer

In [2]:
# Project root. Set the NLP_DISASTER_TWEETS_ROOT environment variable to run
# this notebook on another machine; the original location is the fallback.
root_dir = os.environ.get(
    'NLP_DISASTER_TWEETS_ROOT',
    'C:/Users/delst/OneDrive/Desktop/Code/Workspace/NLP_Disaster_Tweets',
)
# Guard the append so re-running this cell does not stack duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# These project imports rely on root_dir being on sys.path (set just above).
from A_Root_Dir.Configurations.setup_env import setup_environment
config = setup_environment(root_dir)

# File Paths (read from the config object returned by setup_environment)
sdo_pkl = config.sdo_pkl
sdo_parq = config.sdo_parq

# Class Imports
from Modularization.corpus_creation import CorpusBowCreator, load_corpus_bow

fig_size = config.fig_size_m

---

In [3]:
# Read the training split from 'C1_Initial_Processing/train.parquet',
# resolved relative to the sdo_parq directory taken from the config.
filename = 'C1_Initial_Processing/train.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

df_train = pd.read_parquet(path_to_parq_store)

In [4]:
# Read the test split from 'C1_Initial_Processing/test.parquet'.
# Note: this rebinds filename and path_to_parq_store from the previous cell.
filename = 'C1_Initial_Processing/test.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

df_test = pd.read_parquet(path_to_parq_store)

Unnamed: 0,keyword,text
15,ablaze,birmingham wholesale market is ablaze bbc news...
16,ablaze,@sunkxssedharry will you wear shorts for race ...
17,ablaze,#previouslyondoyintv toke makinwaûªs marriage ...
18,ablaze,check these out #nsfw
19,ablaze,psa iûªm splitting my personalities?? techies ...
...,...,...
3247,wrecked,rt cnbc 3 words from disney ceo bob iger wreck...
3248,wrecked,smackdown tyme this should put me in a good mo...
3249,wrecked,@thrillhho jsyk i havent stopped thinking abt ...
3250,wrecked,@stighefootball begovic has been garbage he go...


In [5]:
# Preview the first rows of both splits, train first and test second.
display(df_train.head(), df_test.head())

Unnamed: 0,keyword,text,target
25,ablaze,we always try to bring the heavy #metal #rt,0
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1
27,ablaze,crying out for more! set me ablaze,0
28,ablaze,on plus side look at the sky last night it was...,0
29,ablaze,@phdsquares #mufc theyve built so much hype ar...,0


Unnamed: 0,keyword,text
15,ablaze,birmingham wholesale market is ablaze bbc news...
16,ablaze,@sunkxssedharry will you wear shorts for race ...
17,ablaze,#previouslyondoyintv toke makinwaûªs marriage ...
18,ablaze,check these out #nsfw
19,ablaze,psa iûªm splitting my personalities?? techies ...


In [6]:
# Work on an independent copy: the cells below assign columns on df in place,
# and a plain alias would silently apply those edits to df_train as well.
df = df_train.copy()

---

In [7]:
# Run contractions.fix over every tweet in both splits; the two frames are
# independent, so both results are computed first and then written back.
df['text'], df_test['text'] = (
    df['text'].apply(contractions.fix),
    df_test['text'].apply(contractions.fix),
)

In [8]:
# Display df after contractions.fix has been applied to its 'text' column.
df

Unnamed: 0,keyword,text,target
25,ablaze,we always try to bring the heavy #metal #rt,0
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1
27,ablaze,crying out for more! set me ablaze,0
28,ablaze,on plus side look at the sky last night it was...,0
29,ablaze,@phdsquares #mufc they have built so much hype...,0
...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0
6063,wrecked,three days off from work and they have pretty ...,0
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0
6065,wrecked,@engineshed great atmosphere at the british li...,0


In [9]:
# One shared TweetTokenizer instance produces the 'tokens' column of both
# splits from their 'text' column.
tokenizer = TweetTokenizer()
df['tokens'], df_test['tokens'] = (
    df['text'].apply(tokenizer.tokenize),
    df_test['text'].apply(tokenizer.tokenize),
)

In [10]:
# Display df, which now carries the 'tokens' column added by the tokenizer.
df

Unnamed: 0,keyword,text,target,tokens
25,ablaze,we always try to bring the heavy #metal #rt,0,"[we, always, try, to, bring, the, heavy, #meta..."
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se..."
27,ablaze,crying out for more! set me ablaze,0,"[crying, out, for, more, !, set, me, ablaze]"
28,ablaze,on plus side look at the sky last night it was...,0,"[on, plus, side, look, at, the, sky, last, nig..."
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, they, have, built, so, mu..."
...,...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, and, i, wrecked, ..."
6063,wrecked,three days off from work and they have pretty ...,0,"[three, days, off, from, work, and, they, have..."
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word..."
6065,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, at, the, brit..."


In [11]:
# Build one CorpusBowCreator per value (1 first, then 0) over the 'tokens'
# column. NOTE(review): the second argument presumably selects rows by the
# 'target' column — confirm in CorpusBowCreator.create_corpus.
processor_1, processor_0 = (
    CorpusBowCreator.create_corpus(df, label, 'tokens')
    for label in (1, 0)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[self.token_col] = self.df[self.token_col].apply(self.remove_stop_words)


In [12]:
# Pull the document-level and word-level corpora off each processor.
corpus_doc_1, corpus_word_1 = processor_1.corpus_doc, processor_1.corpus_word
corpus_doc_0, corpus_word_0 = processor_0.corpus_doc, processor_0.corpus_word

In [13]:
# Generate the bag-of-words for each processor from its word-level corpus,
# then read back the bow / bow_fd attributes it exposes afterwards.
processor_1.generate_bow(corpus_word_1)
bow_1, bow_fd_1 = processor_1.bow, processor_1.bow_fd

processor_0.generate_bow(corpus_word_0)
bow_0, bow_fd_0 = processor_0.bow, processor_0.bow_fd

In [14]:
# Fetch corpus / bag-of-words objects through load_corpus_bow. This rebinds the
# corpus_doc_*, corpus_word_*, bow_* and bow_fd_* names that the preceding cells
# assigned from processor_1 / processor_0.
# NOTE(review): the 'sw' key is unpacked into three values while '1' and '0'
# are unpacked into four — confirm against load_corpus_bow's return values.
corpus_doc_1, corpus_word_1, bow_1, bow_fd_1 = load_corpus_bow('1')
corpus_doc_0, corpus_word_0, bow_0, bow_fd_0 = load_corpus_bow('0')
corpus_word_sw, bow_sw, bow_fd_sw = load_corpus_bow('sw')

In [15]:
# Filter the training tokens through both processors' remove_stop_words,
# processor_1 first and processor_0 second.
# NOTE(review): df_test['tokens'] is not filtered in this cell — confirm that
# leaving the test split untouched here is intended.
df['tokens'] = (
    df['tokens']
    .apply(processor_1.remove_stop_words)
    .apply(processor_0.remove_stop_words)
)

---

In [16]:
# Preview the first rows of the processed training frame and of the test frame.
display(df.head(), df_test.head())

Unnamed: 0,keyword,text,target,tokens
25,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]"
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se..."
27,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]"
28,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]"
29,ablaze,@phdsquares #mufc they have built so much hype...,0,"[@phdsquares, #mufc, built, much, hype, around..."


Unnamed: 0,keyword,text,tokens
15,ablaze,birmingham wholesale market is ablaze bbc news...,"[birmingham, wholesale, market, is, ablaze, bb..."
16,ablaze,@sunkxssedharry will you wear shorts for race ...,"[@sunkxssedharry, will, you, wear, shorts, for..."
17,ablaze,#previouslyondoyintv toke makinwaûªs marriage ...,"[#previouslyondoyintv, toke, makinwaûªs, marri..."
18,ablaze,check these out #nsfw,"[check, these, out, #nsfw]"
19,ablaze,psa iûªm splitting my personalities?? techies ...,"[psa, iûªm, splitting, my, personalities, ?, ?..."


In [30]:
# SAVE - Comment out once saved

df_to_save = df
filename = 'C2_Initial_Processing/train.parquet'

file_path = os.path.join(sdo_parq, filename)
# to_parquet does not create missing parent folders, so make sure the
# C2_Initial_Processing directory exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
df_to_save.to_parquet(file_path)

---

In [27]:
# Read the training split back from 'C2_Initial_Processing/train.parquet'
# (the path the save cell writes to). This rebinds df_train.
filename = 'C2_Initial_Processing/train.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

df_train = pd.read_parquet(path_to_parq_store)

In [20]:
# Read the test split from 'C2_Initial_Processing/test.parquet'. This rebinds df_test.
# NOTE(review): no cell visible in this part of the notebook writes this file —
# confirm where it is produced.
filename = 'C2_Initial_Processing/test.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

df_test = pd.read_parquet(path_to_parq_store)

In [21]:
# Display the training frame as loaded from the C2 parquet file.
df_train

Unnamed: 0,id,keyword,text,target,tokens
31,48,ablaze,@bbcmtd wholesale markets ablaze,1,"[@bbcmtd, wholesale, markets, ablaze]"
32,49,ablaze,we always try to bring the heavy #metal #rt,0,"[always, try, bring, heavy, #metal, #rt]"
33,50,ablaze,#africanbaze breaking newsnigeria flag set abl...,1,"[#africanbaze, breaking, newsnigeria, flag, se..."
34,52,ablaze,crying out for more! set me ablaze,0,"[crying, !, set, ablaze]"
35,53,ablaze,on plus side look at the sky last night it was...,0,"[plus, side, look, sky, last, night, ablaze]"
...,...,...,...,...,...
7578,10830,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0,"[@jt_ruff23, @cameronhacker, wrecked]"
7579,10831,wrecked,three days off from work and they have pretty ...,0,"[three, days, work, pretty, much, wrecked, hah..."
7580,10832,wrecked,#fx #forex #trading cramer igers 3 words that ...,0,"[#fx, #forex, #trading, cramer, igers, 3, word..."
7581,10833,wrecked,@engineshed great atmosphere at the british li...,0,"[@engineshed, great, atmosphere, british, lion..."


---

# **Summary Pipeline**

In [22]:
# df['text'] = df['text'].apply(contractions.fix)
# df_test['text'] = df_test['text'].apply(contractions.fix)

# tokenizer = TweetTokenizer()
# df['tokens'] = df['text'].apply(tokenizer.tokenize)
# df_test['tokens'] = df_test['text'].apply(tokenizer.tokenize)

# # Load corpus

# processor_1 = CorpusBowCreator.create_corpus(df, 1, 'tokens')
# processor_0 = CorpusBowCreator.create_corpus(df, 0, 'tokens')
# df['tokens'] = df['tokens'].apply(processor_1.remove_stop_words)
# df['tokens'] = df['tokens'].apply(processor_0.remove_stop_words)

In [23]:
from Modularization.corpus_creation import CorpusBowCreatorSingle

In [24]:
def corpus_creation_pipeline(df):
    """Run the text-preparation steps of this notebook on one DataFrame.

    Steps, in order:
      1. apply ``contractions.fix`` to every value of the 'text' column;
      2. tokenize the result with a fresh ``TweetTokenizer`` into 'tokens';
      3. build a ``CorpusBowCreatorSingle`` over the 'tokens' column and pass
         each token list through its ``remove_stop_words`` method.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column. The caller's frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the rewritten 'text' column and a 'tokens' column.
    """
    # Copy up front: the column assignments below would otherwise alter the
    # caller's frame in place (and trigger SettingWithCopyWarning when the
    # caller passes a slice of another frame).
    df = df.copy()

    df['text'] = df['text'].apply(contractions.fix)

    tokenizer = TweetTokenizer()
    df['tokens'] = df['text'].apply(tokenizer.tokenize)

    # The corpus creator is built from the already-tokenized frame; its
    # remove_stop_words method is then applied to each row's token list.
    corpus_creator = CorpusBowCreatorSingle.create_corpus(df, 'tokens')
    df['tokens'] = df['tokens'].apply(corpus_creator.remove_stop_words)

    return df

# df_train = corpus_creation_pipeline(df_train)
# df_test = corpus_creation_pipeline(df_test)