In [1]:
import sys
import os

import pandas as pd
import pprint

import string
import re
import nltk

In [2]:
# Project root. Set the NLP_DISASTER_TWEETS_ROOT environment variable to run
# this notebook from another checkout; when it is unset (or empty) the
# original local path below is used.
root_dir = (
    os.environ.get('NLP_DISASTER_TWEETS_ROOT')
    or 'C:/Users/delst/OneDrive/Desktop/Code/Workspace/NLP_Disaster_Tweets'
)
# Guard the append so re-running this cell does not stack duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Project-local import: resolvable only after root_dir is on sys.path.
from A_Main.Configurations.setup_env import setup_environment
config = setup_environment(root_dir)

# Class Imports
from Modularization.metadata import BasicMetaData
from Modularization.initial_process import InitialDataProcessing
from Modularization.initial_process import InitialTextProcessing
from Modularization.corpus_creation import CorpusBowCreator

# File Paths (read from the project config)
sdo_pkl = config.sdo_pkl
sdo_parq = config.sdo_parq

# Pipeline Parameters (read from the project config)
fig_size = config.fig_size_m
raw_dtype = config.raw_dtype
true_dtype = config.true_dtype

drop_cols = config.drop_cols
text_col = config.text_col

---

In [3]:
# Resolve the raw CSV location of each split from the project config.
raw_data_path_train, raw_data_path_val, raw_data_path_test = (
    config.raw_data_train,
    config.raw_data_val,
    config.raw_data_test,
)

# Read every split into its own DataFrame.
df_train = pd.read_csv(raw_data_path_train)
df_val = pd.read_csv(raw_data_path_val)
df_test = pd.read_csv(raw_data_path_test)

In [4]:
# Display the raw training split as loaded from CSV (before any processing).
df_train

Unnamed: 0,id,keyword,location,text,target
0,4,,,Forest fire near La Ronge Sask. Canada,1
1,5,,,All residents asked to 'shelter in place' are ...,1
2,6,,,"13,000 people receive #wildfires evacuation or...",1
3,7,,,Just got sent this photo from Ruby #Alaska as ...,1
4,8,,,#RockyFire Update => California Hwy. 20 closed...,1
...,...,...,...,...,...
6085,10866,,,Suicide bomber kills 15 in Saudi security site...,1
6086,10867,,,#stormchase Violent Record Breaking EF-5 El Re...,1
6087,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
6088,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1


---

In [5]:
# Summarise the raw training split; the printed dict holds duplicate, NaN and
# target counts together with their percentages.
get_basic_meta = BasicMetaData(df_train)
# NOTE(review): df_train is passed to both the constructor and the method —
# confirm that both arguments are actually required.
metadata = get_basic_meta.generate_basic_metadata(df_train)
pprint.pprint(metadata)

{'duplicate_count': 0,
 'duplicate_percent': 0.0,
 'nan_count': id             0
keyword       48
location    2020
text           0
target         0
dtype: int64,
 'nan_percent': id           0.00
keyword      0.79
location    33.17
text         0.00
target       0.00
dtype: float64,
 'target_count': 0    3468
1    2622
Name: target, dtype: int64,
 'target_percent': 0    56.95
1    43.05
Name: target, dtype: float64}


In [6]:
# Keep only the metadata entries whose key contains 'percent'
metadata_percents = dict(
    entry for entry in metadata.items() if 'percent' in entry[0]
)
metadata_percents

{'nan_percent': id           0.00
 keyword      0.79
 location    33.17
 text         0.00
 target       0.00
 dtype: float64,
 'target_percent': 0    56.95
 1    43.05
 Name: target, dtype: float64,
 'duplicate_percent': 0.0}

In [7]:
from nltk.tokenize import TweetTokenizer

# Notebook-level tweet tokenizer instance.
tokenizer = TweetTokenizer()
# Column name passed to the text processor as its token column.
token_col = 'tokens'


def initial_process_wrap(
    df: pd.DataFrame,
    true_dtype: dict,
    drop_cols: list,
    text_col: str,
    token_col: str
    ) -> pd.DataFrame:
    """
    Wrapper function for the initial processing steps of one data split.

    The text steps run first (URL removal, lower-casing, character removal),
    followed by the dtype conversion and the duplicate/NaN drop.

    Args:
        df (pd.DataFrame): DataFrame to process.
        true_dtype (dict): Forwarded to ``InitialDataProcessing.transform_dtypes``
            (presumably a column -> dtype mapping; confirm against that class).
        drop_cols (list): Cols to drop; forwarded to
            ``InitialDataProcessing.dup_nan_drop``.
        text_col (str): Name of the text column, forwarded to
            ``InitialTextProcessing``.
        token_col (str): Name of the token column, forwarded to
            ``InitialTextProcessing``.

    Returns:
        pd.DataFrame: The frame returned by the final ``dup_nan_drop`` step.
    """
    # NOTE(review): both processors are built from the same `df`, and the value
    # returned at the end comes from `initial_processor`. This relies on the
    # text steps being visible to it (e.g. in-place edits of a shared frame) —
    # confirm against the class implementations.
    initial_processor = InitialDataProcessing(df)
    initial_text_processor = InitialTextProcessing(df, text_col, token_col)

    # Text cleaning
    df = initial_text_processor.remove_urls()
    df = initial_text_processor.transform_to_lowercase()
    df = initial_text_processor.apply_remove_chars()

    # Frame-level cleaning
    df = initial_processor.transform_dtypes(true_dtype)
    df = initial_processor.dup_nan_drop(drop_cols)

    return df

# Every split goes through the same pipeline with identical settings.
process_settings = dict(
    true_dtype=true_dtype,
    drop_cols=drop_cols,
    text_col=text_col,
    token_col=token_col,
)
df_train = initial_process_wrap(df_train, **process_settings)
df_val = initial_process_wrap(df_val, **process_settings)
df_test = initial_process_wrap(df_test, **process_settings)

  self.df[self.text_col] = self.df[self.text_col].str.replace(r'\s*http?://\S+(\s+|$)', ' ').str.strip()
  self.df[self.text_col] = self.df[self.text_col].str.replace(r'\s*http?://\S+(\s+|$)', ' ').str.strip()
  self.df[self.text_col] = self.df[self.text_col].str.replace(r'\s*http?://\S+(\s+|$)', ' ').str.strip()


---

In [8]:
# Check the first rows of the processed training split.
df_train.head()

Unnamed: 0,keyword,text,target
25,ablaze,we always try to bring the heavy #metal #rt,0
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1
27,ablaze,crying out for more! set me ablaze,0
28,ablaze,on plus side look at the sky last night it was...,0
29,ablaze,@phdsquares #mufc theyve built so much hype ar...,0


In [9]:
# Check the first rows of the processed test split.
df_test.head()

Unnamed: 0,keyword,text
15,ablaze,birmingham wholesale market is ablaze bbc news...
16,ablaze,@sunkxssedharry will you wear shorts for race ...
17,ablaze,#previouslyondoyintv toke makinwaûªs marriage ...
18,ablaze,check these out #nsfw
19,ablaze,psa iûªm splitting my personalities?? techies ...


---

In [11]:
# # SAVE - Comment out once saved

# df_to_save = df_train
# filename = 'C1_Initial_Processing/train.parquet'

# file_path = os.path.join(sdo_parq, filename)
# df_to_save.to_parquet(file_path)

---

In [14]:
# Reload the processed training split from the parquet store
# (same relative file name as the commented-out save cell above).
filename = 'C1_Initial_Processing/train.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

df_train = pd.read_parquet(path_to_parq_store)
df_train

Unnamed: 0,keyword,text,target
25,ablaze,we always try to bring the heavy #metal #rt,0
26,ablaze,#africanbaze breaking newsnigeria flag set abl...,1
27,ablaze,crying out for more! set me ablaze,0
28,ablaze,on plus side look at the sky last night it was...,0
29,ablaze,@phdsquares #mufc theyve built so much hype ar...,0
...,...,...,...
6062,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0
6063,wrecked,three days off from work and theyve pretty muc...,0
6064,wrecked,#fx #forex #trading cramer igers 3 words that ...,0
6065,wrecked,@engineshed great atmosphere at the british li...,0


In [13]:
# Reload the processed test split from the parquet store.
# NOTE(review): this file name does not follow the
# 'C1_Initial_Processing/<split>.parquet' layout used for the train split —
# confirm that it points at the intended file.
filename = 'c1_ini_process_test.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)

df_test = pd.read_parquet(path_to_parq_store)

---