In [1]:
import sys
import os

import pandas as pd
import pprint

import string
import re
import nltk

In [2]:
# Project root. NLP_DISASTER_TWEETS_ROOT overrides it when set, so the notebook
# can run on another machine without editing this cell; otherwise the original
# local checkout path is used.
root_dir = os.environ.get(
    'NLP_DISASTER_TWEETS_ROOT',
    'C:/Users/delst/OneDrive/Desktop/Code/Workspace/NLP_Disaster_Tweets',
)
# Guarded so re-running the cell does not keep appending the same entry.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Project-local imports: these only resolve once root_dir is on sys.path.
from A_Root_Dir.Configurations.setup_env import setup_environment
config = setup_environment(root_dir)

# Class Imports
from Modularization.metadata import BasicMetaData
from Modularization.initial_process import InitialProcessing
from Modularization.initial_process import InitialTextProcessing

# Pipeline Parameters (all read from the project config object)
fig_size = config.fig_size_m
raw_dtype = config.raw_dtype
true_dtype = config.true_dtype

drop_cols = config.drop_cols
text_col = config.text_col

---

In [3]:
# Read the raw train/test CSVs from the paths held on the project config.
train_data_dir = config.raw_data_train
df_train = pd.read_csv(filepath_or_buffer=train_data_dir)

test_data_dir = config.raw_data_test
df_test = pd.read_csv(filepath_or_buffer=test_data_dir)

# Preview the first rows of each frame.
display(df_train.head(), df_test.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


---

In [4]:
# Build the basic metadata summary for the training frame and pretty-print it.
# NOTE(review): df_train is passed both to the constructor and to
# generate_basic_metadata; confirm against BasicMetaData whether both are needed.
get_basic_meta = BasicMetaData(df_train)
metadata = get_basic_meta.generate_basic_metadata(df_train)
pprint.pprint(metadata)

{'duplicate_count': 0,
 'duplicate_percent': 0.0,
 'nan_count': id             0
keyword       61
location    2533
text           0
target         0
dtype: int64,
 'nan_percent': id           0.00
keyword      0.80
location    33.27
text         0.00
target       0.00
dtype: float64,
 'target_count': 0    4342
1    3271
Name: target, dtype: int64,
 'target_percent': 0    57.03
1    42.97
Name: target, dtype: float64}


In [5]:
# Keep only the metadata entries whose key mentions 'percent'.
metadata_percents = {
    stat_name: stat_value
    for stat_name, stat_value in metadata.items()
    if 'percent' in stat_name
}
metadata_percents

{'nan_percent': id           0.00
 keyword      0.80
 location    33.27
 text         0.00
 target       0.00
 dtype: float64,
 'target_percent': 0    57.03
 1    42.97
 Name: target, dtype: float64,
 'duplicate_percent': 0.0}

In [6]:
def initial_process_wrap(
    df: pd.DataFrame,
    true_dtype: dict,
    drop_cols: list,
    text_col: str,
) -> pd.DataFrame:
    """
    Wrapper function for initial processing steps.

    Text cleaning (URL removal, lower-casing, character removal) runs first,
    followed by dtype conversion and the duplicate/NaN drop.

    Args:
        df (pd.DataFrame): DataFrame to process.
        true_dtype (dict): Forwarded to ``InitialProcessing.transform_dtypes``;
            presumably a column -> dtype mapping (confirm against that class).
        drop_cols (list): Cols to drop; forwarded to
            ``InitialProcessing.dup_nan_drop``.
        text_col (str): Text column handed to ``InitialTextProcessing``.

    Returns:
        pd.DataFrame: Whatever the final ``dup_nan_drop`` step returns.
    """
    text_processor = InitialTextProcessing(df, text_col)
    text_processor.remove_urls()
    text_processor.transform_to_lowercase()
    df = text_processor.apply_remove_chars()

    # Build the second processor from the text-cleaned frame explicitly, rather
    # than relying on both processors aliasing one frame mutated in place.
    processor = InitialProcessing(df)
    processor.transform_dtypes(true_dtype)
    return processor.dup_nan_drop(drop_cols)

# Run the same initial processing over the train and test frames.
df_train = initial_process_wrap(
    df=df_train,
    true_dtype=true_dtype,
    drop_cols=drop_cols,
    text_col=text_col,
)
df_test = initial_process_wrap(
    df=df_test,
    true_dtype=true_dtype,
    drop_cols=drop_cols,
    text_col=text_col,
)

  self.df[self.text_col] = self.df[self.text_col].str.replace(r'\s*http?://\S+(\s+|$)', ' ').str.strip()
  self.df[self.text_col] = self.df[self.text_col].str.replace(r'\s*http?://\S+(\s+|$)', ' ').str.strip()


---

In [7]:
# First five rows of the processed training frame.
df_train.head(n=5)

Unnamed: 0,id,keyword,text,target
31,48,ablaze,@bbcmtd wholesale markets ablaze,1
32,49,ablaze,we always try to bring the heavy #metal #rt,0
33,50,ablaze,#africanbaze breaking newsnigeria flag set abl...,1
34,52,ablaze,crying out for more! set me ablaze,0
35,53,ablaze,on plus side look at the sky last night it was...,0


In [8]:
# First five rows of the processed test frame.
df_test.head(n=5)

Unnamed: 0,id,keyword,text
15,46,ablaze,birmingham wholesale market is ablaze bbc news...
16,47,ablaze,@sunkxssedharry will you wear shorts for race ...
17,51,ablaze,#previouslyondoyintv toke makinwaûªs marriage ...
18,58,ablaze,check these out #nsfw
19,60,ablaze,psa iûªm splitting my personalities?? techies ...


---

In [12]:
# SAVE - write the processed frames to parquet.
# Off by default so a plain re-run never overwrites the files; set
# SAVE_PROCESSED to True for the run that should (re)write them.
SAVE_PROCESSED = False

if SAVE_PROCESSED:
    parquet_dir = os.path.join(root_dir, 'Data/Serialised_Data_Objects/Parq')
    # One pass writes both frames, instead of editing the cell once per frame.
    for df_to_save, file_name in (
        (df_train, 'c1_ini_process_train.parquet'),
        (df_test, 'c1_ini_process_test.parquet'),
    ):
        file_path = os.path.join(parquet_dir, file_name)
        df_to_save.to_parquet(file_path)

In [13]:
# Parquet locations from the project config; only the test file is reloaded
# and displayed here.
train_parq_dir = config.parquet_train
test_parq_dir = config.parquet_test

df = pd.read_parquet(path=test_parq_dir)
df

Unnamed: 0,id,keyword,text
15,46,ablaze,birmingham wholesale market is ablaze bbc news...
16,47,ablaze,@sunkxssedharry will you wear shorts for race ...
17,51,ablaze,#previouslyondoyintv toke makinwaûªs marriage ...
18,58,ablaze,check these out #nsfw
19,60,ablaze,psa iûªm splitting my personalities?? techies ...
...,...,...,...
3247,10806,wrecked,rt cnbc 3 words from disney ceo bob iger wreck...
3248,10807,wrecked,smackdown tyme this should put me in a good mo...
3249,10816,wrecked,@thrillhho jsyk i havent stopped thinking abt ...
3250,10820,wrecked,@stighefootball begovic has been garbage he go...


---