In [1]:
import sys
import os

import pandas as pd

In [2]:
# Project root. The NLP_DISASTER_TWEETS_ROOT environment variable takes
# precedence when it is set to a non-empty value; otherwise the original
# author's local checkout path is used, so existing setups keep working.
root_dir = (
    os.environ.get('NLP_DISASTER_TWEETS_ROOT')
    or 'C:/Users/delst/OneDrive/Desktop/Code/Workspace/NLP_Disaster_Tweets'
)

# Make the project packages importable. The membership check keeps sys.path
# free of duplicate entries when this cell is re-run in the same kernel.
if root_dir not in sys.path:
    sys.path.append(root_dir)

from A_Root_Dir.Configurations.setup_env import setup_environment
config = setup_environment(root_dir)

# Class Imports
from Modularization.initial_process import InitialProcessing

# Shortcuts to settings exposed by the project config object.
fig_size = config.fig_size
raw_dtype = config.raw_dtype
true_dtype = config.true_dtype

---

In [3]:
# Resolve the raw CSV locations from the project config, then load both splits.
train_data_dir, test_data_dir = config.raw_data_train, config.raw_data_test

df_train = pd.read_csv(train_data_dir)
df_test = pd.read_csv(test_data_dir)

# Preview the first rows of each split (train first, then test).
display(df_train.head(), df_test.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


---

In [4]:
def standard_metadata(df, target_col='target'):
    """Summarise missing values, class balance and duplicate rows of a frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    target_col : str, default 'target'
        Name of the label column. When ``df`` has no such column (for
        example an unlabeled test split) the two target entries of the
        result are ``None`` instead of the call raising.

    Returns
    -------
    dict
        ``nan_count`` / ``nan_percent``: per-column Series of missing values.
        ``target_count`` / ``target_percent``: value counts of the label
        column as Series, or ``None`` when the column is absent.
        ``duplicate_count`` / ``duplicate_percent``: scalars for fully
        duplicated rows.
        Every percentage is relative to ``len(df)`` and rounded to two
        decimals.
    """
    n_rows = len(df)

    nan_count = df.isna().sum()
    nan_percent = (nan_count / n_rows * 100).round(2)

    # Bracket indexing (rather than attribute access) so any column name
    # works, including names that collide with DataFrame attributes.
    if target_col in df.columns:
        target_vcs = df[target_col].value_counts()
        target_percent = (target_vcs / n_rows * 100).round(2)
    else:
        target_vcs = None
        target_percent = None

    duplicate_count = df.duplicated().sum()
    duplicate_percent = round(duplicate_count / n_rows * 100, 2)

    return {
        'nan_count': nan_count,
        'nan_percent': nan_percent,
        'target_count': target_vcs,
        'target_percent': target_percent,
        'duplicate_count': duplicate_count,
        'duplicate_percent': duplicate_percent,
    }
# Profile the training split: per-column NaN counts, target value counts and
# duplicate rows, each with its percentage of the total row count.
metadata = standard_metadata(df_train)
# pprint.pprint(metadata)
# Bare expression so the notebook renders the resulting dict as cell output.
metadata

{'nan_count': id             0
 keyword       61
 location    2533
 text           0
 target         0
 dtype: int64,
 'nan_percent': id           0.00
 keyword      0.80
 location    33.27
 text         0.00
 target       0.00
 dtype: float64,
 'target_count': 0    4342
 1    3271
 Name: target, dtype: int64,
 'target_percent': 0    57.03
 1    42.97
 Name: target, dtype: float64,
 'duplicate_count': 0,
 'duplicate_percent': 0.0}

In [5]:
# Keep only the metadata entries whose key contains 'percent'.
metadata_percents = dict(
    entry for entry in metadata.items() if 'percent' in entry[0]
)
metadata_percents

{'nan_percent': id           0.00
 keyword      0.80
 location    33.27
 text         0.00
 target       0.00
 dtype: float64,
 'target_percent': 0    57.03
 1    42.97
 Name: target, dtype: float64,
 'duplicate_percent': 0.0}

In [6]:
# Remove the 'location' column from both splits. It is NaN in 33.27% of the
# training rows according to the metadata output above (presumably the reason
# it is discarded — confirm).
# errors='ignore' keeps the cell re-runnable: once the column is gone a second
# execution is a no-op instead of raising KeyError.
df_train = df_train.drop(columns='location', errors='ignore')
df_test = df_test.drop(columns='location', errors='ignore')

In [7]:
# Discard every row that contains at least one missing value, in both splits.
# NOTE(review): this also removes rows from df_test; confirm that whatever
# consumes the test split does not need a row for every original test id.
df_train = df_train.dropna(axis=0, how='any')
df_test = df_test.dropna(axis=0, how='any')

In [8]:
# Inspect the training frame after the 'location' drop and the dropna step.
df_train

Unnamed: 0,id,keyword,text,target
31,48,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,Crying out for more! Set me ablaze,0
35,53,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...
7578,10830,wrecked,@jt_ruff23 @cameronhacker and I wrecked you both,0
7579,10831,wrecked,Three days off from work and they've pretty mu...,0
7580,10832,wrecked,#FX #forex #trading Cramer: Iger's 3 words tha...,0
7581,10833,wrecked,@engineshed Great atmosphere at the British Li...,0


---

In [9]:
# Convert the tweet text of both splits to lowercase; the 'text' column of
# each frame is overwritten with the lowercased values.
df_train['text'] = df_train['text'].str.lower()
df_test['text'] = df_test['text'].str.lower()

---

In [10]:
# Spot-check the first training rows to confirm the text is now lowercase.
df_train.head()

Unnamed: 0,id,keyword,text,target
31,48,ablaze,@bbcmtd wholesale markets ablaze http://t.co/l...,1
32,49,ablaze,we always try to bring the heavy. #metal #rt h...,0
33,50,ablaze,#africanbaze: breaking news:nigeria flag set a...,1
34,52,ablaze,crying out for more! set me ablaze,0
35,53,ablaze,on plus side look at the sky last night it was...,0


In [11]:
# Inspect the test frame after the same column drop, dropna and lowercasing.
df_test

Unnamed: 0,id,keyword,text
15,46,ablaze,birmingham wholesale market is ablaze bbc news...
16,47,ablaze,@sunkxssedharry will you wear shorts for race ...
17,51,ablaze,#previouslyondoyintv: toke makinwaûªs marriag...
18,58,ablaze,check these out: http://t.co/roi2nsmejj http:/...
19,60,ablaze,psa: iûªm splitting my personalities.\n\n?? t...
...,...,...,...
3247,10806,wrecked,rt cnbc '3 words from disney ceo bob iger wrec...
3248,10807,wrecked,smackdown tyme this should put me in a good mo...
3249,10816,wrecked,@thrillhho jsyk i haven't stopped thinking abt...
3250,10820,wrecked,@stighefootball begovic has been garbage. he g...


---

In [12]:
# # SAVE - Comment out once saved
# NOTE(review): `save_to_parquet` is not imported in any cell visible above;
# confirm where it comes from before re-enabling this cell. The save directory
# below is a hard-coded absolute path on one machine, and only the test split
# is selected for saving here.
# df_to_save = df_test
# file_name = 'c1_init_process_test.parquet'
# save_dir = 'C:/Users/delst/OneDrive/Desktop/Code/Workspace/NLP_Disaster_Tweets/Data/Z_Output_Data'
# save_to_parquet(df_to_save, file_name, save_dir)

In [13]:
# Reload the training split from the parquet path stored in the project config.
# NOTE(review): presumably this file was written by the save cell above while
# it was enabled — confirm; a fresh kernel run needs the file to exist already.
train_parq_dir = config.parquet_train
df = pd.read_parquet(train_parq_dir)
df

Unnamed: 0,id,keyword,text,target
31,48,ablaze,@bbcmtd wholesale markets ablaze http://t.co/l...,1
32,49,ablaze,we always try to bring the heavy. #metal #rt h...,0
33,50,ablaze,#africanbaze: breaking news:nigeria flag set a...,1
34,52,ablaze,crying out for more! set me ablaze,0
35,53,ablaze,on plus side look at the sky last night it was...,0
...,...,...,...,...
7578,10830,wrecked,@jt_ruff23 @cameronhacker and i wrecked you both,0
7579,10831,wrecked,three days off from work and they've pretty mu...,0
7580,10832,wrecked,#fx #forex #trading cramer: iger's 3 words tha...,0
7581,10833,wrecked,@engineshed great atmosphere at the british li...,0


---