In [227]:
import os
import pandas as pd
import chardet
import re

In [228]:
# Integer class id for each disaster type; the id is the position of the
# name in the tuple below, so "no_disaster" is 0 and "wildfire" is 5.
labels = {
    name: class_id
    for class_id, name in enumerate(
        ("no_disaster", "earthquake", "flood", "hurricane", "tornado", "wildfire")
    )
}

In [229]:
dfs = []

# Sanity-check the encoding of one sample file. The detected value is only
# printed; the reads below hard-code UTF-8.
with open("data/wildfire-greece_wildfires_2018_train.tsv", "rb") as f:
    result = chardet.detect(f.read())

# Print the detected encoding
print(result)

# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the row order of the combined frame reproducible across runs/machines.
for file_name in sorted(f for f in os.listdir("data/") if f.endswith(".tsv")):
    file_path = os.path.join("data/", file_name)
    # The disaster type is the part of the file name before the first "-";
    # an unknown prefix raises KeyError instead of being silently mislabelled.
    label = labels[file_name.split("-")[0]]
    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
    # Every row gets the file-level disaster id, replacing any values the
    # file's own class_label column held.
    df["class_label"] = label
    dfs.append(df)

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}


In [230]:
# Stack the per-file frames under a fresh 0..n-1 index, discard the tweet
# identifier, and switch to the shorter column names used from here on.
combined_df = (
    pd.concat(dfs, ignore_index=True)
    .drop(columns=["tweet_id"])
    .rename(columns={"class_label": "label", "tweet_text": "text"})
)

In [231]:
# Preview the first rows of the combined frame.
combined_df.head()

Unnamed: 0,text,label
0,Powerful Ecuador quake kills at least 235: POR...,1
1,Im at awe and saddened with the #EcuadorEarthq...,1
2,RT @RachelAndJun: Our hearts are with everyone...,1
3,RT @noticias2000: Ecuador quake death toll has...,1
4,RT @pzf: BREAKING PHOTOS: Major damage reporte...,1


In [232]:

# Preview the last rows of the combined frame.
combined_df.tail()

Unnamed: 0,text,label
27623,"RT @UPI: Greece wildfires: At least 60 dead, h...",5
27624,"#Greek govt announces instant payment of 5,000...",5
27625,"Nine Lives Greece volunteer, Alexis, interview...",5
27626,RT @keeptalkingGR: Donations for injured pets ...,5
27627,Mr @netanyahu Greece is in desperate need of h...,5


In [233]:

# Summary statistics for the numeric columns (only `label` is numeric here).
# NOTE(review): the recorded output has min == 1, i.e. no row carries label 0
# ("no_disaster") — confirm that this is expected for the input files.
combined_df.describe()

Unnamed: 0,label
count,27628.0
mean,2.804076
std,1.040089
min,1.0
25%,3.0
50%,3.0
75%,3.0
max,5.0


In [234]:
# Count missing values per column.
combined_df.isnull().sum()

text     0
label    0
dtype: int64

In [235]:
# Count distinct values per column.
# NOTE(review): the recorded output shows 4 distinct labels although `labels`
# defines 6 classes — verify that the missing classes are intentional.
combined_df.nunique()

text     27628
label        4
dtype: int64

In [236]:
# `df` is a second name for the same object as `combined_df`, not a copy.
df = combined_df
# Plain-text dump of the frame; pandas elides the middle rows of long frames.
print(df)

                                                    text  label
0      Powerful Ecuador quake kills at least 235: POR...      1
1      Im at awe and saddened with the #EcuadorEarthq...      1
2      RT @RachelAndJun: Our hearts are with everyone...      1
3      RT @noticias2000: Ecuador quake death toll has...      1
4      RT @pzf: BREAKING PHOTOS: Major damage reporte...      1
...                                                  ...    ...
27623  RT @UPI: Greece wildfires: At least 60 dead, h...      5
27624  #Greek govt announces instant payment of 5,000...      5
27625  Nine Lives Greece volunteer, Alexis, interview...      5
27626  RT @keeptalkingGR: Donations for injured pets ...      5
27627  Mr @netanyahu Greece is in desperate need of h...      5

[27628 rows x 2 columns]


In [237]:
# Collect every distinct angle-bracketed token (anything matching <\S+>) that
# occurs in the text column. explode() flattens the per-row match lists in a
# single pass (rows without a match become NaN and are dropped), avoiding the
# repeated list concatenation of Series.sum() and its failure on an empty
# Series, where sum() yields 0 and set(0) raises TypeError.
tags = set(df["text"].str.findall(r"<\S+>").explode().dropna())
tags

set()

In [238]:
# Define the emoji removal function
import pandas as pd
import re

# Function to remove both emojis and non-standard ASCII characters
# The patterns are compiled once here instead of on every clean_text() call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # other symbols
    "\U000024C2-\U0001F251"  # enclosed characters
    "\U0001F926-\U0001F937"  # supplemental symbols
    "\U0001F910-\U0001F93E"  # more emoticons
    "\U0001F920-\U0001F927"  # even more emoticons
    "\U0001F930-\U0001F939"  # family emojis
    "\U0001F9C0-\U0001F9C2"  # additional symbols
    "\u200d"                 # zero-width joiner
    "\u2640-\u2642"          # gender symbols
    "\u2600-\u2B55"          # miscellaneous symbols
    "\u23cf"                 # eject button
    "\u23e9"                 # fast-forward
    "\u231a"                 # watch
    "\u3030"                 # wavy dash
    "\ufe0f"                 # variation selector
    "\u2069"                 # additional control character
    "\u20E3"                 # combining enclosing keycap
    "]+",
    flags=re.UNICODE,
)
_NON_ASCII_PATTERN = re.compile(r'[^\x00-\x7F]+')
# The hyphen is escaped so it is a literal '-'. Left unescaped, `+-/` is the
# character range 0x2B-0x2F, which also strips ',' and '.' from the text.
_SPECIAL_CHARS_PATTERN = re.compile(r'[@#&$%+\-/*\\]')


def clean_text(text):
    """Strip emojis, non-ASCII characters and selected symbols from a string.

    Three substitutions run in order: characters in the emoji ranges, then
    every remaining character outside 7-bit ASCII, then each of the symbols
    ``@ # & $ % + - / * \\``. Matches are deleted, not replaced by a space,
    so the text on either side of a removed character is joined together.
    All other ASCII characters, including ',' and '.', are kept.

    Args:
        text: The string to clean. Callers must pass a ``str``.

    Returns:
        The cleaned string; an empty input yields an empty string.
    """
    text = _EMOJI_PATTERN.sub('', text)
    # Every emoji is itself non-ASCII, so this pass also catches any emoji
    # that the explicit ranges above do not list.
    text = _NON_ASCII_PATTERN.sub('', text)
    return _SPECIAL_CHARS_PATTERN.sub('', text)

# Clean the text column only. Mapping over the whole frame would also turn
# the integer `label` column into strings, and DataFrame.map needs pandas
# >= 2.1, whereas Series.map is available in every version.
df = df.assign(text=df["text"].map(lambda x: clean_text(str(x))))


In [239]:
# Persist the cleaned frame without the index. The file is tab-separated and
# UTF-16 encoded despite its .csv extension, so readers must pass the same
# sep/encoding. The .csv suffix also keeps it out of the "*.tsv" input scan.
df.to_csv("data/crisisnlp-preprocessed_data.csv", sep="\t", encoding="utf-16", index=False)