## Imports

In [89]:
import pandas as pd

## Config

In [90]:
data_dir = "./data"
train_path = f"{data_dir}/train.csv"
test_path = f"{data_dir}/test.csv"

## Load train dataset

In [91]:
train_df = pd.read_csv(train_path)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Dataset statistics

In [92]:
train_df.shape

(7613, 5)

In [93]:
train_df.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [94]:
col_id = 'id'
col_keyword = 'keyword'
col_location = 'location'
col_text = 'text'
col_target = 'target'

In [95]:
train_df.nunique()

id          7613
keyword      221
location    3341
text        7503
target         2
dtype: int64

In [96]:
train_df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [97]:
train_df.describe(include="all")

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


### Sample keyword column

In [98]:
train_df[col_keyword].sample(n=10)

4570             injuries
4497            hurricane
3904            flattened
1711             collided
5361                panic
4339               hijack
899                bloody
525             avalanche
1498         catastrophic
6455    suicide%20bombing
Name: keyword, dtype: object

### Sample location column

In [99]:
train_df[col_keyword].sample(n=10)

4619         injury
3151      emergency
4798    loud%20bang
1937         curfew
6786        tragedy
3040     earthquake
6303      stretcher
2608      destroyed
5995        screams
4284       hellfire
Name: keyword, dtype: object

## Data cleaning

In [111]:
def text_preproc(df: pd.DataFrame):
    temp_df = df.str.strip()
    temp_df = temp_df.str.lower()
    temp_df = temp_df.str.replace("\n", " ")
    temp_df = temp_df.str.replace('%20', ' ')
    temp_df = temp_df.str.replace(',', ' ')
    temp_df = temp_df.str.replace('-', ' ')
    temp_df = temp_df.str.replace('\s+', ' ', regex=True)
    return temp_df

train_df[col_keyword] = text_col_cleaning(train_df[col_keyword])
train_df[col_location] = text_col_cleaning(train_df[col_location])
train_df[col_text] = text_col_cleaning(train_df[col_text])

In [112]:
train_df[col_keyword].sample(n=10)

6395         suicide bomb
1522         catastrophic
1916               curfew
6418       suicide bomber
2456           derailment
1256    buildings on fire
5452               police
4745            lightning
5008             military
5819               rubble
Name: keyword, dtype: object

In [113]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this #earthquake m...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to 'shelter in place' are ...,1
3,6,,,13000 people receive #wildfires evacuation ord...,1
4,7,,,just got sent this photo from ruby #alaska as ...,1


In [114]:
train_df.nunique()

id          7613
keyword      221
location    3083
text        7500
target         2
dtype: int64

In [None]:
print("Decreased number of unique values in location column for about ~ 200")