In [213]:
import pandas as pd
# Let wide frames render up to 50 columns before pandas starts eliding them.
pd.options.display.max_columns = 50

### Loading datasets

In [214]:
# Read the two raw inputs: the message texts and, per id, a single
# ';'-separated string of category labels.
messages, categories = (
    pd.read_csv(csv_path)
    for csv_path in ('data/messages.csv', 'data/categories.csv')
)

In [215]:
# Which genres occur in the raw messages?
messages['genre'].unique()

array(['direct', 'social', 'news'], dtype=object)

In [216]:
# (rows, columns) of the raw messages frame.
messages.shape

(26248, 4)

In [217]:
# (rows, columns) of the raw categories frame; compare the row count with `messages`.
categories.shape

(26248, 2)

In [220]:
# Preview the raw messages frame (the rendered output elides the middle rows).
messages

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct
...,...,...,...,...
26243,30261,The training demonstrated how to enhance micro...,,news
26244,30262,A suitable candidate has been selected and OCH...,,news
26245,30263,"Proshika, operating in Cox's Bazar municipalit...",,news
26246,30264,"Some 2,000 women protesting against the conduc...",,news


### Merging Datasets

In [221]:
# Left-join the category strings onto the messages by `id`.
# The result is bound to `df`, the name every later cell reads; previously it
# overwrote `messages` and `df` was never defined, so a fresh
# Restart & Run All stopped with a NameError at the next step.
# Leaving `messages` untouched also keeps this cell safe to re-run: merging an
# already-merged frame would pile up `categories_x` / `categories_y` columns.
df = pd.merge(messages, categories, how='left', on='id')

### Split `categories` into separate category columns

In [224]:
# One column per ';'-separated entry of the combined `categories` string.
categories = df['categories'].str.split(pat=';', expand=True)

In [225]:
# Derive the column names from the first row by cutting the final two
# characters off each entry (assumed to be a separator plus a one-digit flag,
# e.g. `related-1` — TODO confirm against the raw CSV).
categories_colnames = [str(entry)[:-2] for entry in categories.iloc[0]]

In [226]:
# Swap the positional column labels produced by the split for the category names.
categories = categories.rename(
    columns=dict(zip(categories.columns, categories_colnames))
)

In [227]:
# Reduce every cell to its trailing one-character flag, a whole column at a time.
# The `.str` accessor replaces the per-element Python lambda, and the explicit
# int64 cast makes a follow-up `pd.to_numeric` call unnecessary (the values are
# already integers at that point). A non-digit last character still raises
# ValueError, as before.
# NOTE(review): nothing here checks that the flag is strictly 0/1 — confirm
# against the raw data before treating these columns as binary labels.
for col in categories.columns:
    categories[col] = categories[col].astype(str).str[-1].astype('int64')

### Replace `categories` column in `df` with new category columns

In [228]:
# The combined string column is superseded by the per-category columns.
# This mutates `df` in place, so the cell cannot be run twice on the same frame.
df.drop(columns=['categories'], inplace=True)

In [229]:
# Put the message columns and the numeric category columns side by side.
# `categories` was derived from `df`, so both frames carry the same row index
# and the rows line up one-to-one.
df_clean = pd.concat(objs=[df, categories], axis='columns')

### Remove duplicates

- Drop exact duplicate rows, then check for duplicate ids

In [239]:
# Remove rows that are identical in every column, keeping the first occurrence
# (the default). The original index labels are retained.
df_clean = df_clean.drop_duplicates()

In [241]:
# Number of missing values per column.
df_clean.isna().sum()

id                            0
message                       0
original                  16046
genre                         0
related                       0
request                       0
offer                         0
aid_related                   0
medical_help                  0
medical_products              0
search_and_rescue             0
security                      0
military                      0
child_alone                   0
water                         0
food                          0
shelter                       0
clothing                      0
money                         0
missing_people                0
refugees                      0
death                         0
other_aid                     0
infrastructure_related        0
transport                     0
buildings                     0
electricity                   0
tools                         0
hospitals                     0
shops                         0
aid_centers                   0
other_in

In [253]:
# The same message id can appear more than once with different categories assigned.
# The resulting row combines those duplicates by joining the categories of all rows that share the id.

In [247]:
# Ids that occur more than once: every repeat after the first occurrence is
# flagged, then the flagged ids are reduced to their distinct values.
duplicated_ids = df_clean.loc[df_clean['id'].duplicated(), 'id'].unique()

In [265]:
# `original` is dropped: it carries the untranslated text, which adds nothing
# beyond what the `message` column already provides.
df_clean = df_clean.drop(columns='original')

In [268]:
# Collapse rows sharing (id, message, genre) into one, taking the column-wise
# maximum so that a category set on any of the duplicates stays set.
# `as_index=False` keeps the grouping keys as ordinary columns.
df_final = df_clean.groupby(['id', 'message', 'genre'], as_index=False).max()

In [269]:
# The final dataset contains only unique messages with unique ids.

In [270]:
# Display the combined result: one row per (id, message, genre) group.
df_final

Unnamed: 0,id,message,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,direct,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26175,30261,The training demonstrated how to enhance micro...,news,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26176,30262,A suitable candidate has been selected and OCH...,news,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26177,30263,"Proshika, operating in Cox's Bazar municipalit...",news,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26178,30264,"Some 2,000 women protesting against the conduc...",news,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
