# Data Pre-processing
## Import data

In [1]:
import pandas as pd

In [2]:
# Load the raw disaster records from the CSV file into a DataFrame.
# The path is relative, so the file must sit in the working directory.
original_dataset = pd.read_csv('1900_2021_DISASTERS.csv')

In [3]:
# Preview the first five rows of the raw data.
original_dataset.head(5)

Unnamed: 0,Year,Seq,Glide,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,...,No Affected,No Homeless,Total Affected,Insured Damages ('000 US$),Total Damages ('000 US$),CPI,Adm Level,Admin1 Code,Admin2 Code,Geo Locations
0,1900,9002,,Natural,Climatological,Drought,Drought,,,Cabo Verde,...,,,,,,3.221647,,,,
1,1900,9001,,Natural,Climatological,Drought,Drought,,,India,...,,,,,,3.221647,,,,
2,1902,12,,Natural,Geophysical,Earthquake,Ground movement,,,Guatemala,...,,,,,25000.0,3.350513,,,,
3,1902,3,,Natural,Geophysical,Volcanic activity,Ash fall,,Santa Maria,Guatemala,...,,,,,,3.350513,,,,
4,1902,10,,Natural,Geophysical,Volcanic activity,Ash fall,,Santa Maria,Guatemala,...,,,,,,3.350513,,,,


In [4]:
# Report how many rows the raw dataset contains.
print(f"{len(original_dataset)} entries")

16126 entries


In [5]:
# List every column name present in the raw dataset.
original_dataset.columns

Index(['Year', 'Seq', 'Glide', 'Disaster Group', 'Disaster Subgroup',
       'Disaster Type', 'Disaster Subtype', 'Disaster Subsubtype',
       'Event Name', 'Country', 'ISO', 'Region', 'Continent', 'Location',
       'Origin', 'Associated Dis', 'Associated Dis2', 'OFDA Response',
       'Appeal', 'Declaration', 'Aid Contribution', 'Dis Mag Value',
       'Dis Mag Scale', 'Latitude', 'Longitude', 'Local Time', 'River Basin',
       'Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month',
       'End Day', 'Total Deaths', 'No Injured', 'No Affected', 'No Homeless',
       'Total Affected', 'Insured Damages ('000 US$)',
       'Total Damages ('000 US$)', 'CPI', 'Adm Level', 'Admin1 Code',
       'Admin2 Code', 'Geo Locations'],
      dtype='object')

## Discard columns

In [6]:
# Keep only the columns listed below, then normalise their headers to
# snake_case: spaces become underscores and everything is lower-cased.
cols = ['Year', 'Disaster Subgroup', 'Disaster Type', 'Disaster Subtype', 'Country', 'Region', 'Continent']
dataset = original_dataset[cols].rename(
    columns=lambda name: name.replace(' ', '_').lower()
)
dataset.head(5)

Unnamed: 0,year,disaster_subgroup,disaster_type,disaster_subtype,country,region,continent
0,1900,Climatological,Drought,Drought,Cabo Verde,Western Africa,Africa
1,1900,Climatological,Drought,Drought,India,Southern Asia,Asia
2,1902,Geophysical,Earthquake,Ground movement,Guatemala,Central America,Americas
3,1902,Geophysical,Volcanic activity,Ash fall,Guatemala,Central America,Americas
4,1902,Geophysical,Volcanic activity,Ash fall,Guatemala,Central America,Americas


## Combining/manipulating regions
- Combine Melanesia, Polynesia, and Micronesia into one region ("Oceania Islands")
- Categorize "Russian Federation" as Western Asia

In [7]:
# Number of rows per region.
dataset['region'].value_counts()

Southern Asia                2068
South-Eastern Asia           1939
Eastern Asia                 1840
South America                1283
Northern America             1237
Eastern Africa               1159
Central America               823
Western Africa                806
Southern Europe               650
Caribbean                     628
Eastern Europe                547
Western Europe                528
Western Asia                  499
Middle Africa                 428
Northern Africa               345
Australia and New Zealand     326
Melanesia                     258
Northern Europe               212
Southern Africa               208
Central Asia                  144
Polynesia                      94
Russian Federation             60
Micronesia                     44
Name: region, dtype: int64

In [8]:
# Number of rows per continent.
dataset['continent'].value_counts()

Asia        6490
Americas    3971
Africa      2946
Europe      1997
Oceania      722
Name: continent, dtype: int64

In [9]:
# Relabel regions on a fresh copy so `dataset` itself is left untouched:
#   Melanesia / Polynesia / Micronesia -> 'Oceania Islands'
#   Russian Federation                 -> 'Western Asia'
oceania_islands = ['Melanesia', 'Polynesia', 'Micronesia']
region_relabels = {name: 'Oceania Islands' for name in oceania_islands}
region_relabels['Russian Federation'] = 'Western Asia'
d = dataset.assign(region=dataset['region'].replace(region_relabels))
d['region'].value_counts()

Southern Asia                2068
South-Eastern Asia           1939
Eastern Asia                 1840
South America                1283
Northern America             1237
Eastern Africa               1159
Central America               823
Western Africa                806
Southern Europe               650
Caribbean                     628
Western Asia                  559
Eastern Europe                547
Western Europe                528
Middle Africa                 428
Oceania Islands               396
Northern Africa               345
Australia and New Zealand     326
Northern Europe               212
Southern Africa               208
Central Asia                  144
Name: region, dtype: int64

## Categorizing decades
Put each year into a "decade" bin.

In [10]:
def map_decade(year, first_decade=1900, last_decade=2020):
    """Return the starting year of the decade bin that ``year`` falls into.

    Bins are ten years wide and labelled by their first year. Values are
    clamped to the outermost bins: anything below ``first_decade + 10`` is
    labelled ``first_decade`` and anything at or above ``last_decade`` is
    labelled ``last_decade``.

    Parameters
    ----------
    year : int or float
        Calendar year to classify.
    first_decade : int, default 1900
        Label of the earliest bin; also the fallback for earlier years.
    last_decade : int, default 2020
        Label of the latest bin; also used for every later year.
        Should differ from ``first_decade`` by a multiple of 10.

    Returns
    -------
    int
        The decade label, e.g. ``1990`` for ``1995``.
    """
    # Walk the bin starts from newest to oldest and return the first one the
    # year reaches; equivalent to an if/elif ladder with one branch per decade.
    for decade_start in range(last_decade, first_decade, -10):
        if year >= decade_start:
            return decade_start
    # Nothing matched: the year precedes the second bin (or every comparison
    # was False, as happens with NaN), so it goes in the earliest bin.
    return first_decade

# Attach the decade bin of every row as a new `decade` column, then show
# how many rows fall into each bin.
d = d.assign(decade=d['year'].map(map_decade))
d['decade'].value_counts()

2000    4476
2010    3768
1990    2975
1980    1801
1970     911
2020     713
1960     602
1950     310
1940     171
1930     134
1920     108
1900      79
1910      78
Name: decade, dtype: int64

## Dataset overview & export

In [11]:
# Preview the first five rows of the processed frame.
d.head(5)

Unnamed: 0,year,disaster_subgroup,disaster_type,disaster_subtype,country,region,continent,decade
0,1900,Climatological,Drought,Drought,Cabo Verde,Western Africa,Africa,1900
1,1900,Climatological,Drought,Drought,India,Southern Asia,Asia,1900
2,1902,Geophysical,Earthquake,Ground movement,Guatemala,Central America,Americas,1900
3,1902,Geophysical,Volcanic activity,Ash fall,Guatemala,Central America,Americas,1900
4,1902,Geophysical,Volcanic activity,Ash fall,Guatemala,Central America,Americas,1900


`disaster_subtype` is the only column with missing values:

In [12]:
# Count the rows whose disaster_subtype is missing.
missing_subtype_count = d['disaster_subtype'].isna().sum()
print(f"{missing_subtype_count} rows with empty disaster_subtype value")

3110 rows with empty disaster_subtype value


Stats for the three main fields we care about (region, disaster type, and decade):

In [13]:
# Region summary: number of distinct values, then rows per region.
region_counts = d['region'].value_counts()
print(f"{len(region_counts)} unique values for region:")
print()
print(region_counts)

20 unique values for region:

Southern Asia                2068
South-Eastern Asia           1939
Eastern Asia                 1840
South America                1283
Northern America             1237
Eastern Africa               1159
Central America               823
Western Africa                806
Southern Europe               650
Caribbean                     628
Western Asia                  559
Eastern Europe                547
Western Europe                528
Middle Africa                 428
Oceania Islands               396
Northern Africa               345
Australia and New Zealand     326
Northern Europe               212
Southern Africa               208
Central Asia                  144
Name: region, dtype: int64


In [14]:
# Disaster-type summary: number of distinct types, then rows per type.
disaster_type_counts = d['disaster_type'].value_counts()
print(f"{len(disaster_type_counts)} disaster types:")
print()
print(disaster_type_counts)

15 disaster types:

Flood                    5551
Storm                    4496
Earthquake               1544
Epidemic                 1501
Landslide                 776
Drought                   770
Extreme temperature       603
Wildfire                  471
Volcanic activity         265
Insect infestation         96
Mass movement (dry)        48
Glacial lake outburst       2
Animal accident             1
Impact                      1
Fog                         1
Name: disaster_type, dtype: int64


In [15]:
# Decade summary: number of bins, then rows per bin in chronological order.
decade_counts = d['decade'].value_counts()
print(f"{len(decade_counts)} decade bins:")
print()
print(decade_counts.sort_index())

13 decade bins:

1900      79
1910      78
1920     108
1930     134
1940     171
1950     310
1960     602
1970     911
1980    1801
1990    2975
2000    4476
2010    3768
2020     713
Name: decade, dtype: int64


Export to a new CSV file:

In [16]:
# Write the processed frame to disk; index=False leaves the row index out
# of the exported file.
d.to_csv('disasters_processed.csv', index = False)