# Guide

In [None]:
## IMPORTS
import tensorflow as tf
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

## LOADING
# Load vegetation data
# Load rain data
# Load altitude data
# Load temperature data
# X Load landslides data

## DATA PREPROCESSING
# create slope data from altitude
# scale data in a 0 to 1 range

# X clear landslides data from useless data
# X filter values outside the specified region (50N - 40N, 115W - 125W)
# X filter non-water related landslides
# obtain data from other datasets to complete the positive values of the set

## NEGATIVE DATA CREATION
# Generate random location
# Check if the location is available (h>0)
# Generate random date/hour
# Check if there wasn't a landslide

## CREATE TRAINING, VALIDATION AND TEST SETS
# stratified separation for validation and test sets
# oversampling landslides events (x3?) to account for imbalance
# shuffle training and validation

## DEFINE MODEL
# weighted loss function
# dense deep neural network (architecture?)
# binary classification (which function? sigmoid?)

## TEST PREDICTIONS


# Preparation of the dataset

## Landslides data

In [41]:
import pandas as pd
import numpy as np

# Landslide event catalog from COOLR, read from its CSV export
landslide_data = pd.read_csv(
  filepath_or_buffer='/content/nasa_global_landslide_catalog_point.csv',
)

In [4]:
# Overview of the dataset: column labels first
print(landslide_data.columns)
# Number of catalogued events for each trigger
landslide_data.landslide_trigger.value_counts()

Index(['OBJECTID', 'Shape', 'source_name', 'source_link', 'event_id',
       'event_date', 'event_time', 'event_title', 'event_description',
       'location_description', 'location_accuracy', 'landslide_category',
       'landslide_trigger', 'landslide_size', 'landslide_setting',
       'fatality_count', 'injury_count', 'storm_name', 'photo_link',
       'comments', 'event_import_source', 'event_import_id', 'latitude',
       'longitude', 'country_name', 'country_code', 'admin_division_name',
       'gazetteer_closest_point', 'gazetteer_distance', 'submitted_date',
       'last_edited_date'],
      dtype='object')


downpour                   4794
unknown                    2842
rain                       2716
continuous_rain             780
tropical_cyclone            563
snowfall_snowmelt           147
monsoon                     133
mining                      103
construction                 95
earthquake                   95
flooding                     81
no_apparent_trigger          65
freeze_thaw                  44
other                        36
dam_embankment_collapse      15
leaking_pipe                 13
volcano                       2
vibration                     2
Name: landslide_trigger, dtype: int64

In [43]:
# Restricts the area to 50N125W - 40N115W (bounds excluded), keeping only the
# US events that have a date.
us_landslides = landslide_data.loc[
  landslide_data['country_name'] == 'United States'
].dropna(subset=['event_date'])

in_region = (
  (us_landslides['latitude'] > 40)
  & (us_landslides['latitude'] < 50)
  & (us_landslides['longitude'] > -125)
  & (us_landslides['longitude'] < -115)
)
# Explicit copy so the filtering below works on an independent frame.
region_landslides = us_landslides.loc[in_region].copy()

# Clears the data from landslide triggers independent from the features used.
# Rows whose trigger is missing (NaN) match none of these and are kept.
unusable_triggers = ['unknown', 'mining', 'monsoon', 'construction', 'earthquake', 'no_apparent_trigger', 'other', 'dam_embankment_collapse', 'leaking_pipe', 'volcano', 'vibration']
region_landslides = region_landslides.loc[
  ~region_landslides['landslide_trigger'].isin(unusable_triggers)
]

# Keeps only the events after the threshold year (the threshold year itself is
# dropped as well) for continuity with the other data.
threshold = 2005

# event_date looks like 'YYYY-MM-DD hh:mm:ss': the year is the first '-' field
# of the first whitespace-separated token.
event_years = (
  region_landslides['event_date']
  .str.split().str[0]
  .str.split('-').str[0]
  .astype(int)
)
too_old = event_years <= threshold

# Rows are selected with a mask rather than dropped one by one while iterating
# over the frame that is being modified.
for idx, year in event_years[too_old].items():
  print('Dropping event due to year, id:', idx, '- year:', year)
region_landslides = region_landslides.loc[~too_old]

print('\n\n')

region_landslides.value_counts(subset='landslide_trigger')

Dropping event due to year, id: 522 - year: 1997
Dropping event due to year, id: 720 - year: 1997
Dropping event due to year, id: 1059 - year: 1997
Dropping event due to year, id: 1482 - year: 1998
Dropping event due to year, id: 2028 - year: 1996
Dropping event due to year, id: 2213 - year: 1998
Dropping event due to year, id: 2534 - year: 1997
Dropping event due to year, id: 2576 - year: 1997
Dropping event due to year, id: 2579 - year: 1998
Dropping event due to year, id: 3411 - year: 1997
Dropping event due to year, id: 4575 - year: 1996
Dropping event due to year, id: 4960 - year: 1998
Dropping event due to year, id: 5000 - year: 1998
Dropping event due to year, id: 5947 - year: 1997
Dropping event due to year, id: 6197 - year: 1998
Dropping event due to year, id: 6723 - year: 1998
Dropping event due to year, id: 7883 - year: 1998
Dropping event due to year, id: 8136 - year: 1998
Dropping event due to year, id: 8164 - year: 1997
Dropping event due to year, id: 8628 - year: 1998
Dr

landslide_trigger
rain                 319
downpour             302
continuous_rain       37
snowfall_snowmelt     26
flooding              12
freeze_thaw           10
dtype: int64

In [44]:
# Builds a reduced dataframe holding only the event date and the coordinates
event_dates, latitudes, longitudes = (
  region_landslides[column]
  for column in ('event_date', 'latitude', 'longitude')
)
region_landslides_cleaned = pd.DataFrame(
  {
    'dates': event_dates,
    'lat': latitudes,
    'long': longitudes,
  }
)
region_landslides_cleaned.head()

Unnamed: 0,dates,lat,long
1,2009-01-02 02:00:00,45.42,-122.663
6,2012-03-30 00:00:00,48.2797,-117.2665
11,2009-01-01 22:24:00,45.377,-122.0704
13,2009-01-03 00:00:00,45.521,-122.67
16,2009-01-07 00:00:00,47.432,-122.334


In [46]:
# Save the dataframe as csv
# (disabled: the call is wrapped in a string literal, so no file is written)
'''
region_landslides_cleaned.to_csv('landslides.csv')
'''

In [None]:
# Data initialization for search of best region of interest
# (disabled: the whole cell body is a string literal, so nothing below runs).
# It sets a 20..50 latitude / -125..-65 longitude box and a zero-filled count
# grid with one cell per 5x5 degree tile, i.e. 6 rows by 12 columns.
'''
min_lat = 20
max_lat = 50
min_long = -125
max_long = -65

count_region = np.zeros([(max_lat - min_lat)//5, (max_long - min_long)//5])
print(count_region)'''

In [None]:
# Algorithm to detect the distribution in the area of the US of landslides and to define temporal range
# (disabled: the whole cell body is a string literal, so nothing below runs).
# Each matching event increments its 5-degree tile of count_region; for tiles
# with 4 <= x <= 5 and 0 <= y <= 1 the loop also tracks the earliest date seen.
# NOTE(review): min_lat/max_lat/min_long/max_long and count_region are not
# defined in this cell; they appear in the disabled initialization cell.
# NOTE(review): when event_date is not a string the AttributeError is swallowed
# and `date` keeps its previous value (or is unbound on the first such event).
# NOTE(review): latitude == max_lat or longitude == max_long passes the range
# check but looks like it indexes one past the last tile - confirm before
# enabling.
'''
earliest_year = 2021
earliest_month = 13
earliest_day = 32
earliest_idx = 0

for idx, event in region_landslides.iterrows():
  if event.country_name == 'United States' and min_lat <= event.latitude <= max_lat and min_long <= event.longitude <= max_long:
    x = int((event.latitude - min_lat)//5)
    y = int((event.longitude - min_long)//5)
    count_region[x][y] += 1
    if 4 <= x <= 5 and 0 <= y <= 1:
      try:
        date = event.event_date.split()[0]
      except AttributeError:
        pass
      year = int(date.split('-')[0])
      if year <= earliest_year:
        month = int(date.split('-')[1])
        if year < earliest_year or (year == earliest_year and month <= earliest_month):
          day = int(date.split('-')[2])
          if month < earliest_month or (month == earliest_month and day < earliest_day):
            earliest_day = day
            earliest_idx = idx
          earliest_month = month
        earliest_year = year

print(f'{earliest_year}-{earliest_month}-{earliest_day}')
'''

## Slope data

In [None]:
# Slope dataset, read from a CSV file in the working directory
slope_data = pd.read_csv(filepath_or_buffer='slope.csv')
'''
This dataset represents a matrix holding the maximum slope of each point in the
selected region. As of now it is one of the biggest files, and therefore one of
the hardest to handle, but luckily it is also the one that requires the least
maintenance, since these values vary very little over short periods of time.

source: https://www.eorc.jaxa.jp/ALOS/en/aw3d30/index.htm (this is only an
elevation map, the data is then processed to obtain the gradient and derive the
steepest slope)
'''

## Vegetation data

In [None]:
# Vegetation index dataset, read from a CSV file in the working directory
vegetation_data = pd.read_csv(filepath_or_buffer='vegetation.csv')
'''
This dataset comes from the Nasa Earth Observations, which uses the MODIS
satellite to determine an index describing the vegetation distribution on a
global scale.
This file requires updating every 16 days/1 month due to the high activity of
wildfires and human deforestation.

source: https://neo.sci.gsfc.nasa.gov/archive/csv/MOD_NDVI_16/
'''

## Rain data

In [None]:
# Rain dataset, read from a CSV file in the working directory
rain_data = pd.read_csv(filepath_or_buffer='rain.csv')
'''
This dataset is the biggest one but, unlike the terrain slope data, it can be
downloaded through an API, and therefore requires only some data preprocessing.
It holds the hourly precipitation at a specific point over the span of a month,
and it can be used for both the instantaneous rainfall and the cumulative
rainfall, two important factors in landslides.

source: https://sharaku.eorc.jaxa.jp/GSMaP/index.htm
'''

## Terrain temperature data

In [None]:
# Terrain temperature dataset, read from a CSV file in the working directory
terrain_temperature_data = pd.read_csv(
  filepath_or_buffer='terrain_temperature.csv',
)
'''
Terrain temperature dataset, also obtained from the MODIS satellite through the
NASA Earth Observations. It can also be obtained through an API, making it not
as problematic as the terrain slope.
This dataset holds the temperature of the terrain, giving us a good insight into
the type of terrain and how it is affected by rainfall, an important parameter
for landslides caused by precipitation.

source: https://neo.sci.gsfc.nasa.gov/archive/csv/MOD_LSTD_D/
'''

# Dataset creation and preprocessing

## Combine data into positive dataset

In [None]:
''' INSTRUCTIONS:
create dataset with data {rain_daily, rain_total, vegetation, temperature, slope, landslide(output)}

for each element of the landslide dataset:
  get latitude, longitude and date
  get inputs from the data already read:
    for vegetation, check if the date is between two sets
    for rain, same, but the time can be probably hourly or interpolated
    for temperature just check the day
    for slope, use the latitude and longitude only
  add the inputs to the dataset, and consider landslide = 1
'''

'''
Here the dataset of positive values is composed by using the data obtained from
the landslides repository after cleaning, and obtaining the specific features at
the spot of the event.
We will consider this as the positive output (1) of our binary classifier.
'''

## Create negative dataset

In [None]:
''' INSTRUCTIONS:
for 5000 times:
  random temporal value between 2006 and 2020
  random latitude between 40 and 50
  random longitude between -125 and -115

  new empty dataset with the same indexes as before

  check if the altitude is >0, maybe in the slope we could use -1 to make it
  readable directly from it

  check if the location had a landslide (passing through all the data for the 
  landslides might be a little slow, but I haven't come up with another idea yet)

  if not, get the inputs from the data
  append the inputs to the dataset, with landslide=0

'''

'''
For the negative class we have to create a dataset using the data we collected,
therefore we decided to generate a random set of coordinates (both spatial and
temporal) and use those to define the inputs of a new entry.
Before actually accepting this entry, however, we check whether it coincides
with one of the landslide events, or whether the chosen location is in a body of
water (marked in the slope data with a particular tag, for example -1).
'''

# Combine the two datasets

In [None]:
''' INSTRUCTIONS:
from the positive dataset take out 50 random elements
from the negative dataset take out 50 random elements

combine these into the test dataset

divide again the two datasets into validation and training (5% and 95%) and
combine the positive and negative parts

copy all the elements that have landslide=1 and append them twice to the dataset
(oversampling, the training will contain 3 copies for each landslide event)
(This with both validation and training sets)

shuffle validation and training set
'''

'''
As we have a really skewed dataset, the process of combining the positive (from
data) and negative (randomly generated) datasets is more complex.
We first remove a few entries from both sets to use as a test dataset and set
those apart until scaling.
The remaining data is then divided into validation and training (the validation
set is relatively small compared to the training set, so as not to further
reduce the amount of positive examples), and in each of the sets the positive
part is oversampled, meaning that 2 more copies of it are added to the dataset,
increasing the ratio between positive and negative examples.
Splitting before oversampling ensures that no data leaks between the different
sets, as such leakage would invalidate the performance testing later.
'''

## Scaling the dataset

In [None]:
''' INSTRUCTIONS:
for each column:
  max(max(training) - min(training), max(validation) - min(validation), max(test) - min(test))
  min(min(training), min(validation), min(test))
  subtract the minimum value and divide all the elements of that column for the max

  OR

  standardization (sklearn.preprocessing.StandardScaler)
'''

'''
To keep the same scale throughout the datasets, we decided to standardize all
the values (subtracting the mean and dividing by the standard deviation) with
scaling parameters computed across the three sets.
This ensures that the model receives inputs on the same scale for every set.
'''

# ML model

In [50]:
# Imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
# Precision and recall are the metrics the evaluation notes in this notebook
# rely on (recall score, precision/recall curve).
from tensorflow.keras.metrics import Precision, Recall

In [None]:
''' INSTRUCTIONS:
define structure using the sequential model
a few dense layers, 'relu' as a function should be good.
'''
# Small fully connected binary classifier: 5 input features, three hidden
# relu layers and a single sigmoid unit producing the landslide probability.
model = Sequential([
  # The input layer call is closed right after input_shape, so every
  # following Dense is its own element of the layer list.
  Dense(units=16, activation='relu', input_shape=(5,)),
  Dense(units=32, activation='relu'),
  Dense(units=16, activation='relu'),
  Dense(units=1, activation='sigmoid'),
])
                    
'''
This is an initial idea for the architecture of the model, which isn't highly
complex, but could suffice given the number of features used as inputs.
To define a better structure we would have to analyse the recall score, as we
are interested in obtaining the best results at detecting when landslides are
probable, so as to inform authorities that could then examine the location
through a more in-depth analysis.
Given the type of inputs, a relu activation function seems like a good choice,
as it can work similarly to a threshold, which seems to describe well the
relation between our features and the event.
'''

In [None]:
''' INSTRUCTIONS:
show the results on a precision/recall curve

test the model on the test data
'''

'''
In this section we would mostly just look at the results (recall, F1 score etc.)
so as to determine the worth of the model.
Analysing the model on data that it has never seen (test data) would bring us
close to a real-world test, showing us the performance of our model on a
problem at a regional level.
'''

# Further Developments

As we approach the end of the challenge, it is clear that the amount of time required to clean and process the data, to build the infrastructure to obtain data through APIs and to train and test the model is far more than the time available to us, even more so considering our inexperience with data science.

## Additional work on the model
We decided to focus our energies more on verifying that the project would be feasible than on actually programming. Therefore a lot of improvements could be made, first of all on the model itself, but even more on the chosen features, as more complex features, such as soil moisture and type of terrain, could grant us a much greater insight into the reasons behind water-borne landslides.

## Availability to the public
As of now we have decided to present the results of the prediction through an online map, as internet access is becoming more readily available to the world population day by day, allowing us to reach rural communities that wouldn't be able to get such vital information otherwise.

With time this project could land as a mobile application or possibly as an analytical system for government agencies, allowing for faster intervention and prevention.

## Data gathering
Currently the number of catalogued landslides is not large, so a further extension of this project could give users the ability to report a landslide happening in the nearby area, granting faster information sharing while also improving the data available to the model.