# Guide

In [None]:
## IMPORTS
import tensorflow as tf
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

## LOADING
# Load vegetation data
# Load rain data
# Load altitude data
# Load temperature data
# X Load landslides data

## DATA PREPROCESSING
# create slope data from altitude
# scale data in a 0 to 1 range

# X clear landslides data from useless data
# X filter values outside the specified region (50N - 40N, 115W - 125W)
# X filter non-water related landslides
# obtain data from other datasets to complete the positive values of the set

## NEGATIVE DATA CREATION
# Generate random location
# Check if the location is available (h>0)
# Generate random date/hour
# Check if there wasn't a landslide

## CREATE TRAINING, VALIDATION AND TEST SETS
# stratified separation for validation and test sets
# oversampling landslides events (x3?) to account for imbalance
# shuffle training and validation

## DEFINE MODEL
# weighted loss function
# dense deep neural network (architecture?)
# binary classification (which function? sigmoid?)

## TEST PREDICTIONS


# Preparation of the dataset

## Landslides data

In [41]:
import pandas as pd
import numpy as np

# Load the landslide catalog CSV (absolute path under /content) into a DataFrame.
landslide_data = pd.read_csv(filepath_or_buffer='/content/nasa_global_landslide_catalog_point.csv')

In [4]:
# Overview of the dataset: print the column labels
print(landslide_data.columns)
# Number of events recorded for each trigger type (shown as the cell result)
landslide_data.landslide_trigger.value_counts()

Index(['OBJECTID', 'Shape', 'source_name', 'source_link', 'event_id',
       'event_date', 'event_time', 'event_title', 'event_description',
       'location_description', 'location_accuracy', 'landslide_category',
       'landslide_trigger', 'landslide_size', 'landslide_setting',
       'fatality_count', 'injury_count', 'storm_name', 'photo_link',
       'comments', 'event_import_source', 'event_import_id', 'latitude',
       'longitude', 'country_name', 'country_code', 'admin_division_name',
       'gazetteer_closest_point', 'gazetteer_distance', 'submitted_date',
       'last_edited_date'],
      dtype='object')


downpour                   4794
unknown                    2842
rain                       2716
continuous_rain             780
tropical_cyclone            563
snowfall_snowmelt           147
monsoon                     133
mining                      103
construction                 95
earthquake                   95
flooding                     81
no_apparent_trigger          65
freeze_thaw                  44
other                        36
dam_embankment_collapse      15
leaking_pipe                 13
volcano                       2
vibration                     2
Name: landslide_trigger, dtype: int64

In [43]:
# Restricts the data to US events with a known date that lie strictly inside
# the area 50N125W - 40N115W
us_landslides = landslide_data.loc[landslide_data['country_name'] == 'United States'].dropna(subset=['event_date'])
in_region = (
  (us_landslides['latitude'] > 40)
  & (us_landslides['latitude'] < 50)
  & (us_landslides['longitude'] > -125)
  & (us_landslides['longitude'] < -115)
)

# Clears the data from landslide triggers independent of the features used.
# Events whose trigger is missing (NaN) are kept, because NaN never matches
# an entry of the list.
unusable_triggers = ['unknown', 'mining', 'monsoon', 'construction', 'earthquake', 'no_apparent_trigger', 'other', 'dam_embankment_collapse', 'leaking_pipe', 'volcano', 'vibration']
has_usable_trigger = ~us_landslides['landslide_trigger'].isin(unusable_triggers)

region_landslides = us_landslides.loc[in_region & has_usable_trigger]

# Removes the events dated in the threshold year or earlier (the comparison is
# inclusive, so 2005 itself is dropped) for continuity with the other data
threshold = 2005

# event_date looks like 'YYYY-MM-DD hh:mm:ss'; the year is the first
# '-'-separated field of the first whitespace-separated token.
event_years = region_landslides['event_date'].str.split().str[0].str.split('-').str[0].astype(int)
too_old = event_years <= threshold

# Report every dropped event, then filter with a single mask instead of
# dropping rows from the frame while iterating over it.
for idx, year in event_years[too_old].items():
  print('Dropping event due to year, id:', idx, '- year:', year)

# .copy() makes the result an independent frame rather than a slice of
# us_landslides, so later modifications cannot trigger SettingWithCopy issues.
region_landslides = region_landslides.loc[~too_old].copy()

print('\n\n')

region_landslides.value_counts(subset='landslide_trigger')

Dropping event due to year, id: 522 - year: 1997
Dropping event due to year, id: 720 - year: 1997
Dropping event due to year, id: 1059 - year: 1997
Dropping event due to year, id: 1482 - year: 1998
Dropping event due to year, id: 2028 - year: 1996
Dropping event due to year, id: 2213 - year: 1998
Dropping event due to year, id: 2534 - year: 1997
Dropping event due to year, id: 2576 - year: 1997
Dropping event due to year, id: 2579 - year: 1998
Dropping event due to year, id: 3411 - year: 1997
Dropping event due to year, id: 4575 - year: 1996
Dropping event due to year, id: 4960 - year: 1998
Dropping event due to year, id: 5000 - year: 1998
Dropping event due to year, id: 5947 - year: 1997
Dropping event due to year, id: 6197 - year: 1998
Dropping event due to year, id: 6723 - year: 1998
Dropping event due to year, id: 7883 - year: 1998
Dropping event due to year, id: 8136 - year: 1998
Dropping event due to year, id: 8164 - year: 1997
Dropping event due to year, id: 8628 - year: 1998
Dr

landslide_trigger
rain                 319
downpour             302
continuous_rain       37
snowfall_snowmelt     26
flooding              12
freeze_thaw           10
dtype: int64

In [44]:
# Creates a new dataframe with only the features useful for the model
# Keep only the columns the model needs, under shorter names
# (original column -> new column); the row index is preserved.
column_names = {'event_date': 'dates', 'latitude': 'lat', 'longitude': 'long'}
region_landslides_cleaned = region_landslides[list(column_names)].rename(columns=column_names)
region_landslides_cleaned.head()

Unnamed: 0,dates,lat,long
1,2009-01-02 02:00:00,45.42,-122.663
6,2012-03-30 00:00:00,48.2797,-117.2665
11,2009-01-01 22:24:00,45.377,-122.0704
13,2009-01-03 00:00:00,45.521,-122.67
16,2009-01-07 00:00:00,47.432,-122.334


In [46]:
# Save the dataframe as csv
# (currently disabled: the call is wrapped in a string literal, so nothing is
# written; remove the surrounding quotes to export region_landslides_cleaned
# to 'landslides.csv')
'''
region_landslides_cleaned.to_csv('landslides.csv')
'''

In [None]:
# Data initialization for search of best region of interest
# (currently disabled: the code is wrapped in a string literal and does not run)
# When enabled, the bounds describe a box from 20 to 50 in latitude and from
# -125 to -65 in longitude, and count_region holds one counter per 5x5-degree
# cell of that box (6 rows x 12 columns), all starting at zero.
'''
min_lat = 20
max_lat = 50
min_long = -125
max_long = -65

count_region = np.zeros([(max_lat - min_lat)//5, (max_long - min_long)//5])
print(count_region)'''

In [None]:
# Algorithm to detect how landslides are distributed over the US area and to
# define the temporal range (earliest event date in the selected cells)
# (currently disabled: the code is wrapped in a string literal and does not run;
# when enabled it relies on min_lat, max_lat, min_long, max_long and
# count_region being defined beforehand)
'''
# Sentinel later than any real date. Dates are compared as (year, month, day)
# tuples so that the three components are always updated together.
earliest_date = (2021, 13, 32)
earliest_idx = 0

for idx, event in region_landslides.iterrows():
  # Upper bounds are exclusive: a point lying exactly on max_lat or max_long
  # would map to a cell index one past the end of count_region.
  if event.country_name == 'United States' and min_lat <= event.latitude < max_lat and min_long <= event.longitude < max_long:
    x = int((event.latitude - min_lat)//5)
    y = int((event.longitude - min_long)//5)
    count_region[x][y] += 1
    if 4 <= x <= 5 and 0 <= y <= 1:
      try:
        date = event.event_date.split()[0]
      except AttributeError:
        # event_date is not a string (e.g. missing): skip this event instead
        # of reusing the date parsed for a previous one.
        continue
      event_date = tuple(int(part) for part in date.split('-'))
      # Strict comparison: on equal dates the first event found is kept.
      if event_date < earliest_date:
        earliest_date = event_date
        earliest_idx = idx

earliest_year, earliest_month, earliest_day = earliest_date
print(f'{earliest_year}-{earliest_month}-{earliest_day}')
'''

## Slope data

In [None]:
# Load the slope table from slope.csv (path relative to the working directory).
slope_data = pd.read_csv(filepath_or_buffer='slope.csv')

## Vegetation data

In [None]:
# Load the vegetation table from vegetation.csv (path relative to the working directory).
vegetation_data = pd.read_csv(filepath_or_buffer='vegetation.csv')

## Rain data

In [None]:
# Load the rain table from rain.csv (path relative to the working directory).
rain_data = pd.read_csv(filepath_or_buffer='rain.csv')

## Terrain temperature data

In [None]:
# Load the terrain temperature table from terrain_temperature.csv
# (path relative to the working directory).
terrain_temperature_data = pd.read_csv(filepath_or_buffer='terrain_temperature.csv')

# Dataset creation and preprocessing

## Combine data into positive dataset

In [None]:
''' INSTRUCTIONS:
create dataset with data {rain_daily, rain_total, vegetation, temperature, slope, landslide(output)}

for each element of the landslide dataset:
  get latitude, longitude and date
  get inputs from the data already read:
    for vegetation, check if the date is between two sets
    for rain, same, but the time resolution can probably be hourly or interpolated
    for temperature just check the day
    for slope, use the latitude and longitude only
  add the inputs to the dataset, and consider landslide = 1
'''

## Create negative dataset

In [None]:
''' INSTRUCTIONS:
for 5000 times:
  random temporal value between 2006 and 2020
  random latitude between 40 and 50
  random longitude between -125 and -115

  new empty dataset with the same indexes as before

  check if the altitude is >0; maybe the slope data could use -1 to mark
  unavailable locations, so that this can be read directly from it

  check if the location had a landslide (passing through all the data for the 
  landslides might be a little slow, but I haven't come up with another idea yet)

  if not, get the inputs from the data
  append the inputs to the dataset, with landslide=0

'''

# Combine the two datasets

In [None]:
''' INSTRUCTIONS:
from the positive dataset take out 50 random elements
from the negative dataset take out 50 random elements

combine these into the test dataset

divide again the two datasets into validation and training (5% and 95%) and
combine the positive and negative parts

copy all the elements that have landslide=1 and append them twice to the dataset
(oversampling, the training will contain 3 copies for each landslide event)
(This with both validation and training sets)

shuffle validation and training set
'''

## Scaling the dataset

In [None]:
''' INSTRUCTIONS:
for each column:
  max(max(training) - min(training), max(validation) - min(validation), max(test) - min(test))
  min(min(training), min(validation), min(test))
  subtract the minimum value and divide all the elements of that column by the max

  OR

  standardization (sklearn.preprocessing.StandardScaler)
'''

# ML model

In [50]:
# Imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
# Metrics for the planned precision/recall evaluation.
# NOTE(review): this import previously named no symbols, which is a syntax
# error; the selection below is an assumption -- confirm which metrics are needed.
from tensorflow.keras.metrics import AUC, Precision, Recall

In [None]:
''' INSTRUCTIONS:
define structure using the sequential model
a few dense layers, 'relu' as a function should be good
(size increasing with depth, so for example 20, 40, 60)
'''
# Fully connected binary classifier: three hidden ReLU layers followed by a
# single output unit.
model = Sequential([
    # input_shape=(5,): one value per input feature listed in the dataset
    # instructions (rain_daily, rain_total, vegetation, temperature, slope).
    Dense(units=16, activation='relu', input_shape=(5,)),
    Dense(units=32, activation='relu'),
    Dense(units=16, activation='relu'),
    # Sigmoid squashes the output into (0, 1) so it can be read as the
    # probability of the positive class (landslide = 1).
    Dense(units=1, activation='sigmoid'),
])

In [None]:
''' INSTRUCTIONS:
show the results on a precision/recall curve
'''