In [None]:
import os
import random
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

## Open the database and set up for running the algorithm 

In [None]:
path_to_db_folder = '' #string path to folder where database is
# Build the path with os.path.join instead of a hard-coded '\\' separator:
# it works on every OS, and an empty folder string resolves to the current
# working directory instead of '\Globe-LFMC-2.0.xlsx'.
ds = pd.read_excel(os.path.join(path_to_db_folder, 'Globe-LFMC-2.0.xlsx'), sheet_name='LFMC data')

In [None]:
# Working copy of the sheet: rows without an LFMC value are discarded
# (in case there are empty rows in the spreadsheet) and the remaining rows
# are ordered by the ID column.
globelfmc = (
    ds
    .copy()
    .dropna(subset=['LFMC value (%)'])
    .sort_values(by=['Sorting ID'])
)
del ds  # the freshly loaded frame is not needed any more

In [None]:
# Turn every sampling date into a numeric timestamp so that the date can be
# used as an input of the statistical model.
def _to_epoch_seconds(date):
    """Return `date` as the float produced by `pandas.Timestamp.timestamp()`."""
    return pd.Timestamp(date).timestamp()

sampling_dates = pd.to_datetime(globelfmc['Sampling date (YYYYMMDD)'].values)
globelfmc['timestamp'] = sampling_dates.map(_to_epoch_seconds)

In [None]:
# Species names may carry spaces at the beginning or end; store a stripped
# version in a separate column and leave the original column untouched.
species_raw = globelfmc['Species collected']
globelfmc['Species collected nospace'] = species_raw.str.strip()

In [None]:
# Random states for the models fitted below (one set of models per value).
# The values were drawn once with five calls to `random.randint(0, 100)`;
# they are hard-coded so that every run of the notebook uses the same states.
# (The original, unseeded drawing loop was removed: its result was always
# overwritten by this list.)
list_rand = [67, 99, 1, 23, 25]

In [None]:
# Create the (initially NaN) result columns: for each kind of output, one
# column per random state. The outer loop keeps the columns grouped by kind,
# in the order listed here.
column_templates = (
    'Isolated iforest with LFMC {}',
    'Isolated iforest no LFMC {}',
    'Scores iforest with LFMC {}',
    'Scores iforest no LFMC {}',
)

for template in column_templates:
    for seed in list_rand:
        globelfmc[template.format(seed)] = np.nan



## Run Isolation Forest for each species/species combination subset

In [None]:
# Settings shared by every Isolation Forest fitted in this cell.
N_ESTIMATORS = 10000
CONTAMINATION = 0.05

# Feature columns: time and position, optionally extended with the LFMC value.
COORD_COLUMNS = ['timestamp', 'Latitude (WGS84, EPSG:4326)', 'Longitude (WGS84, EPSG:4326)']
LFMC_COLUMN = 'LFMC value (%)'


def _run_iforest(features, max_samples, seed):
    """Fit an Isolation Forest on `features` and evaluate it on the same rows.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per sample. All columns are used as features
        (`max_features` is set to the number of columns).
    max_samples : int
        Number of samples drawn to train each tree.
    seed : int
        Value passed as `random_state`.

    Returns
    -------
    tuple
        `(labels, scores)`: the results of `fit_predict(features)` and
        `score_samples(features)` of the fitted model.
    """
    model = IsolationForest(
        n_estimators=N_ESTIMATORS,
        max_samples=max_samples,
        contamination=CONTAMINATION,
        max_features=features.shape[1],
        bootstrap=True,
        n_jobs=-1,
        random_state=seed,
    )
    labels = model.fit_predict(features)
    return labels, model.score_samples(features)


# Missing species names are left out: NaN cannot be sorted together with
# strings, and it would never match the `== species` selection below anyway.
species_list = sorted(globelfmc['Species collected nospace'].dropna().unique())
len_species = len(species_list)

for i, species in enumerate(species_list):
    print(i+1, len_species, species)

    # Rows of the current species; computed once and reused for every
    # selection and assignment in this iteration.
    species_rows = globelfmc['Species collected nospace'] == species

    lfmc_data = globelfmc.loc[species_rows, COORD_COLUMNS + [LFMC_COLUMN]]
    no_lfmc_data = globelfmc.loc[species_rows, COORD_COLUMNS]

    # Each tree is trained on 75% of the samples of the species.
    samples_n = round(len(lfmc_data.index)/100*75)
    print(samples_n)

    for rand_i in list_rand:

        # model that uses time, position and the LFMC value
        isolated_IF_lfmc, scores_IF_lfmc = _run_iforest(lfmc_data, samples_n, rand_i)
        globelfmc.loc[species_rows, 'Isolated iforest with LFMC {}'.format(rand_i)] = isolated_IF_lfmc
        globelfmc.loc[species_rows, 'Scores iforest with LFMC {}'.format(rand_i)] = scores_IF_lfmc

        # model that uses time and position only
        isolated_IF_no_lfmc, scores_IF_no_lfmc = _run_iforest(no_lfmc_data, samples_n, rand_i)
        globelfmc.loc[species_rows, 'Isolated iforest no LFMC {}'.format(rand_i)] = isolated_IF_no_lfmc
        globelfmc.loc[species_rows, 'Scores iforest no LFMC {}'.format(rand_i)] = scores_IF_no_lfmc

        

In [None]:
# Add up, row by row, the results of the models fitted with the different
# random states (separately for the models with and without LFMC).
with_lfmc_columns = ['Isolated iforest with LFMC {}'.format(seed) for seed in list_rand]
no_lfmc_columns = ['Isolated iforest no LFMC {}'.format(seed) for seed in list_rand]

globelfmc['Sum iforest with LFMC'] = globelfmc[with_lfmc_columns].sum(axis=1)
globelfmc['Sum iforest no LFMC'] = globelfmc[no_lfmc_columns].sum(axis=1)

In [None]:
# assign TRUE or FALSE if isolated
# A row is 'TRUE' when the sum over the models with LFMC equals minus the
# number of random states (-5 for the five states in list_rand) while the sum
# over the models without LFMC is above that value; every other row is 'FALSE'.
# The column is created with the string 'FALSE' from the start: creating it as
# NaN (float) and writing strings into it afterwards relies on implicit dtype
# upcasting, which pandas has deprecated.
all_outlier_sum = -len(list_rand)
isolated_final = (globelfmc['Sum iforest with LFMC'] == all_outlier_sum) & (globelfmc['Sum iforest no LFMC'] > all_outlier_sum)
globelfmc['Isolated iforest final'] = 'FALSE'
globelfmc.loc[isolated_final, 'Isolated iforest final'] = 'TRUE'

In [None]:
# assign TRUE or FALSE if isolated regardless of LFMC (might be needed for combining methods anomaly detection)
# A row is 'TRUE' when the sum over the models without LFMC equals minus the
# number of random states (-5 for the five states in list_rand).
# The column is created with the string 'FALSE' from the start instead of NaN,
# so that no strings are written into a float column (implicit dtype upcasting
# is deprecated in pandas).
all_outlier_sum = -len(list_rand)
globelfmc['Isolated iforest no LFMC'] = 'FALSE'
globelfmc.loc[globelfmc['Sum iforest no LFMC'] == all_outlier_sum, 'Isolated iforest no LFMC'] = 'TRUE'

## Save output

In [None]:
# Write the table with all outlier columns next to the input database.
# os.path.join replaces the hard-coded '\\' separator so the path is valid on
# every OS and an empty folder string means the current working directory.
globelfmc.to_excel(os.path.join(path_to_db_folder, 'Globe-LFMC-2.0_outliers_IF.xlsx'), index=False)