## Loading the dataset and building vectors for machine learning algorithms

In [23]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
pd.options.display.max_columns=10
# random seed for reproducibility
np.random.seed(42)

Defining utilities functions to work with the dataset

In [24]:
def build_environmental_data(df, patches_dir, mean_window_size=None):

    """This function builds a dataset containing all the latitude,
       longitude, and vectors of the environmental tensors associated saved
       in a directory.
       Used to fit to Scikit-Learn models.
       If the environmental tensors are just row vectors (i.e the env. variables
       values at the location) then it loads them in a new dataframe.
       Otherwise, the tensors are flattened as long row vectors;
       that's when the tensors are the env. variables values around the location.

       :param df: the locations dataframe, containing (Latitude,Longitude)
           columns
       :param patches_dir: the directory where the env. patches are saved
       :param mean_window_size: if not None, takes the mean value of each
       raster on the provided window size
       :return: a new dataframe containing the locations concatenated with
           their env. vectors
    """
    # import the names of the environmental variables
    from environmental_raster_glc import raster_metadata

    env_array = list()
    # number of values per channel, 1 if patches are vector
    n_features_per_channel = 1
    for idx in range(len(df)):

        # get the original index used to write the patches on disk
        true_idx = df.index[idx]
        # find the name of the file
        patch_name = patches_dir + '/' + str(true_idx)+'.npy'
        # reads the file
        patch = np.load(patch_name)
        # build the row vector
        lat, lng = df.loc[true_idx,'Longitude'], df.loc[true_idx,'Latitude']

        if mean_window_size:
            try:
                patch = np.array([ ch[ch.shape[0]//2 - mean_window_size//2:
                                      ch.shape[0]//2 + mean_window_size//2,
                                      ch.shape[1]//2 - mean_window_size//2:
                                      ch.shape[1]//2 + mean_window_size//2
                                     ].mean() for ch in patch
                                 ])
                assert(len(patch.shape)==1)
            except IndexError:
                raise Exception("Channels don't have two dimensions!")
        else:
            if len(patch.shape) > 1:
                n_features_per_channel = patch.shape[0]*patch.shape[1]
            elif len(patch.shape) ==2 :
                raise Exception("Channel of dimension one: should only be a scalar\
                                 or a two dimensional array")
        # flatten to build row vector
        env_array.append(np.concatenate(([lat,lng],patch),axis=None))

    rasters_names = sorted(raster_metadata.keys())
    if n_features_per_channel == 1:
        header_env = rasters_names
    else:
        header_env = []
        for name in rasters_names:
            header_env.extend([name+f'__{i}' for i in range(n_features_per_channel)])
    header = ['Latitude','Longitude'] + header_env
    env_df = pd.DataFrame(env_array, columns=header, dtype='float64')
    return env_df

def get_taxref_names(self, y, taxonomic_names):
    """Returns the taxonomic names which corresponds to the list of
       species ids
       :param y: the list of species
       :return: the list of taxonomic names
    """
    return [taxonomic_names[taxonomic_names['glc19SpId']==spid]['taxaName'].iloc[0]
            for spid in y
           ]

Visualizing the data

In [34]:
# working on a subset of Pl@ntNet Trusted: 2500 occurrences
df = pd.read_csv('example_occurrences.csv',
                 sep=';', header='infer', quotechar='"', low_memory=True)

df = df[['Longitude','Latitude','glc19SpId','scName']]
df = df.dropna(axis=0, how='all') #drop nan lines
df = df.astype({'glc19SpId': 'int64'})

# one liner:
#df = df[['Longitude','Latitude','glc19SpId','scName']].dropna(axis=0, how='all').astype({'glc19SpId': 'int64'})
# target pandas series of the species identifiers (there are 505 labels)
target_df = df['glc19SpId']
# correspondence table between ids and the species taxonomic names
# (Taxref names with year of discoverie)
taxonomic_names = pd.read_csv('../data/occurrences/taxaName_glc19SpId.csv',
                              sep=';',header='infer', quotechar='"',low_memory=True)
# glc_dataset = GLCDataset(df[['Longitude','Latitude']], df['glc19SpId'],
#                          scnames=df[['glc19SpId','scName']],patches_dir='example_envtensors')

print(len(df), 'occurrences in the dataset')
print(len(target_df.unique()), 'number of species\n')
duplicated_df = df[df.duplicated(subset=['Latitude','Longitude'],keep=False)]
print(f'{len(duplicated_df)} entries observed at interfering locations:')
display(duplicated_df.sample(5))

2499 occurrences in the dataset
505 number of species

30 entries observed at interfering locations:


Unnamed: 0,Longitude,Latitude,glc19SpId,scName
2345,-1.319287,47.17834,30425,Solanum dulcamara L.
383,-1.075745,44.97846,31867,Arenaria montana L.
1200,-1.075745,44.97846,31734,Tuberaria guttata (L.) Fourr.
2444,8.803104,41.886303,30683,Nerium oleander L.
796,1.462861,43.53834,29989,Acanthus mollis L.


One example of two interfering examples: at index 383 and index 1200: lat,lng =(44.978460,-1.075745) and species ids = 31867 (Arenaria montana L.) and 31734 (Tuberaria guttata (L.) Fourr.)

In [37]:
env_df = build_environmental_data(df[['Latitude','Longitude']],patches_dir='example_envtensors')
assert(len(df) == len(env_df))
X = env_df.values
y = target_df.values
display(env_df.head(3))
display(target_df.head(5))
# the arrays to train our machine learning models
X = env_df.values
y = target_df.values

Unnamed: 0,Latitude,Longitude,alti,awc_top,bs_top,...,etp,oc_top,pd_top,proxi_eau_fast,text
0,2.118889,43.95195,189.375,165.0,85.0,...,1219.375,1.0,2.0,0.0,2.0
1,-0.5925,45.10639,45.625,120.0,35.0,...,1140.625,1.0,1.0,0.0,1.0
2,-4.534861,48.38958,69.375,0.0,85.0,...,800.625,2.0,2.0,0.0,0.0


0    30021
1    31997
2    31385
3    33228
4    33228
Name: glc19SpId, dtype: int64

## Classifier class and the Vector model

The vector model in the geolocation + environmental space

In [6]:
from vector_model import VectorModel
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state)
clf = VectorModel(metric='euclidean')
clf.fit(X_train,y_train)
print('Euclidean')
y_predicted = clf.predict(X_test)
print(f'  Top30 score:{clf.top30_score(y_predicted, y_test)}')
print(f'  MRR score:{clf.mrr_score(y_predicted, y_test)}')

print('Cosine')
clf = VectorModel(metric='cosine')
clf.fit(X_train,y_train)
y_predicted = clf.predict(X_test)
print(f'  Top30 score:{clf.top30_score(y_predicted, y_test)}')
print(f'  MRR score:{clf.mrr_score(y_predicted, y_test)}')

Euclidean
  Top30 score:0.2
  MRR score:0.05445028194533972
Cosine


TypeError: 'NoneType' object is not subscriptable

In [5]:
from knn_model import KNearestNeighborsModel
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
clf = KNearestNeighborsModel(weights='uniform',metric='euclidean')
clf.fit(X_train,y_train)
print('Uniform weights')
y_predicted = clf.predict(X_test, ranking_size=30)
print(f'  Top30 score:{clf.top30_score(y_predicted, y_test)}')
print(f'  MRR score:{clf.mrr_score(y_predicted, y_test)}')

clf = KNearestNeighborsModel(weights='distance',metric='euclidean')
clf.fit(X_train,y_train)
print('Inverse to distance weights')
y_predicted = clf.predict(X_test, ranking_size=30)
print(f'  Top30 score:{clf.top30_score(y_predicted, y_test)}')
print(f'  MRR score:{clf.mrr_score(y_predicted, y_test)}')
# TODO : débugger distance cosine

Uniform weights
  Top30 score:0.07
  MRR score:0.009346689260969971
Inverse to distance weights
  Top30 score:0.07
  MRR score:0.009346689260969971


The K-nearest-neighbors model

In [11]:
print(clf.knn_clf.classes_)

[29969 29970 29971 29972 29973 29976 29977 29978 29979 29980 29981 29982
 29983 29985 29987 29988 29989 29990 29992 29993 30003 30004 30005 30015
 30016 30018 30021 30023 30024 30025 30026 30029 30032 30033 30040 30041
 30048 30051 30054 30055 30056 30059 30062 30063 30067 30069 30071 30074
 30075 30076 30079 30081 30083 30084 30090 30091 30101 30102 30103 30105
 30106 30113 30114 30115 30119 30120 30123 30126 30133 30134 30136 30144
 30147 30148 30155 30156 30159 30162 30166 30180 30181 30182 30183 30184
 30185 30186 30190 30191 30196 30205 30207 30212 30215 30221 30224 30225
 30226 30229 30230 30244 30248 30253 30255 30256 30261 30264 30270 30272
 30275 30280 30283 30290 30295 30306 30307 30308 30311 30316 30320 30324
 30325 30329 30330 30331 30337 30340 30341 30345 30346 30348 30349 30351
 30352 30353 30354 30358 30360 30363 30367 30379 30401 30402 30405 30406
 30407 30415 30416 30420 30421 30423 30424 30425 30429 30432 30433 30435
 30436 30440 30442 30443 30444 30447 30449 30451 30