In [18]:
# Import libraries
import pandas as pd
from itertools import combinations
import numpy as np
import collections

In [19]:
# Load the observation records from 'gbif_r.csv' (path relative to the notebook).
# Later cells read the columns: year, species, lat, long, region, order, family,
# genus, eventDate and month.
gbif_r = pd.read_csv('gbif_r.csv')

In [20]:
# Per (year, species) summary: mean and median latitude/longitude plus the most
# frequent region. Built with a single groupby instead of one groupby per column.
def _most_frequent(values):
    """Return the most frequent non-null value of `values`, or NaN if there is none.

    `value_counts()` drops nulls and sorts by frequency, so its first index
    label is the mode. A group whose values are all null yields an empty
    result; return NaN for it instead of raising IndexError.
    """
    counts = values.value_counts()
    return counts.index[0] if len(counts) else np.nan


dif = (
    gbif_r
    .groupby(['year', 'species'])
    .agg(
        lat_mean=('lat', 'mean'),
        long_mean=('long', 'mean'),
        long_med=('long', 'median'),
        lat_med=('lat', 'median'),
        regions=('region', _most_frequent),
    )
    # turn the (year, species) group keys back into ordinary columns
    .reset_index()
)


In [21]:
# Displacement of every species between the two study years, one row per species.
YEAR_START, YEAR_END = 2001, 2020
STAT_COLS = ['lat_mean', 'long_mean', 'lat_med', 'long_med']

# per-year slices of the summary, indexed by species
# (dif holds one row per year/species pair, so both indexes are unique)
stats_start = dif[dif['year'] == YEAR_START].set_index('species')
stats_end = dif[dif['year'] == YEAR_END].set_index('species')

# select species in common in both study years, sorted alphabetically
species01 = list(stats_start.index)
species20 = list(stats_end.index)
spec = sorted(set(species01).intersection(species20))

# set species as the index in our dataframe
out = pd.DataFrame(index=spec)

# Reference position of each species: the mean lat/long from the FIRST row of
# dif for that species, i.e. the earliest year present (dif comes out of the
# groupby sorted by year).
# NOTE(review): this is not an average over all years -- confirm that this is
# what "real mean lat and long" was meant to be.
first_seen = dif.drop_duplicates(subset='species').set_index('species')
out['lat'] = first_seen['lat_mean']
out['long'] = first_seen['long_mean']

# change between the study years (end minus start) for every statistic;
# assignment aligns on the species index, so only the common species are kept
for col in STAT_COLS:
    out[f'{col}_01-20'] = stats_end[col] - stats_start[col]

out

Unnamed: 0,lat,long,lat_mean_01-20,long_mean_01-20,lat_med_01-20,long_med_01-20
Accipiter gentilis,41.475405,-3.764305,-0.363295,0.997845,-0.391531,1.139175
Accipiter nisus,40.340947,-4.042108,0.050245,0.670222,-0.310430,1.294906
Acrocephalus arundinaceus,41.010709,-3.688138,-0.485551,1.106905,-1.375074,2.313438
Acrocephalus melanopogon,40.521891,2.150216,-0.918729,-2.517785,-0.919915,-3.362374
Acrocephalus paludicola,41.894325,-4.745975,-0.098263,1.596620,-0.226017,2.313438
...,...,...,...,...,...,...
Upupa epops,38.517686,-5.030357,1.098976,1.486455,-0.051227,1.557742
Uria aalge,43.484166,-4.960995,-0.059705,1.141408,-0.028250,2.233221
Vanellus vanellus,41.257987,-4.122154,-0.925168,0.322991,-1.666814,0.109112
Xema sabini,43.273164,-5.682464,-0.072182,-1.048437,-0.474348,-4.535075


In [22]:
# Rank the species by the absolute size of each displacement (largest first),
# so the most extreme movers sit at the top of every series.
shift_cols = (
    'lat_mean_01-20',   # mean latitude
    'long_mean_01-20',  # mean longitude
    'lat_med_01-20',    # median latitude
    'long_med_01-20',   # median longitude
)
mean_lat, mean_long, med_lat, med_long = (
    out[col].abs().sort_values(ascending=False) for col in shift_cols
)


In [23]:
# Collect the candidate outlier species: the top-ranked entries of each of the
# four displacement rankings.

# How many top-ranked species are taken from each ranking.
# NOTE(review): these cut-offs are hand-picked (presumably from inspecting the
# sorted series -- confirm). The previous comments quoted 2 / 7 / 4 / 13
# values, which did not match the slices actually applied.
N_MEAN_LAT = 6
N_MEAN_LONG = 16
N_MED_LAT = 13
N_MED_LONG = 26

mean_lat_list = mean_lat.index[:N_MEAN_LAT].tolist()
mean_long_list = mean_long.index[:N_MEAN_LONG].tolist()
med_lat_list = med_lat.index[:N_MED_LAT].tolist()
med_long_list = med_long.index[:N_MED_LONG].tolist()

# concatenated list; a species flagged by several rankings appears several times
outlier = mean_lat_list + mean_long_list + med_lat_list + med_long_list

In [24]:
# de-duplicate the candidate species and order them alphabetically
outliers = sorted(set(outlier))
len(outliers)

35

In [25]:
# species that were collected more than once, i.e. flagged by several rankings
times_flagged = collections.Counter(outlier)
re_species = sorted(name for name, hits in times_flagged.items() if hits > 1)
print(len(re_species))

19


In [26]:
# new column that shows the direction in which each species moved
def compass(df, lat_col='lat_mean_01-20', long_col='long_mean_01-20'):
    """Add a 'compass' column with the direction of each row's lat/long shift.

    The direction is derived from the signs of the two shift columns: a
    positive latitude shift is 'N', negative is 'S'; a positive longitude
    shift is 'E', negative is 'W'. Two non-zero shifts combine (e.g. 'NE').

    Rows with no direction (both shifts zero, or either shift missing) get a
    real missing value (NaN), not the string 'nan' that
    np.where(cond, 'NE', <float NaN column>) produces through string promotion.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the two shift columns. It is modified in place.
    lat_col, long_col : str
        Names of the latitude and longitude shift columns.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the 'compass' column added or overwritten.
    """
    # (sign of latitude shift, sign of longitude shift) -> direction
    directions = {
        (1, 1): 'NE', (1, 0): 'N', (1, -1): 'NW',
        (0, -1): 'W', (0, 1): 'E',
        (-1, -1): 'SW', (-1, 0): 'S', (-1, 1): 'SE',
    }

    lat_sign = np.sign(df[lat_col])
    long_sign = np.sign(df[long_col])

    # (0, 0) and any pair containing NaN are absent from the table -> NaN
    df['compass'] = [
        directions.get(pair, np.nan) for pair in zip(lat_sign, long_sign)
    ]

    return df

In [27]:
# label every species in `out` with its direction of movement, then preview
out = compass(out)
out.head()

Unnamed: 0,lat,long,lat_mean_01-20,long_mean_01-20,lat_med_01-20,long_med_01-20,compass
Accipiter gentilis,41.475405,-3.764305,-0.363295,0.997845,-0.391531,1.139175,SE
Accipiter nisus,40.340947,-4.042108,0.050245,0.670222,-0.31043,1.294906,NE
Acrocephalus arundinaceus,41.010709,-3.688138,-0.485551,1.106905,-1.375074,2.313438,SE
Acrocephalus melanopogon,40.521891,2.150216,-0.918729,-2.517785,-0.919915,-3.362374,SW
Acrocephalus paludicola,41.894325,-4.745975,-0.098263,1.59662,-0.226017,2.313438,SE


## Outliers:
- 'Aix galericulata'
- 'Aix sponsa'
- 'Alle alle'
- 'Alopochen aegyptiaca'
- 'Anas americana'
- 'Anser brachyrhynchus'
- 'Anser fabalis'
- 'Anthus hodgsoni'
- 'Calandrella rufescens'
- 'Calidris canutus'
- 'Calidris temminckii'
- 'Chlidonias hybrida'
- 'Fulica cristata'
- 'Fulmarus glacialis'
- 'Larus hyperboreus'
- 'Marmaronetta angustirostris'
- 'Morus bassanus'
- 'Passer hispaniolensis'
- 'Phasianus colchicus'
- 'Phoeniconaias minor'
- 'Phylloscopus fuscatus'
- 'Phylloscopus inornatus'
- 'Podiceps grisegena'
- 'Psittacula krameri'
- 'Puffinus gravis'
- 'Stercorarius longicaudus'
- 'Stercorarius skua'
- 'Sterna hirundo'
- 'Sterna paradisaea'
- 'Sylvia conspicillata'
- 'Sylvia curruca'
- 'Tadorna ferruginea'
- 'Threskiornis aethiopicus'
- 'Tringa flavipes'
- 'Xema sabini'

In [28]:
# Raw observations of the outlier species only, indexed by species and reduced
# to the taxonomy, location and date columns.
keep_cols = ['order', 'family', 'genus', 'region', 'lat', 'long', 'eventDate', 'month', 'year']
is_outlier = gbif_r['species'].isin(outliers)
gbif_out = gbif_r[is_outlier].set_index('species')[keep_cols]

In [29]:
# carry the per-species compass direction over to the observation-level table

# lookup table: species -> compass direction, taken from the summary frame
spec_comp = out['compass'].to_dict()

# gbif_out is indexed by species, so mapping its index labels every observation
gbif_out['compass'] = gbif_out.index.map(spec_comp)

In [30]:
# Save the observation-level outlier table (indexed by species, including the
# compass column) to 'gbif_out.csv' in the working directory.
gbif_out.to_csv('gbif_out.csv')

In [31]:
# Save the per-species summary (`out`: reference position, 2001-2020 shifts and
# compass direction) to 'gbif_out_mean.csv'. The species index has no name, so
# it is written as an unnamed first column.
out.to_csv('gbif_out_mean.csv')