In [118]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
import numpy as np
import collections

In [2]:
gbif_r = pd.read_csv('gbif_r.csv')

In [3]:
# New dataframe with the lat-long mean and median per year and specie
dif = pd.DataFrame(gbif_r.groupby(['year', 'species'])['lat'].mean())
dif = dif.rename(columns = {'lat': 'lat_mean'})
dif['long_mean'] = gbif_r.groupby(['year', 'species'])['long'].mean()
dif['long_med'] = gbif_r.groupby(['year', 'species'])['long'].median()
dif['lat_med'] = gbif_r.groupby(['year', 'species'])['lat'].median()
dif['regions'] = gbif_r.groupby(['year', 'species'])['region'].agg(lambda x:x.value_counts().index[0])

dif.reset_index(inplace = True)  

In [23]:
# create an empty dataframe
new = pd.DataFrame()

# select species in common in both study years
species20 = list(dif.species[dif['year'] == 2020])
species01 = list(dif.species[dif['year'] == 2001])

spec = list(set(species01).intersection(species20))

# sort the common species
spec.sort()

# set species as the index in our dataframe
new = pd.DataFrame(index=spec)


# lat_mean
for s in new.index:
    dif_spec = dif[dif["species"] == s]

    aux_y1 = dif_spec.lat_mean[dif_spec['year'] == 2001].iloc[0]
    aux_y2 = dif_spec.lat_mean[dif_spec['year'] == 2020].iloc[0]

    new.loc[s, 'lat_mean_01-20'] = aux_y2 - aux_y1

# long_mean
for s in new.index:
    dif_spec = dif[dif["species"] == s]

    aux_y1 = dif_spec.long_mean[dif_spec['year'] == 2001].iloc[0]
    aux_y2 = dif_spec.long_mean[dif_spec['year'] == 2020].iloc[0]

    new.loc[s, 'long_mean_01-20'] = aux_y2 - aux_y1

# lat_med
for s in new.index:
    dif_spec = dif[dif["species"] == s]

    aux_y1 = dif_spec.lat_med[dif_spec['year'] == 2001].iloc[0]
    aux_y2 = dif_spec.lat_med[dif_spec['year'] == 2020].iloc[0]

    new.loc[s, 'lat_med_01-20'] = aux_y2 - aux_y1

# long_med
for s in new.index:
    dif_spec = dif[dif["species"] == s]

    aux_y1 = dif_spec.long_med[dif_spec['year'] == 2001].iloc[0]
    aux_y2 = dif_spec.long_med[dif_spec['year'] == 2020].iloc[0]

    new.loc[s, 'long_med_01-20'] = aux_y2 - aux_y1

new

Unnamed: 0,lat_mean_01-20,long_mean_01-20,lat_med_01-20,long_med_01-20
Accipiter gentilis,-0.363295,0.997845,-0.391531,1.139175
Accipiter nisus,0.050245,0.670222,-0.310430,1.294906
Acrocephalus arundinaceus,-0.485551,1.106905,-1.375074,2.313438
Acrocephalus melanopogon,-0.918729,-2.517785,-0.919915,-3.362374
Acrocephalus paludicola,-0.098263,1.596620,-0.226017,2.313438
...,...,...,...,...
Upupa epops,1.098976,1.486455,-0.051227,1.557742
Uria aalge,-0.059705,1.141408,-0.028250,2.233221
Vanellus vanellus,-0.925168,0.322991,-1.666814,0.109112
Xema sabini,-0.072182,-1.048437,-0.474348,-4.535075


In [84]:
# look for outliers in mean latitude
mean_lat = new['lat_mean_01-20'].sort_values()
mean_lat.head(10)

# look for outliers in mean longitude
mean_long = new['long_mean_01-20'].sort_values()
mean_long.head(10)

# look for outliers in median latitude
med_lat = new['lat_med_01-20'].sort_values()
med_lat.head(10)

# look for outliers in median longitude
med_long = new['long_med_01-20'].sort_values()
med_long.head(10)

Index(['Anthus hodgsoni', 'Phoeniconaias minor'], dtype='object')

In [119]:
# creating a unique list with all outlier species

# 2 values up to 4 in mean_lat
mean_lat_list = mean_lat[:2]
mean_lat_list = mean_lat_list.index.values.tolist()

# 7 values up to 4 in mean_long
mean_long_list = mean_long[:7]
mean_long_list = mean_long_list.index.values.tolist()

# 4 values up to 4 in med_lat
med_lat_list = med_lat[:4]
med_lat_list = med_lat_list.index.values.tolist()

# 13 values up to 4 in med_long
med_long_list = med_long[:13]
med_long_list = med_long_list.index.values.tolist()

# unique list 
outlier = mean_lat_list + mean_long_list + med_lat_list + med_long_list

['Anthus hodgsoni', 'Phoeniconaias minor', 'Sylvia curruca', 'Alopochen aegyptiaca', 'Phylloscopus inornatus', 'Threskiornis aethiopicus', 'Anas americana', 'Aix sponsa', 'Phylloscopus fuscatus']


In [None]:
# unique list without duplicates
outliers = set(outlier)
outliers = list(outliers)
print(outliers)

In [124]:
# list of species repeated in mean and median change-distribution lists
re_species = [item for item, count in collections.Counter(outlier).items() if count > 1]
print(re_species)

['Anthus hodgsoni', 'Phoeniconaias minor', 'Sylvia curruca', 'Alopochen aegyptiaca', 'Phylloscopus inornatus', 'Threskiornis aethiopicus', 'Anas americana', 'Aix sponsa', 'Phylloscopus fuscatus']


In [114]:
# There is not a specie that changes long-lat at the same time -- mean
outs = []
for o in mean_lat_list:
    if o in mean_long_list:
        outs.append(o)
outs

[]

In [115]:
# There is not a specie that changes long-lat at the same time -- median
outs = []
for o in med_lat_list:
    if o in med_long_list:
        outs.append(o)
outs

[]

In [128]:
print(re_species)
# Stercorarius skua  
# Phasianus colchicus 

['Anthus hodgsoni', 'Phoeniconaias minor', 'Sylvia curruca', 'Alopochen aegyptiaca', 'Phylloscopus inornatus', 'Threskiornis aethiopicus', 'Anas americana', 'Aix sponsa', 'Phylloscopus fuscatus']


In [None]:
Anthus hodgsoni - migratoria - passeriforme 
Phoeniconaias minor - migratoria - phoenicopteriformes - poco amenazado
Sylvia curruca - migratoria - passeriforme - 
Alopochen aegyptiaca - nativa ahora - anseriformes
Phylloscopus inornatus - migratoria - passeriforme 
Threskiornis aethiopicus - migratoria - pelecaniformes 
Anas americana - migratoria - anseriformes 
Aix sponsa - migratoria - anseriformes 
Phylloscopus fuscatus - migratoria - passeriforme 
Stercorarius skua - invernante - charadriiformes
Phasianus colchicus - introducida - galliformes

GUARDAR ESTE CÓDIGO

In [121]:
### AÑOS BUCLE  -- retorna 190 columnas solo para las combinaciones de lat_mean
new = pd.DataFrame()

species20 = list(dif.species[dif['year'] == 2020])
species01 = list(dif.species[dif['year'] == 2001])

spec = list(set(species01).intersection(species20))
spec.sort()
new = pd.DataFrame(index=spec)

years = dif["year"].unique()
years_perm = list(combinations(years, 2))

for s in new.index:
    for y1, y2 in years_perm:
        # get only curren specie
        dif_spec = dif[dif["species"] == s]

        # get lat mean for current specie and years 1 and 2 respectively
        aux_y1 = dif_spec.lat_mean[dif_spec['year'] == y1]
        aux_y2 = dif_spec.lat_mean[dif_spec['year'] == y2]

        # check that current specie has any value in the observed years
        if not(aux_y1.empty or aux_y2.empty):  # if not(aux_y1.empty) and not(aux_y2.empty):
            diff_s = aux_y2.iloc[0] - aux_y1.iloc[0]

        new.loc[s, 'lat_mean_{}-{}'.format(y1, y2)] = diff_s

print(new.head())

KeyboardInterrupt: 

# A PARTIR DE AQUÍ TODO SON PRUEBAS

plt.figure(figsize = (13,3))
sns.kdeplot(x = gbif_r.year, color = 'r');

plt.figure(figsize = (15,12))
plot_order = sns.histplot(y = gbif_r.region, 
                          hue = gbif_r.order,
                           multiple = "fill",
                         palette = 'Paired');

plot_order.set_title('Bird distribution by order and regions')
plot_order.set(xlabel = 'Order', 
                ylabel='Region');

funcion que compare de cada especie-año la long /lat ... y me devuelva una lista de
aquellas especies que tengan una diferencia de valor mayor a X 
o que me ordene la diferencia de valor encontrada, y yo hacer un sort y trabajar con las mayores diferencias


# función base con Sonia
nombrecolumna = list(dif.columns)
species = list(dif.species.unique()) 
def compare(columnname, startyear, endyear):
    lista = []
    for s in species:
        a = dif.loc[(dif['species'] == s) & (dif['year'] == startyear)]
        b = dif.loc[(dif['species'] == s) & (dif['year'] == endyear)]
        c = b - a
        lista.append({s: c})
        
    return lista 

### buscar los valores más altos. ABS!!! 

# revisar

def plot_df(df, x, y, title="", xlabel='Date', ylabel='Value', dpi=100):
    plt.figure(figsize=(16,5), dpi = dpi)
    plt.plot(x, y, color='tab:red')
    plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()

plot_df(df = gbif_r, 
        x = gbif_r.date, 
        y = gbif_r.observations, 
        title = 'Bird ')    

In [None]:
# Checking changes in the geographic distribution of species
mobility = []
for s in dif.species:
    # subsetear para specie unica 
    #nunique = len + unique?!
    a = dif[dif.species == s]
    if len(a.regions.unique()) != 1:
        mobility.append(s)
    else:
        pass
print(set(mobility))


In [22]:
mobility = []
species_unique = dif.species.unique()
for s in species_unique:
    
   # subsetear para specie unica
   # nunique = len + unique?!
    dif_spec = dif[dif["species"] == s]
        if dif_spec.regions.nunique() > 1:
            mobility.append(s)
        else:
            pass
print(len(mobility))

###################

        # get lat mean for current specie and years 1 and 2 respectively
        aux_y1 = dif_spec.lat_mean[dif_spec['year'] == y1]
        aux_y2 = dif_spec.lat_mean[dif_spec['year'] == y2]

        # check that current specie has any value in the observed years
        if not(aux_y1.empty or aux_y2.empty):  # if not(aux_y1.empty) and not(aux_y2.empty):
            diff_s = aux_y2.iloc[0] - aux_y1.iloc[0]

        new.loc[s, 'lat_mean_{}-{}'.format(y1, y2)] = diff_s

3269


In [19]:
type(dif_spec)

pandas.core.frame.DataFrame

In [None]:
#  Usar Poisson para describir el número de ocurrencias en un determinado espacio de tiempo