## Libraries 

In [1]:
import sys
# Put the parent directory on the module search path; presumably this is what lets the
# `src.functions_cluster` import in the next cell resolve — confirm against the repo layout.
sys.path.append('../')

In [3]:
import pandas as pd
from functools import partial, reduce
from geopy.geocoders import Nominatim
from sklearn.preprocessing import StandardScaler
import src.functions_cluster as cl

## Objectives

The objective of this notebook is to load the climatic variables that will be used for the clustering model.

This dataset must be prepared so that it can be joined with the species data downloaded from the GBIF API (Notebook 1). After loading the data and merging all the files, I create a new column in the dataframe named `community`, which allows this dataframe to be joined with the species one.

## Load the data

In [4]:
# One CSV per climatic predictor. Each entry is (file path, name passed to cl.reader);
# the name is presumably used to label the value column — confirm in src/functions_cluster.
predictor_sources = [
    ("../Predictors/anual_prec.csv", "anual_prec"),
    ("../Predictors/elevation.csv", "elevation"),
    ("../Predictors/isothermality.csv", "isothermality"),
    ("../Predictors/max_warmest.csv", "max_warmest"),
    ("../Predictors/mean_temp.csv", "mean_temp"),
    ("../Predictors/min_coldest.csv", "min_coldest"),
    ("../Predictors/precip_season.csv", "precip_season"),
    ("../Predictors/temp_range.csv", "temp_range"),
    ("../Predictors/temp_season.csv", "temp_season"),
]

# Files are read in the order listed above and bound to one variable each.
(
    anual_prec,
    elevation,
    isothermality,
    max_warmest,
    mean_temp,
    min_coldest,
    precip_season,
    temp_range,
    temp_season,
) = [cl.reader(csv_path, variable) for csv_path, variable in predictor_sources]

In [None]:
# Combine the independent per-variable frames into a single dataframe. They all share
# the same coordinates, so every join is keyed on the two coordinate columns.
# NOTE(review): the keys used here are 'lon'/'lat', whereas later cells refer to 'long' —
# confirm which column names cl.reader actually returns.
dfs = [
    min_coldest,
    elevation,
    anual_prec,
    isothermality,
    max_warmest,
    mean_temp,
    precip_season,
    temp_season,
    temp_range,
]


def merge(left, right):
    """Outer-join two predictor frames on their coordinate columns."""
    return pd.merge(left, right, on=['lon', 'lat'], how='outer')


# Fold the list pairwise, left to right, into one wide frame.
climate = reduce(merge, dfs)

In [None]:
# Build a text key "lat,long" for every grid cell; the next cell passes it to
# cl.get_community.
# The merge cell keys the longitude as 'lon', while this cell and the rest of the
# notebook use 'long'. Normalise the name first: rename() leaves the frame untouched
# when there is no 'lon' column, so either spelling coming out of the merge works.
climate = climate.rename(columns={"lon": "long"})
climate["new"] = climate["lat"].map(str) + "," + climate["long"].map(str)
# Keep a single row per coordinate pair.
climate = climate.drop_duplicates(["long", "lat"])

In [None]:
# Look up the region for every "lat,long" key and store it as `community`: this is the
# column name the notebook's objective announces and the one the species table is
# joined on further down (the previous name, "province", matched neither).
# NOTE(review): cl.get_community presumably performs a reverse-geocoding request per row
# (geopy's Nominatim is imported above), which would make this cell slow — confirm, and
# consider caching the result.
climate["community"] = climate["new"].apply(cl.get_community)

## Exploring the data set and some plots

In [23]:
# Read the prepared climate table (semicolon-separated) from disk.
# NOTE(review): this rebinds `climate`, discarding the frame built in the cells above, and
# no visible cell writes data_cli.csv — confirm where the file comes from so the notebook
# can be reproduced with Restart & Run All.
climate = pd.read_csv("data_cli.csv", sep =  ";")

In [24]:
# Preview the first rows to check the columns that were read from the file.
climate.head()

Unnamed: 0,min_temp,long,lat,max_temp,elevation,isothermality,mean_temp,precip,seasonal_temp,new,community
0,0.4,-5.6,41.9,28.9,704,39.3,12.3,38.7,615.6,"41.9,-5.6",Castilla y Leon
1,0.4,-5.5,41.9,29.1,701,39.2,12.3,37.4,620.7,"41.9,-5.5",Castilla y Leon
2,0.4,-5.4,41.9,29.0,698,39.4,12.3,35.4,618.3,"41.9,-5.4",Castilla y Leon
3,0.2,-5.3,41.9,28.9,718,39.4,12.2,34.3,619.1,"41.9,-5.3",Castilla y Leon
4,0.1,-5.2,41.9,28.8,739,39.4,12.1,34.0,620.6,"41.9,-5.2",Castilla y Leon


In [25]:
# The coordinate columns and the "lat,long" text key are not needed for the
# per-community summary, so keep everything except those three.
coordinate_columns = ["new", "lat", "long"]
climate = climate.drop(columns=coordinate_columns)

In [26]:
# Summary statistics of the numeric columns, one variable per row for readability.
climate.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
min_temp,6077.0,2.012572,3.447542,-10.0,-0.3,1.7,4.3,15.1
max_temp,6077.0,29.844068,3.298934,17.7,27.7,29.9,32.5,36.5
elevation,6077.0,609.931545,409.940727,-3.0,272.0,599.0,858.0,2637.0
isothermality,6077.0,40.302337,2.822187,29.6,38.6,40.0,41.8,56.1
mean_temp,6077.0,13.854139,2.72741,1.1,11.9,14.3,15.9,21.0
precip,6077.0,45.450485,14.889985,14.6,33.0,44.5,56.5,92.2
seasonal_temp,6077.0,594.71379,102.104825,192.0,547.4,616.4,661.6,771.7


In [27]:
# Average every climatic variable within each community (one output row per community).
# The columns are passed as a list: the bare comma-separated form used before is read as
# a tuple key, which pandas deprecated (it triggers a FutureWarning) and newer releases
# reject outright.
climate_vars = ['min_temp', 'max_temp', 'elevation', 'isothermality', 'mean_temp', 'precip', 'seasonal_temp']
climate = climate.groupby('community', as_index=False)[climate_vars].mean()

  climate = climate.groupby('community', as_index=False)['min_temp', 'max_temp', 'elevation', 'isothermality', 'mean_temp', 'precip', 'seasonal_temp'].mean()


In [28]:
# Display the per-community means computed in the previous cell.
climate

Unnamed: 0,community,min_temp,max_temp,elevation,isothermality,mean_temp,precip,seasonal_temp
0,Algarve,6.831667,29.341667,168.0,43.0,16.618333,70.476667,450.486667
1,Andalucia,3.522201,32.681694,469.798442,41.498832,15.922006,60.162025,603.885784
2,Aragon,-0.739964,29.081172,827.101243,39.299112,11.883126,28.45524,646.578153
3,Asturias,2.573077,25.332692,565.346154,40.632692,12.430769,50.832692,489.126923
4,Castilla y Leon,-0.689243,27.802086,932.570801,38.952689,11.116795,35.545774,620.490889
5,Castilla-La Mancha,-0.126208,32.375121,830.71256,39.228744,13.436353,39.4843,701.187077
6,Cataluna,0.32687,27.175069,642.614958,37.685873,12.25928,26.418006,604.93241
7,Comunidad de Madrid,0.270455,30.643182,820.886364,37.052273,13.006818,38.130682,683.940909
8,Comunitat Valenciana,1.987302,30.616667,489.436508,41.11627,14.70754,36.485317,610.363492
9,Euskadi,0.290909,25.663636,696.818182,40.490909,10.9,24.527273,549.681818


In [29]:
# Save the per-community table to CSV, just in case; the row index is not written.
climate.to_csv("climate.csv", index=False)

Next, I standardize the data, since the variables are measured on different scales. Because the dataframe also contains a categorical variable that cannot be standardized, I make a copy of the dataframe and standardize only the continuous variables.

In [30]:
# Work on an independent (deep) copy so the unscaled `climate` table stays available.
scaled_features = climate.copy(deep=True)

In [31]:
# Continuous variables to standardize; the categorical `community` column is left out.
col_names = [
    'min_temp',
    'max_temp',
    'elevation',
    'isothermality',
    'mean_temp',
    'precip',
    'seasonal_temp',
]

# Fit the scaler on the selected columns and transform them in a single step.
# `features` ends up as a plain array with one column per entry of `col_names`.
scaler = StandardScaler()
features = scaler.fit_transform(scaled_features[col_names].values)

Rather than creating a new dataframe, assign the scaled values back to those same columns:

In [32]:
# Overwrite the continuous columns with their standardized values; `community` is untouched.
scaled_features[col_names] = features
# Display the resulting table.
scaled_features

Unnamed: 0,community,min_temp,max_temp,elevation,isothermality,mean_temp,precip,seasonal_temp
0,Algarve,1.319279,0.259189,-1.565288,1.028701,1.274503,1.65474,-1.056714
1,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245
2,Aragon,-1.103625,0.158462,1.198785,-0.55507,-0.904162,-1.072983,0.909273
3,Asturias,-0.04346,-1.290991,0.101063,0.015627,-0.652192,0.379597,-0.669312
4,Castilla y Leon,-1.087394,-0.336132,1.641093,-0.70332,-1.25675,-0.612718,0.647726
5,Castilla-La Mancha,-0.907225,1.432158,1.21393,-0.585184,-0.189524,-0.357057,1.456775
6,Cataluna,-0.762241,-0.578585,0.425105,-1.245446,-0.731094,-1.205226,0.491738
7,Comunidad de Madrid,-0.780294,0.762456,1.172722,-1.516591,-0.387152,-0.444924,1.283867
8,Comunitat Valenciana,-0.230906,0.752203,-0.217279,0.222571,0.395348,-0.551729,0.54619
9,Euskadi,-0.773748,-1.163023,0.652417,-0.045048,-1.356497,-1.327958,-0.062197


## Load species data 

In [33]:
# Read the species occurrence table (semicolon-separated) from disk.
sp = pd.read_csv("species_def.csv", sep = ";")

In [34]:
# Preview the first species records and their columns.
sp.head()

Unnamed: 0,long,lat,locality,province,community,year,month,kingdom,class,family,genus,species,common_name
0,-1.856,37.237,Almeria,Almeria,Andalucia,2012,Oct,Animalia,Reptilia,Colubridae,Hemorrhois,Hemorrhois hippocrepis,Horseshoe Whip Snake
1,-2.256,36.797,Almeria,Almeria,Andalucia,2013,Apr,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon
2,-2.294,36.835,Almeria,Almeria,Andalucia,2017,Oct,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon
3,-2.286,36.82,Almeria,Almeria,Andalucia,2017,Oct,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon
4,-2.224,36.763,Almeria,Almeria,Andalucia,2017,Aug,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon


In [17]:
# Attach the standardized climate values to every species record of the same community.
# Inner join: rows whose community appears in only one of the two tables are dropped.
merged = scaled_features.merge(sp, how="inner", on="community")

In [18]:
# Preview the joined table (climate columns followed by the species columns).
merged.head()

Unnamed: 0,community,min_temp,max_temp,elevation,isothermality,mean_temp,precip,seasonal_temp,long,lat,locality,province,year,month,kingdom,class,family,genus,species,common_name
0,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-1.856,37.237,Almeria,Almeria,2012,Oct,Animalia,Reptilia,Colubridae,Hemorrhois,Hemorrhois hippocrepis,Horseshoe Whip Snake
1,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.256,36.797,Almeria,Almeria,2013,Apr,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon
2,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.294,36.835,Almeria,Almeria,2017,Oct,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon
3,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.286,36.82,Almeria,Almeria,2017,Oct,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon
4,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.224,36.763,Almeria,Almeria,2017,Aug,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon


In [19]:
# Remove rows that are identical in every column. drop_duplicates() returns a new frame,
# so the result has to be bound back to `merged`; previously it was only displayed and
# the duplicates were still present when the table was saved.
merged = merged.drop_duplicates()
merged

Unnamed: 0,community,min_temp,max_temp,elevation,isothermality,mean_temp,precip,seasonal_temp,long,lat,locality,province,year,month,kingdom,class,family,genus,species,common_name
0,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-1.856,37.237,Almeria,Almeria,2012,Oct,Animalia,Reptilia,Colubridae,Hemorrhois,Hemorrhois hippocrepis,Horseshoe Whip Snake
1,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.256,36.797,Almeria,Almeria,2013,Apr,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon
2,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.294,36.835,Almeria,Almeria,2017,Oct,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon
3,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.286,36.820,Almeria,Almeria,2017,Oct,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon
4,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.224,36.763,Almeria,Almeria,2017,Aug,Animalia,Reptilia,Chamaeleonidae,Chamaeleo,Chamaeleo chamaeleon,Common Chameleon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9681,Norte,0.124076,-1.132669,-0.237746,0.268741,-0.375814,0.558902,-0.755833,-4.390,43.387,Ria de San Vicente de La Barquera,Cantabria,2004,Jan,Animalia,Aves,Podicipedidae,Podiceps,Podiceps cristatus,Somormujo lavanco
9682,Norte,0.124076,-1.132669,-0.237746,0.268741,-0.375814,0.558902,-0.755833,-4.033,43.434,Ria de Suances,Cantabria,2001,Jan,Animalia,Aves,Podicipedidae,Tachybaptus,Tachybaptus ruficollis,Zampullin comun
9683,Norte,0.124076,-1.132669,-0.237746,0.268741,-0.375814,0.558902,-0.755833,-4.033,43.434,Ria de Suances,Cantabria,2002,Jan,Animalia,Aves,Podicipedidae,Tachybaptus,Tachybaptus ruficollis,Zampullin comun
9684,Norte,0.124076,-1.132669,-0.237746,0.268741,-0.375814,0.558902,-0.755833,-4.033,43.434,Ria de Suances,Cantabria,2003,Jan,Animalia,Aves,Podicipedidae,Podiceps,Podiceps cristatus,Somormujo lavanco


Now, I will clean the dataset to keep only the relevant variables for the cluster analysis

In [20]:
# Columns that are not used by the cluster analysis (place names, dates, higher taxonomy).
unused_columns = [
    "locality",
    "province",
    "year",
    "month",
    "kingdom",
    "class",
    "family",
    "genus",
    "common_name",
]
merged = merged.drop(columns=unused_columns)

In [21]:
# Check which columns remain after the clean-up.
merged.head()

Unnamed: 0,community,min_temp,max_temp,elevation,isothermality,mean_temp,precip,seasonal_temp,long,lat,species
0,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-1.856,37.237,Hemorrhois hippocrepis
1,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.256,36.797,Chamaeleo chamaeleon
2,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.294,36.835,Chamaeleo chamaeleon
3,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.286,36.82,Chamaeleo chamaeleon
4,Andalucia,0.260258,1.550703,-0.299635,0.386286,0.954123,0.985189,0.481245,-2.224,36.763,Chamaeleo chamaeleon


In [22]:
# Write the species + climate table to disk without the row index.
merged.to_csv("sp_climate.csv", index=False)