In [197]:
import pyspark as ps
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np


In [198]:
species = pd.read_csv('species.csv')


In [199]:
print(species.head())

   Unnamed: 0               Species Name            Common Name          Type  \
0           0        Abies beshanzuensis            Baishan fir  Plant (Tree)   
1           1            Actinote zikani                      -        Insect   
2           2      Aipysurus foliosquama  Leaf scaled sea-snake       Reptile   
3           3    Amanipodagrion gilliesi         Amani flatwing        Insect   
4           4  Antisolabis seychellensis                      -        Insect   

                                       Location(s)  Estimated Population  \
0              Baishanzu Mountain, Zhejiang, China  5 mature individuals   
1          Near São Paulo, Atlantic forest, Brazil               Unknown   
2        Ashmore Reef and Hibernia Reef, Timor Sea               Unknown   
3  Amani-Sigi Forest, Usambara Mountains, Tanzania     < 500 individuals   
4             Morne Blanc, Mahé island, Seychelles               Unknown   

                                             Threats  
0

In [200]:
# change title for for first cell
species.rename(columns = {'Unnamed: 0':'species_id'}, inplace = True)

In [201]:
#print first row of species
print(species.head(1))

   species_id         Species Name  Common Name          Type  \
0           0  Abies beshanzuensis  Baishan fir  Plant (Tree)   

                           Location(s)  Estimated Population  \
0  Baishanzu Mountain, Zhejiang, China  5 mature individuals   

             Threats  
0  Agriculture, fire  


In [202]:
#number of rows

print(species.shape)

(90, 7)


In [203]:
#identify unique threats 
print(species.Threats.unique())

['Agriculture, fire' 'Habitat loss from human expansion'
 'Unknown—probably degradation of coral reef habitat'
 'Population pressure, water pollution' 'Invasive species, climate change'
 'Recreational facilities, water diversion'
 'Competition, predation by Gambusia, road construction'
 'Hunting, cave disturbance' 'Hydropower development'
 'Agricultural development, energy transmission lines'
 'Illegal collection for international pet trade'
 'Chytridiomycosis, logging, agricultural expansion'
 'Agriculture, hunting, fishing, introduced fish'
 'Climate change, oceanographic changes related to the 1982/1983 El Nino'
 'Over-fishing due to value of swim-bladder in traditional medicine'
 'Illegal export to China' 'Forest clearance, overgrazing, development'
 'Habitat loss, competition with livestock, poaching'
 'Disease from commercial bees, habitat destruction and degradation'
 'Large-scale deforestation and logging'
 'Illegal logging of mangrove forests for firewood and construction, hun

In [204]:
# remove rows with threats as nan

species = species.dropna(subset=['Threats'])

In [205]:
print(species.shape)

(89, 7)


In [206]:
# gather all the unique regions
locations = species['Location(s)']

# create a list of all the unique regions
locations_list = []
for location in locations:
    locations_list.append(location.split(', '))



In [207]:
print(locations_list)

[['Baishanzu Mountain', 'Zhejiang', 'China'], ['Near São Paulo', 'Atlantic forest', 'Brazil'], ['Ashmore Reef and Hibernia Reef', 'Timor Sea'], ['Amani-Sigi Forest', 'Usambara Mountains', 'Tanzania'], ['Morne Blanc', 'Mahé island', 'Seychelles'], ['Chapado do Araripe', 'South Ceará', 'Brazil'], ['South-eastern shore of former Lake Aci', 'Turkey'], ['Luplupwintern Cave', 'Western Province', 'Papua NG'], ['Bhutan', 'North East India', 'Myanmar'], ['Rajasthan', 'Gujarat', 'Maharashtra', 'India'], ['Baly Bay region', 'northwestern Madagascar'], ['Azuay', 'Cañar', 'Guyas provinces', 'Ecuador'], ['Volcanic lakes north of Bealanana', 'Madagascar'], ['Unknown'], ['Chinese coast from Yangtze River', 'China to Hong Kong'], ['Bangladesh', 'Cambodia', 'India', 'Indonesia', 'Malaysia'], ['Budini and Lafeti Khola', 'Bhutan'], ['South-east Kenya', 'possibly south-west Somalia'], ['Oregon and California'], ['Atlantic forest', 'south-eastern Brazil'], ['Isla Escudo de Veraguas', 'Panama'], ['Pool on Ga

In [208]:
#shape of locations_list
print(len(locations_list))

89


In [209]:
# append it to the species dataframe new coulumn
species = species.copy()
species['location_list'] = locations_list


In [210]:
species.head()

Unnamed: 0,species_id,Species Name,Common Name,Type,Location(s),Estimated Population,Threats,location_list
0,0,Abies beshanzuensis,Baishan fir,Plant (Tree),"Baishanzu Mountain, Zhejiang, China",5 mature individuals,"Agriculture, fire","[Baishanzu Mountain, Zhejiang, China]"
1,1,Actinote zikani,-,Insect,"Near São Paulo, Atlantic forest, Brazil",Unknown,Habitat loss from human expansion,"[Near São Paulo, Atlantic forest, Brazil]"
2,2,Aipysurus foliosquama,Leaf scaled sea-snake,Reptile,"Ashmore Reef and Hibernia Reef, Timor Sea",Unknown,Unknown—probably degradation of coral reef hab...,"[Ashmore Reef and Hibernia Reef, Timor Sea]"
3,3,Amanipodagrion gilliesi,Amani flatwing,Insect,"Amani-Sigi Forest, Usambara Mountains, Tanzania",< 500 individuals,"Population pressure, water pollution","[Amani-Sigi Forest, Usambara Mountains, Tanzania]"
4,4,Antisolabis seychellensis,-,Insect,"Morne Blanc, Mahé island, Seychelles",Unknown,"Invasive species, climate change","[Morne Blanc, Mahé island, Seychelles]"


In [211]:
# show me the lsit of locations with 1 item in the list
for location in locations_list:
    if len(location) == 1:
        print(location)


['Unknown']
['Oregon and California']
["Côte d'Ivoire"]
['Mauritius']
['Coastal tropical and subtropical waters of Indo-Pacific and Atlantic Oceans']
['Mauritius']
['Northeastern Vietnam']


In [212]:
#drop unknown locations

species = species[species['Location(s)'] != 'Unknown']

In [213]:
for location in locations_list:
    if len(location) == 2:
        print(location)

['Ashmore Reef and Hibernia Reef', 'Timor Sea']
['South-eastern shore of former Lake Aci', 'Turkey']
['Baly Bay region', 'northwestern Madagascar']
['Volcanic lakes north of Bealanana', 'Madagascar']
['Chinese coast from Yangtze River', 'China to Hong Kong']
['Budini and Lafeti Khola', 'Bhutan']
['South-east Kenya', 'possibly south-west Somalia']
['Atlantic forest', 'south-eastern Brazil']
['Isla Escudo de Veraguas', 'Panama']
['Pool on Gavdos', 'Greece']
["Anosibe An'Ala region", 'eastern Madagascar']
['Two small caves on Silhouette and Mahé', 'Seychelles']
['Pembrokeshire', 'United Kingdom']
['Hellshire Hills', 'Jamaica']
['Deciduous forest', 'East Madagascar']
['Kasyoha-Kitomi Forest Reserve', 'Uganda']
['Hula Valley', 'Israel']
['Grand Bassin', 'Mauritius']
['Massif de la Hotte', 'Haiti']
['Pta Molles and Pichidungui', 'Chile']
['Namatimbili-Ngarama Forest', 'Tanzania']
['Witu Forest Reserve', 'Kenya']
['Lake Ohrid', 'Macedonia']
['Liben Plains', 'southern Ethiopia']
['Kalalau Vall

In [214]:
print(species.shape)

(88, 8)


In [215]:
# what are all the different lengths of the location list
lengths = []
for location in locations_list:
    lengths.append(len(location))

print(set(lengths))

{1, 2, 3, 4, 5, 6}


In [216]:
# for each item in the list in location_list, create a column region, city, country
regions = []
cities = []
countries = []

# handle cases with different lengths of location list
for location in locations_list:
    if len(location) == 2:
        regions.append(location[0])
        cities.append('NA')
        countries.append(location[1])
    elif len(location) == 3:
        regions.append(location[0])
        cities.append(location[1])
        countries.append(location[2])
    elif len(location) == 1:
        regions.append('NA')
        cities.append('NA')
        countries.append(location[0])


In [217]:
# species['region'] = regions
# species['city'] = cities
# species['country'] = countries


In [218]:
species.head()

Unnamed: 0,species_id,Species Name,Common Name,Type,Location(s),Estimated Population,Threats,location_list
0,0,Abies beshanzuensis,Baishan fir,Plant (Tree),"Baishanzu Mountain, Zhejiang, China",5 mature individuals,"Agriculture, fire","[Baishanzu Mountain, Zhejiang, China]"
1,1,Actinote zikani,-,Insect,"Near São Paulo, Atlantic forest, Brazil",Unknown,Habitat loss from human expansion,"[Near São Paulo, Atlantic forest, Brazil]"
2,2,Aipysurus foliosquama,Leaf scaled sea-snake,Reptile,"Ashmore Reef and Hibernia Reef, Timor Sea",Unknown,Unknown—probably degradation of coral reef hab...,"[Ashmore Reef and Hibernia Reef, Timor Sea]"
3,3,Amanipodagrion gilliesi,Amani flatwing,Insect,"Amani-Sigi Forest, Usambara Mountains, Tanzania",< 500 individuals,"Population pressure, water pollution","[Amani-Sigi Forest, Usambara Mountains, Tanzania]"
4,4,Antisolabis seychellensis,-,Insect,"Morne Blanc, Mahé island, Seychelles",Unknown,"Invasive species, climate change","[Morne Blanc, Mahé island, Seychelles]"


In [219]:
trainingSet = pd.read_csv('D:\Woking Office\Calgary Hacks\EndangeredAnimal\LivingPlanetIndex_2024_PublicData\LivingPlanetIndex_2024_PublicData\LPD_2024_public.csv', encoding='ISO-8859-1')


In [220]:
trainingSet.head(0)


Unnamed: 0,ID,Binomial,Replicate,Included in LPR2024,Citation,Class,Order,Family,Genus,Species,...,2013,2014,2015,2016,2017,2018,2019,2020,Native,Unnamed: 102


In [221]:

trainingSet.shape

(35996, 103)

In [222]:
#drop irrelevant columns like citation, data provider, etc
df2 = trainingSet.drop(columns=['Binomial','Replicate','Included in LPR2024','Citation', 'Unnamed: 102'])

In [223]:
df2.shape

(35996, 98)

In [224]:
# is there a title called Common Name
df2['Common_name']

0                                Grey-crowned crane
1                                Seychelles warbler
2                           Seychelles magpie-robin
3                           Seychelles magpie-robin
4                                 Mauritius kestrel
                            ...                    
35991    Mountain gorilla / Eastern lowland gorilla
35992                                Chinook salmon
35993                                European bison
35994                            Northern bald ibis
35995                              Ryukyu scops-owl
Name: Common_name, Length: 35996, dtype: object

In [225]:
df2 = df2.drop(columns=['Common_name', 'Subspecies'])

In [226]:
df2.shape

df2.drop

<bound method DataFrame.drop of             ID        Class           Order             Family         Genus  \
0            1         Aves      Gruiformes            Gruidae     Balearica   
1            2         Aves   Passeriformes     Acrocephalidae  Acrocephalus   
2            3         Aves   Passeriformes       Muscicapidae     Copsychus   
3            4         Aves   Passeriformes       Muscicapidae     Copsychus   
4            5         Aves   Falconiformes         Falconidae         Falco   
...        ...          ...             ...                ...           ...   
35991  1000003     Mammalia        Primates          Hominidae       Gorilla   
35992  1000004  Actinopteri   Salmoniformes         Salmonidae  Oncorhynchus   
35993  1000005     Mammalia    Artiodactyla            Bovidae         Bison   
35994  1000006         Aves  Pelecaniformes  Threskiornithidae    Geronticus   
35995  1000007         Aves    Strigiformes          Strigidae          Otus   

       

In [227]:
#drop rows with nan values for Class
df2 = df2.dropna(subset=['Class'])

df2.shape

#find column range for years 1950 to 2020
df2.columns.get_loc('1950')


24

In [228]:
df2.columns.get_loc('2020')

94

In [229]:
df2.shape

(35996, 96)

In [233]:
#interprolate the missing values for columns 1950 to 2020
df2
df2.iloc[:, 24:95] = df2.iloc[:, 24:95].interpolate(method='linear', axis=1)

In [234]:
#index of columns for 1950 to 2020

df2.head()


Unnamed: 0,ID,Class,Order,Family,Genus,Species,Location,Country,All_countries,Region,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Native
0,1,Aves,Gruiformes,Gruidae,Balearica,regulorum,South-western Uganda,Uganda,Uganda,Africa,...,10.8,10.8,10.8,10.8,10.8,10.8,10.8,10.8,10.8,1
1,2,Aves,Passeriformes,Acrocephalidae,Acrocephalus,sechellensis,"Cousin Island, Seychelles",Seychelles,Seychelles,Africa,...,430.0,430.0,430.0,430.0,430.0,430.0,430.0,430.0,430.0,1
2,3,Aves,Passeriformes,Muscicapidae,Copsychus,sechellarum,Seychelles,Seychelles,Seychelles,Africa,...,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,1
3,4,Aves,Passeriformes,Muscicapidae,Copsychus,sechellarum,"Fregate Island, Seychelles",Seychelles,Seychelles,Africa,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,1
4,5,Aves,Falconiformes,Falconidae,Falco,punctatus,"Western Population, Mauritius",Mauritius,Mauritius,Africa,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,1
