In [8]:
# imports
import wget
import zipfile
import os
import pandas as pd
import collections
import pickle 

In [9]:
# globals
# URL of the IPNI archive (only referenced by the commented-out download code
# in the cells shown here).
dataset_url = 'https://hosted-datasets.gbif.org/datasets/ipni.zip'
# Directory that holds the archive and receives the extracted .tsv.
# NOTE(review): absolute, machine-specific path. It must end with '/' because
# the cells below build file paths by plain string concatenation.
dataset_dir = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/corpus_taxon/'
# Directory where the corpus and duplicates text files are written (same
# trailing-'/' requirement as dataset_dir).
output_dir = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-angeline1/ml-herbarium/corpus/corpus_taxon/'
tsv = "Name.tsv"  # archive member that is extracted and loaded into the DataFrame
zip_file = 'ipni.zip'  # archive file name, looked up inside dataset_dir
corpus_taxon = 'corpus_taxon.txt'  # output: de-duplicated species names, one per line
duplicates = 'duplicates_taxon.txt'  # output: names that occur more than once, one per line

# Download Dataset

In [10]:
# Download is disabled: the extraction cell below expects the archive to be
# present in dataset_dir already. Uncomment these lines to fetch a fresh copy.
# if os.path.exists(dataset_dir + zip_file):
#     os.remove(dataset_dir + zip_file)
# wget.download(dataset_url, out=dataset_dir + zip_file)

# Extract .tsv from .zip

In [11]:
# Replace any previously extracted copy of the .tsv with a fresh one from the
# archive, then load it with every column kept as dtype "object".
tsv_path = dataset_dir + tsv
if os.path.exists(tsv_path):
    os.remove(tsv_path)
with zipfile.ZipFile(dataset_dir + zip_file, 'r') as archive:
    archive.extract(member=tsv, path=dataset_dir)
df = pd.read_csv(tsv_path, sep="\t", dtype="object")

In [12]:
# Preview the first five rows of the loaded table.
df.head()

Unnamed: 0,col:ID,col:basionymID,col:rank,col:scientificName,col:authorship,col:status,col:referenceID,col:publishedInYear,col:publishedInPage,col:publishedInPageLink,col:link,col:remarks
0,77296112-1,,spec.,Acalypha amithii,"Mart.Gord., Fragoso & K.Velasco",,10.11646/phytotaxa.539.3.1,,224,,https://www.ipni.org/n/77296112-1,doi:10.11646/phytotaxa.539.3.1 Etymology: 'The...
1,77297487-1,,subtrib.,subtrib. Lankesteriinae,I.Darbysh. & E.A.Tripp,,10.1002/tax.12600,,140,,https://www.ipni.org/n/77297487-1,doi:10.1002/tax.12600; Type: Lankesteria Lindl.
2,77297486-1,,trib.,trib. Physacantheae,E.A.Tripp & I.Darbysh.,,10.1002/tax.12600,,140,,https://www.ipni.org/n/77297486-1,doi:10.1002/tax.12600; Type: Physacanthus Benth.
3,77297489-1,,subtrib.,subtrib. Tetrameriinae,"T.F.Daniel, Kiel & McDade",,10.1002/tax.12600,,141,,https://www.ipni.org/n/77297489-1,doi:10.1002/tax.12600; Type: Tetramerium Nees.
4,77297488-1,,subtrib.,subtrib. Whitfieldiinae,I.Darbysh. & E.A.Tripp,,10.1002/tax.12600,,140,,https://www.ipni.org/n/77297488-1,doi:10.1002/tax.12600; Type: Whitfieldia Hook.


In [13]:
# Show the rank and the scientific name stored under index label 0.
for column in ("col:rank", "col:scientificName"):
    print(df[column][0])

spec.
Acalypha amithii


# Export Taxon Names to corpus_taxon.txt

In [14]:
# Collect the scientific name of every record whose rank is "spec.", in table
# order. A boolean mask replaces the previous row-by-row Python loop, which did
# two label lookups per row across the whole table; the resulting list is the
# same.
is_species = df["col:rank"] == "spec."
list_taxon = df.loc[is_species, "col:scientificName"].tolist()

In [15]:
# We need to take care of the duplicates in the dataset.
# Keep the first occurrence of each name, in original order. list(set(...))
# produced an arbitrary order that changes between kernel sessions (string
# hashes are randomized), so the exported corpus was not reproducible.
# collections.OrderedDict is used instead of the builtin dict because the name
# `dict` is rebound further down in this notebook.
list_taxon_no_dup = list(collections.OrderedDict.fromkeys(list_taxon))

print(list_taxon[:10])
print(len(list_taxon))
print(len(list_taxon_no_dup))

['Acalypha amithii', 'Acantholimon akaydinii', 'Actinostephanus enpingensis', 'Aethionema gypsicola', 'Agapetes huangiana', 'Agave rosalesii', 'Allium ducissae', 'Allium elaounii', 'Allium heterophyllum', 'Allium parhamii']
954030
870757


In [16]:
# Names that occur more than once among the collected species names.
name_counts = collections.Counter(list_taxon)
dup = [name for name, occurrences in name_counts.items() if occurrences > 1]
print(len(dup))
# Counter keys are unique, so de-duplicating again should not change the size.
print(len(set(dup)))

67669
67669


In [17]:
# Write every duplicated name to the duplicates file, one name per line.
# The `with` statement closes the file on exit.
with open(output_dir + duplicates, "w") as output_file:
    for name in dup:
        output_file.write(name + '\n')

In [18]:
# Count the lines in the duplicates file as a sanity check on the export.
with open(output_dir + duplicates, 'r') as fp:
    x = sum(1 for _ in fp)
print('Total lines:', x)

Total lines: 67669


In [19]:
# Export the taxon names without duplicates and build genus <-> species
# lookup tables from them.


def _index_genus_species(names):
    """Build genus->species and species->genus lookup tables from taxon names.

    Each name is split on whitespace. The first token, lower-cased, is the
    genus; all remaining tokens are concatenated without a separator and
    lower-cased to form the species key.

    Args:
        names: iterable of taxon name strings, each with at least one token.

    Returns:
        A pair ``(possible_species, possible_genus)``: a dictionary mapping
        each genus to the list of its distinct species keys, and a dictionary
        mapping each species key to the list of its distinct genera. Lists are
        in first-seen order.

    Raises:
        IndexError: if a name has no tokens (empty or whitespace only).
    """
    possible_species = {}
    possible_genus = {}
    # A (genus, species) pair is always added to both tables together, so one
    # set of seen pairs replaces the two linear "not in list" scans.
    seen_pairs = set()
    for name in names:
        tokens = name.split()
        genus = tokens[0].lower()
        species = "".join(tokens[1:]).lower()
        if (genus, species) in seen_pairs:
            continue
        seen_pairs.add((genus, species))
        possible_species.setdefault(genus, []).append(species)
        possible_genus.setdefault(species, []).append(genus)
    return possible_species, possible_genus


# One taxon name per line; the `with` statement closes the file on exit.
with open(output_dir + corpus_taxon, "w") as output_file:
    for name in list_taxon_no_dup:
        output_file.write(name + '\n')

dict_possible_species, dict_possible_genus = _index_genus_species(list_taxon_no_dup)

# Create the pickle directory if needed so a fresh checkout does not fail here.
os.makedirs(dataset_dir + 'output', exist_ok=True)
with open(dataset_dir + 'output/possible_species.pkl', 'wb') as f:
    pickle.dump(dict_possible_species, f)
with open(dataset_dir + 'output/possible_genus.pkl', 'wb') as filee:
    pickle.dump(dict_possible_genus, filee)

In [20]:
# Count the lines in the corpus file as a sanity check on the export.
with open(output_dir + corpus_taxon, 'r') as fp:
    x = sum(1 for _ in fp)
print('Total lines:', x)

Total lines: 870757


In [21]:
# Reload the two lookup tables from the pickle files written above.
# NOTE(review): binding the name `dict` shadows the Python builtin for the rest
# of the session. The cells below reference `dict` and `dictt`, so a rename has
# to be applied to all of them together.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook produced.
with open(dataset_dir + 'output/possible_species.pkl', 'rb') as f:
    dict = pickle.load(f)
with open(dataset_dir + 'output/possible_genus.pkl', 'rb') as ff:
    dictt = pickle.load(ff)

In [22]:
# Number of keys in each table: `dict` was loaded from possible_species.pkl
# (keyed by genus) and `dictt` from possible_genus.pkl (keyed by species).
print(len(dict))
print(len(dictt))

32602
176682


In [27]:
# Spot check: genera recorded for the species key "boala".
dictt["boala"]

['diospyros']

In [28]:
# Spot check: species keys recorded under the genus key 'rea'.
dict['rea']

['marginata',
 'macrantha',
 'neriifolia',
 'pinnata',
 'berteroana',
 'pruinata',
 'micrantha',
 'lacerata',
 'longifolia',
 'mollis',
 'leucantha']