In [None]:
import glob
import logging
import os
import warnings
import xml.etree.ElementTree as et

import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer

# Set CUDA_VISIBLE_DEVICES to an empty string for this kernel
# (presumably to keep CUDA-based libraries off the GPUs -- confirm).
os.environ["CUDA_VISIBLE_DEVICES"]=""
# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides pandas/sklearn warnings raised by later cells.
warnings.filterwarnings('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Shell escape: print the output of `nvidia-smi` for the machine running the kernel.
!nvidia-smi

In [None]:
# Table pairing each LifeCLEF 2016 class id with its 2017 class id.
map_df = pd.read_csv(
    '../../../nahouby/Datasets/PlantCLEF/classid_lifeclef2016_to_classid2017.csv',
    sep=';',
)
# {ClassId_lifeclef2016 -> ClassId}; used below to translate the 2016 metadata.
map_dict = map_df.set_index('ClassId_lifeclef2016')['ClassId'].to_dict()

In [None]:
# Number of entries in each column of the 2016 -> 2017 mapping table.
map_df['ClassId_lifeclef2016'].size, map_df['ClassId'].size

## Parsing 2017 train

In [None]:
# Column order assumed by the line-based fallback parser below: the first twelve
# names are matched positionally against the XML lines, the last two are appended.
header = ['FileName', 'Species', 'Origin', 'Author', 'Content', 'Genus', 'Family', 'ObservationId', 'MediaId', 'YearInCLEF', 'LearnTag', 'ClassId', 'image_path', 'subset']

# Assigned but not used by any cell visible in this notebook; kept for compatibility.
xml_paths = []


def parse_metadata_xml(dirpath, filename, subset, fallback_header=tuple(header)):
    """Read one PlantCLEF metadata XML file into a flat dict.

    Parameters
    ----------
    dirpath : str
        Directory that contains the XML file.
    filename : str
        Name of the XML file inside ``dirpath``.
    subset : str
        Value stored under the ``'subset'`` key of the returned record.
    fallback_header : sequence of str
        Keys used, in order, when the file has to be scraped line by line.

    Returns
    -------
    dict
        One entry per child tag of the XML root (tag -> text) plus
        ``'image_path'`` (same directory and stem, ``.jpg`` extension) and
        ``'subset'``.

    Raises
    ------
    Exception
        Whatever the fallback reader raises (e.g. ``OSError`` or
        ``UnicodeDecodeError``) when the file cannot be read at all.
    """
    xml_path = os.path.join(dirpath, filename)
    image_path = os.path.join(dirpath, filename.split('.')[0] + '.jpg')
    try:
        record = {field.tag: field.text for field in et.parse(xml_path).getroot()}
    except Exception:
        # Best-effort fallback for files the XML parser rejects: skip the first
        # two lines, take the text between '>' and '</' on every remaining line
        # and drop the last line (the closing root tag in a well-laid-out file).
        with open(xml_path, "r", encoding="utf8") as xml_file:
            lines = xml_file.readlines()
        values = [line.split('</')[0].split('>')[-1] for line in lines[2:]]
        values = values[:-1]
        values += [image_path, subset]
        return dict(zip(fallback_header, values))
    record['image_path'] = image_path
    record['subset'] = subset
    return record


train_2017_dir = '../../../nahouby/Datasets/PlantCLEF/PlantCLEF2017/train/'
train_2017_records = []
for dirpath, _dirnames, filenames in os.walk(train_2017_dir):
    for filename in filenames:
        # Skip anything that is not an XML file, and names containing '._'.
        if '.xml' in filename and '._' not in filename:
            try:
                train_2017_records.append(parse_metadata_xml(dirpath, filename, '2017-train'))
            except Exception:
                # Both parsers failed: report the file and carry on.
                print(os.path.join(dirpath, filename))

if train_2017_records:
    plantclef2017_train = pd.DataFrame(train_2017_records)
else:
    # Without this guard the ClassId conversion below fails with a bare KeyError.
    print("No metadata XML files found under", train_2017_dir)
    plantclef2017_train = pd.DataFrame(columns=header)
plantclef2017_train['ClassId'] = plantclef2017_train['ClassId'].astype(int)

In [None]:
# Size of the parsed 2017 training metadata.
print("# Classes:", plantclef2017_train['ClassId'].nunique(dropna=False))
print("# Images:", plantclef2017_train.shape[0])

## Parsing 2017 web

In [None]:
# Metadata of the 2017 web images (semicolon-separated CSV).
plantclef2017_web = pd.read_csv(
    '../../../nahouby/Datasets/PlantCLEF/PlantCLEF2017TrainWebVeryFinal.csv',
    delimiter=';',
)
plantclef2017_web.shape[0]

In [None]:
# Top up every class that has fewer than TARGET_IMAGES_PER_CLASS training images
# with web images of the same ClassId, up to the target or until the web images
# for that class run out.
TARGET_IMAGES_PER_CLASS = 50
WEB_SAMPLE_SEED = 777

label_freq = plantclef2017_train['ClassId'].value_counts().sort_values(ascending=False)

samples = []

# Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
for class_id, count in label_freq[label_freq < TARGET_IMAGES_PER_CLASS].items():
    class_df = plantclef2017_web[plantclef2017_web['ClassId'] == class_id]
    # Never request more rows than the class has; a class without web images
    # contributes an empty frame.
    sample_size = int(min(TARGET_IMAGES_PER_CLASS - count, len(class_df)))
    samples.append(class_df.sample(sample_size, random_state=WEB_SAMPLE_SEED))

In [None]:
# NOTE(review): presumably "per-class target -> total number of web images sampled"
# for different thresholds of the top-up cell above -- confirm.
# 10 - 15719
# 15 - 42202
# 20 - 73383
# 25 - 106741
# 50 - 282622

In [None]:
# Stack the per-class web samples into one frame (original row labels are kept).
tmp = pd.concat(samples)

In [None]:
# Location of every sampled web image: <web root>/<ClassId>/<MediaId>.jpg
web_image_root = '/local/nahouby/Datasets/PlantCLEF/PlantCLEF2017/web/'
tmp['image_path'] = [
    web_image_root + str(class_id) + '/' + str(media_id) + '.jpg'
    for class_id, media_id in zip(tmp['ClassId'], tmp['MediaId'])
]

In [None]:
# Peek at the first sampled web rows, including the new image_path column.
tmp.head(5)

In [None]:
# Row labels of sampled web images whose file does not exist on disk.
missing_indexes = [
    row_label
    for row_label, image_path in tqdm.tqdm(tmp['image_path'].items(), total=len(tmp))
    if not os.path.exists(image_path)
]

In [None]:
# Keep only the sampled web rows whose image file was found.
plantclef2017_web = tmp.drop(index=missing_indexes)

## Parsing 2016 test

In [None]:
# Load the 2016 test metadata, translate its class ids to the 2017 numbering and
# keep only the classes that also occur in the 2017 training set.
plantclef2016_test = pd.read_csv('PlantCLEF2016MasterFinalNewDataOnly.csv', delimiter=';')

plantclef2016_test["image_path"] = plantclef2016_test["FileName"].apply(lambda x: '../../../nahouby/Datasets/PlantCLEF/PlantCLEF2016/PlantCLEF2016Test/' + x)
plantclef2016_test = plantclef2016_test.drop(columns=['ImageId2014', 'ObservationId2014'])

# .copy() gives the filtered frame its own data, so rewriting ClassId below is a
# plain column assignment and not a chained assignment on a slice of
# plantclef2016_test.
has_2017_id = plantclef2016_test["ClassId"].isin(map_df['ClassId_lifeclef2016'])
plantclef2016_test_filtered = plantclef2016_test[has_2017_id].copy()
# Every remaining id is a key of map_dict (guaranteed by the isin filter above).
plantclef2016_test_filtered["ClassId"] = plantclef2016_test_filtered["ClassId"].map(map_dict)
plantclef2016_test_filtered = plantclef2016_test_filtered[plantclef2016_test_filtered["ClassId"].isin(plantclef2017_train['ClassId'].unique())]

print(len(plantclef2016_test), len(plantclef2016_test_filtered))
print("# Classes:", len(plantclef2016_test_filtered['ClassId'].unique()))
print("# Images:", len(plantclef2016_test_filtered))

## Parsing 2016 train

In [None]:
# Load the 2016 training metadata (PlantCLEF2015 master file), translate its class
# ids to the 2017 numbering and keep only the classes that also occur in the 2017
# training set.
plantclef2016_train = pd.read_csv('PlantCLEF2015MasterFinal.csv', delimiter=';')

# NOTE(review): this prefix starts with '../../' while the other dataset paths in
# the 2016/2017 cells start with '../../../' -- confirm which one is correct.
plantclef2016_train["image_path"] = plantclef2016_train["FileName"].apply(lambda x: '../../nahouby/Datasets/PlantCLEF/PlantCLEF2015/traintest/' + x)
plantclef2016_train = plantclef2016_train.drop(columns=['ImageId2014', 'ObservationId2014'])

# .copy() gives the filtered frame its own data, so rewriting ClassId below is a
# plain column assignment and not a chained assignment on a slice of
# plantclef2016_train.
has_2017_id = plantclef2016_train["ClassId"].isin(map_df['ClassId_lifeclef2016'])
plantclef2016_train_filtered = plantclef2016_train[has_2017_id].copy()
# Every remaining id is a key of map_dict (guaranteed by the isin filter above).
plantclef2016_train_filtered["ClassId"] = plantclef2016_train_filtered["ClassId"].map(map_dict)
plantclef2016_train_filtered = plantclef2016_train_filtered[plantclef2016_train_filtered["ClassId"].isin(plantclef2017_train['ClassId'].unique())]

print(len(plantclef2016_train), len(plantclef2016_train_filtered))
print("# Classes:", len(plantclef2016_train_filtered['ClassId'].unique()))
print("# Images:", len(plantclef2016_train_filtered))

## Merge 2016 data 

In [None]:
# Combine both filtered 2016 frames into one, renumbering the rows.
plantclef2016 = pd.concat(
    [plantclef2016_test_filtered, plantclef2016_train_filtered],
    ignore_index=True,
    sort=False,
)
print("# Classes:", plantclef2016['ClassId'].nunique(dropna=False))
print("# Images:", plantclef2016.shape[0])

# Align the 2016 column names with the tag names used by the 2017 metadata.
columns_2016_to_2017 = {
    "Year": "YearInCLEF",
    "tags": "Content",
    "origin": "Origin",
    "family": "Family",
    "genus": "Genus",
    "species": "Species",
    "author": "Author",
}
# 'Unnamed: 21' is presumably an empty trailing CSV column -- TODO confirm.
plantclef2016 = plantclef2016.drop(columns=['Unnamed: 21']).rename(columns=columns_2016_to_2017)
plantclef2016.head(1)

In [None]:
# Row counts: 2016 test, 2016 train, 2017 train.
tuple(map(len, (plantclef2016_test_filtered, plantclef2016_train_filtered, plantclef2017_train)))

In [None]:
# Full labelled pool: 2017 training rows followed by the merged 2016 rows.
labelled_frames = [plantclef2017_train, plantclef2016]
all_metadata = pd.concat(labelled_frames, ignore_index=True, sort=False)
all_metadata.head(3)

In [None]:
# Size of the combined labelled pool.
print("# Classes:", all_metadata['ClassId'].nunique(dropna=False))
print("# Images:", all_metadata.shape[0])

## Parsing 2017 test

In [None]:
# Metadata of the PlantCLEF 2017 test set; column names come from the file itself
# because no explicit `names=` list is passed.
plantclef2017_test_metadata = pd.read_csv('PlantCLEF2017OnlyTest.csv', sep=';')

In [None]:
# Location of every 2017 test image: <test root>/<MediaId>.jpg
# The path is built from the MediaId column directly instead of a row-wise
# DataFrame.apply: a row is materialised with one common dtype, so an integer
# MediaId can be upcast to float (giving '123.0.jpg') when the frame is purely
# numeric, and the row-wise call is much slower.
test_2017_image_root = '/local/nahouby/Datasets/PlantCLEF/PlantCLEF2017/test/'
plantclef2017_test_metadata['image_path'] = [
    test_2017_image_root + str(media_id) + '.jpg'
    for media_id in plantclef2017_test_metadata['MediaId']
]

In [None]:
# Translate every ClassId into the notebook's contiguous class_id.
# NOTE(review): in the visible notebook ClassId_2_class_id is only assigned in the
# "Parsing 2018 test" section further down (built from train_metadata_v2), so this
# cell raises NameError on a fresh Restart & Run All -- the 2017 test section
# needs to run after that assignment.
plantclef2017_test_metadata['class_id'] = plantclef2017_test_metadata['ClassId'].apply(
    lambda class_id: ClassId_2_class_id[int(class_id)]
)

In [None]:
# Collect the row labels of test images that cannot be opened by PIL.
KO_indexes = []
for index, image_path in plantclef2017_test_metadata['image_path'].items():
    try:
        # The context manager closes the underlying file right away; a bare
        # Image.open() would leave one open handle per image until garbage
        # collection.
        with Image.open(image_path):
            pass
    except Exception:
        # Missing or unreadable image: report it and mark the row for removal.
        print(image_path)
        KO_indexes.append(index)

In [None]:
# Remove the unreadable images and renumber the rows from zero.
plantclef2017_test_metadata = plantclef2017_test_metadata.drop(index=KO_indexes).reset_index(drop=True)

In [None]:
# Display the cleaned 2017 test metadata.
plantclef2017_test_metadata

In [None]:
# Size of the cleaned 2017 test set.
print("# Classes:", plantclef2017_test_metadata['ClassId'].nunique(dropna=False))
print("# Images:", plantclef2017_test_metadata.shape[0])
print("# Observations:", plantclef2017_test_metadata['ObservationId'].nunique(dropna=False))

In [None]:
# Write the cleaned 2017 test metadata to a CSV file in the working directory.
plantclef2017_test_metadata.to_csv("PlantCLEF2017_test_metadata.csv")

## Metadata

In [None]:
# Bar plot
label_freq = all_metadata['Species'].value_counts().sort_values(ascending=False)

style.use("fivethirtyeight")
plt.figure(figsize=(24,100))
sns.barplot(y=label_freq.index.values, x=label_freq, order=label_freq.index)
plt.title("Label frequency", fontsize=12)
plt.xlabel("")
plt.xticks(fontsize=6)
plt.yticks(fontsize=6)
plt.xlim(0, 25)
plt.show()
print("Number of Species in dataset: {}\n".format(len(label_freq)))

In [None]:
# Bar plot
label_freq = all_metadata['Genus'].value_counts().sort_values(ascending=False)

style.use("fivethirtyeight")
plt.figure(figsize=(24,20))
sns.barplot(y=label_freq.index.values, x=label_freq, order=label_freq.index)
plt.title("Label frequency", fontsize=12)
plt.xlabel("")
plt.xticks(fontsize=6)
plt.yticks(fontsize=6)
plt.show()
print("Number of Genus in dataset: {}\n".format(len(label_freq)))

In [None]:
# Bar plot
label_freq = all_metadata['Family'].value_counts().sort_values(ascending=False)

style.use("fivethirtyeight")
plt.figure(figsize=(24,20))
sns.barplot(y=label_freq.index.values, x=label_freq, order=label_freq.index)
plt.title("Label frequency", fontsize=12)
plt.xlabel("")
plt.xticks(fontsize=6)
plt.yticks(fontsize=6)
plt.show()
print("Number of Family in dataset: {}\n".format(len(label_freq)))

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Fit one binarizer per taxonomic rank. The cells below only use the fitted
# `classes_` arrays, as the label vocabulary for the integer id columns.
print("\n Species Labels:")
lb_species = LabelBinarizer().fit(np.asarray(all_metadata['Species']))

print("\n Genus Labels:")
lb_genus = LabelBinarizer().fit(np.asarray(all_metadata['Genus']))

print("\n Family Labels:")
lb_family = LabelBinarizer().fit(np.asarray(all_metadata['Family']))

# N_LABELS holds the number of family labels.
# NOTE(review): the species count may have been intended -- confirm.
N_LABELS = len(lb_family.classes_)

In [None]:
# Add integer ids for species, family and genus: the position of each label in
# the corresponding binarizer's `classes_` array.
# A dictionary lookup per column replaces the row-wise apply that scanned
# `classes_` with np.where for every single row.
for id_column, label_column, binarizer in (
    ('class_id', 'Species', lb_species),
    ('family_id', 'Family', lb_family),
    ('genus_id', 'Genus', lb_genus),
):
    label_to_index = {label: position for position, label in enumerate(binarizer.classes_)}
    encoded = all_metadata[label_column].map(label_to_index)
    unknown_labels = all_metadata.loc[encoded.isna(), label_column].unique()
    if len(unknown_labels):
        # Fail loudly instead of storing NaN ids.
        raise KeyError("{} values missing from the fitted classes: {}".format(label_column, list(unknown_labels[:10])))
    all_metadata[id_column] = encoded.astype('int64')

all_metadata.head(4)

In [None]:
# Give the web images the same integer ids as the labelled pool: the position of
# each label in the corresponding binarizer's `classes_` array.
# A dictionary lookup per column replaces the row-wise apply that scanned
# `classes_` with np.where for every single row.
for id_column, label_column, binarizer in (
    ('class_id', 'Species', lb_species),
    ('family_id', 'Family', lb_family),
    ('genus_id', 'Genus', lb_genus),
):
    label_to_index = {label: position for position, label in enumerate(binarizer.classes_)}
    encoded = plantclef2017_web[label_column].map(label_to_index)
    unknown_labels = plantclef2017_web.loc[encoded.isna(), label_column].unique()
    if len(unknown_labels):
        # A web label the binarizer never saw: fail loudly and name the labels.
        raise KeyError("{} values missing from the fitted classes: {}".format(label_column, list(unknown_labels[:10])))
    plantclef2017_web[id_column] = encoded.astype('int64')

plantclef2017_web.head(4)

In [None]:
# 90/10 train/validation split of the labelled pool, stratified by ClassId.
train_metadata, val_metadata = train_test_split(
    all_metadata,
    test_size=0.1,
    random_state=1,
    stratify=all_metadata['ClassId'],
)

In [None]:
# Row counts: train split, validation split, web images.
tuple(map(len, (train_metadata, val_metadata, plantclef2017_web)))

In [None]:
# Classes present in the train split but absent from the validation split.
train_class_ids = set(train_metadata['ClassId'].unique())
val_class_ids = set(val_metadata['ClassId'].unique())
missing_val = train_class_ids.difference(val_class_ids)
len(missing_val)

In [None]:
# Move one training image of every class that has no validation image into the
# validation set.
# The draw is seeded so that the train/validation CSVs are identical on every
# Restart & Run All; without a seed each run picks different images.
MISSING_VAL_SEED = 1

missing_val_samples = (
    train_metadata[train_metadata['ClassId'].isin(missing_val)]
    .groupby('ClassId', as_index=False)
    .sample(1, random_state=MISSING_VAL_SEED)
)
# Row labels of the moved images, so they can be dropped from the train split.
remove_from_train = missing_val_samples.index
all_val_metadata = pd.concat([val_metadata, missing_val_samples], ignore_index=True, sort=False)

In [None]:
# Train split without the rows that were moved to validation. reset_index() is
# called without drop=True, so the previous row labels are kept as a column.
train_metadata_v2 = train_metadata.drop(index=remove_from_train).reset_index()

In [None]:
# Class and image counts, first for the final train split, then for validation.
for split_frame in (train_metadata_v2, all_val_metadata):
    print("# Classes:", split_frame['ClassId'].nunique(dropna=False))
    print("# Images:", split_frame.shape[0])

In [None]:
# Overlap test!
len(set(all_val_metadata['MediaId']) - set(train_metadata_v2['MediaId'])), len(set(train_metadata_v2['MediaId']) - set(all_val_metadata['MediaId']))

In [None]:
# Report the size of the validation split and write it to disk.
print("Final number of image samples:", all_val_metadata.shape[0])

all_val_metadata.to_csv("PlantCLEF2018_val_metadata.csv")

In [None]:
# Report the size of the train split and write it to disk.
print("Final number of image smaples:", train_metadata_v2.shape[0])

train_metadata_v2.to_csv("PlantCLEF2018_train_metadata.csv")

In [None]:
# Report the number of web images and write the web metadata to disk.
print("Final number of web smaples:", len(plantclef2017_web))

# Write the web frame itself: exporting train_metadata_v2 here would make the
# web CSV a copy of the train CSV.
plantclef2017_web.to_csv("PlantCLEF2018_web_metadata.csv")

In [None]:
# Train rows without an image path (expected to be empty).
train_metadata_v2.loc[train_metadata_v2['image_path'].isna()]

In [None]:
# Validation rows without an image path (expected to be empty).
all_val_metadata.loc[all_val_metadata['image_path'].isna()]

## Parsing 2018 test

In [None]:
# Ground truth of the 2018 test set: one row per observation (no header row in
# the file, so the column names are supplied here).
# NOTE(review): this rebinds `map_df`, which the 2016 cells use for the
# 2016 -> 2017 class id table; re-running those cells after this one would pick
# up the wrong frame.
map_df = pd.read_csv(
    '../../nahouby/Datasets/PlantCLEF/PlantCLEF2018/gt_file.csv',
    sep=';',
    names=['ObservationId', 'ClassId', 'set'],
)
map_df

In [None]:
# {ObservationId -> ClassId} taken from the 2018 ground-truth file.
observation_2_ClassId = {
    observation: class_id
    for observation, class_id in zip(map_df.ObservationId, map_df.ClassId)
}
# {ClassId -> class_id} as used in the final train split.
ClassId_2_class_id = {
    class_id: label_index
    for class_id, label_index in zip(train_metadata_v2.ClassId, train_metadata_v2.class_id)
}

In [None]:
# Parse every 2018 test XML file and attach the labels from the ground truth.
# A file that cannot be parsed, or whose observation is unknown, stops the cell.
plantclef2018_test = []
xml_paths = []
for dirpath, _dirnames, filenames in os.walk('../../nahouby/Datasets/PlantCLEF/PlantCLEF2018/test/'):
    for filename in filenames:
        # Skip anything that is not an XML file, and names containing '._'.
        if '.xml' not in filename or '._' in filename:
            continue

        xml_root = et.parse(os.path.join(dirpath, filename)).getroot()
        image_metadata = {field.tag: field.text for field in xml_root}
        observation_id = image_metadata.get('ObservationId')

        image_metadata['image_path'] = os.path.join(dirpath, filename.split('.')[0] + '.jpg')
        # The ground truth is keyed by the integer observation id.
        class_id_2018 = observation_2_ClassId[int(observation_id)]
        image_metadata['ClassId'] = class_id_2018
        image_metadata['class_id'] = ClassId_2_class_id[class_id_2018]
        plantclef2018_test.append(image_metadata)

plantclef2018_test = pd.DataFrame.from_dict(plantclef2018_test)

In [None]:
# Size of the parsed 2018 test set.
print("# Classes:", plantclef2018_test['ClassId'].nunique(dropna=False))
print("# Images:", plantclef2018_test.shape[0])

In [None]:
# Bar plot
label_freq = plantclef2018_test['class_id'].astype(str).value_counts().sort_values(ascending=False)

style.use("fivethirtyeight")
plt.figure(figsize=(24,200))
sns.barplot(y=label_freq.index.values, x=label_freq, order=label_freq.index)
plt.title("Label frequency", fontsize=12)
plt.xlabel("")
plt.xticks(fontsize=24)
plt.yticks(fontsize=12)
#plt.xlim(0, 25)
plt.show()
print("Number of Species in test dataset: {}\n".format(len(label_freq)))

In [None]:
# Write the 2018 test metadata to a CSV file in the working directory.
plantclef2018_test.to_csv("PlantCLEF2018_test_metadata.csv")

## 2018 Train

In [None]:
plantclef2018_train = []
# Every file that sits one directory level below the dataset root
# (<root>/<ClassId>/<file>). glob returns matches in arbitrary order, so the list
# is sorted to give the same row order on every run. `glob` is imported in the
# notebook's import cell.
images = sorted(glob.glob('/mnt/datagrid/plants/plants/PlantCLEF/2017_web_from_tfrecord/*/*.*'))

In [None]:
# {ClassId -> class_id} as used in the final train split.
ClassId_2_class_id = train_metadata_v2.set_index('ClassId')['class_id'].to_dict()

In [None]:
# One record per image file; the parent directory name is the image's ClassId.
# The list is rebuilt from scratch, so re-running this cell does not append
# duplicate records (and it still works after plantclef2018_train has been
# turned into a DataFrame).
train_2018_records = []
for image in images:
    class_dir, img_name = os.path.split(image)
    # Kept as a string, exactly as it appears in the path.
    ClassId = os.path.basename(class_dir)

    train_2018_records.append({'img_name': img_name,
                               'ClassId': ClassId,
                               'class_id': ClassId_2_class_id[int(ClassId)],
                               'image_path': image})

plantclef2018_train = train_2018_records

In [None]:
# Turn the records into a frame; show the number of distinct classes and of images.
plantclef2018_train = pd.DataFrame(plantclef2018_train)
plantclef2018_train.class_id.nunique(dropna=False), plantclef2018_train.shape[0]

In [None]:
# Classes of the labelled pool with fewer than 20 images.
label_freq = all_metadata['class_id'].value_counts().sort_values(ascending=False)

lbls20 = label_freq.loc[label_freq < 20]
print(lbls20.size)

In [None]:
# Distinct class ids among the globbed images, and how many there are.
train_2018_class_ids = plantclef2018_train.class_id.unique()
len(train_2018_class_ids), train_2018_class_ids

In [None]:
# Keep only the images whose class is one of the low-count classes in lbls20.
in_low_count_class = plantclef2018_train['class_id'].isin(lbls20.index)
plantclef2018_train_subset = plantclef2018_train.loc[in_low_count_class]

print(plantclef2018_train_subset.shape[0])

In [None]:
# Number of distinct classes in the subset.
print(plantclef2018_train_subset.class_id.nunique(dropna=False))

In [None]:
# Bar plot
label_freq = plantclef2018_train_subset['class_id'].astype(str).value_counts().sort_values(ascending=False)

style.use("fivethirtyeight")
plt.figure(figsize=(24,200))
sns.barplot(y=label_freq.index.values, x=label_freq, order=label_freq.index)
plt.title("Label frequency", fontsize=12)
plt.xlabel("")
plt.xticks(fontsize=24)
plt.yticks(fontsize=12)
#plt.xlim(0, 25)
plt.show()
print("Number of Species in test dataset: {}\n".format(len(label_freq)))

In [None]:
# Validation rows followed by train rows, renumbered from zero.
metadata_all = pd.concat(
    [all_val_metadata, train_metadata_v2],
    ignore_index=True,
    sort=False,
)

In [None]:
# Display the frame built from the globbed image files.
plantclef2018_train

In [None]:
# Look up a single test image. MediaId holds the XML tag text, i.e. strings,
# hence the string comparison.
plantclef2018_test.loc[plantclef2018_test['MediaId'] == '2876963']