In this notebook we update the csv used for defining the training set with:
* we remove the Monkeys from Compressed images folder as they are a mixture of Gorillas, Chimps etc.
* we incorporate the extra_data from Robbie: the ~1200 Chimps and Gorillas (warning 3 images are labelled as both - so we include them in neither category

In [22]:
import pandas as pd
from pathlib import Path
import shutil

In [15]:
path_chimps = Path("gabon_extra_data/chimpanzee/")
path_gorillas = Path("gabon_extra_data/gorilla/")

In [3]:
train_df = pd.read_csv("gabon_wildlife_training/inspect_data_split_validation/train_valid_df.csv")
train_df

Unnamed: 0,uniqueName,species,main_folder,is_valid,sub_folder
0,0000005.jpg,Human,Compressed Camera Trap Images,False,T33
1,0000007.jpg,Human,Compressed Camera Trap Images,False,T33
2,0000009.jpg,Human,Compressed Camera Trap Images,False,T33
3,0000013.jpg,Human,Compressed Camera Trap Images,False,T33
4,0000015.jpg,Human,Compressed Camera Trap Images,False,T33
...,...,...,...,...,...
347115,1529566.jpg,Duiker_Red,allData,False,NouabaleTEAM
347116,1529571.jpg,Duiker_Red,allData,False,NouabaleTEAM
347117,1529572.jpg,Duiker_Red,allData,False,NouabaleTEAM
347118,1529575.jpg,Duiker_Red,allData,False,NouabaleTEAM


In [13]:
new_chimps = [file.name for file in path_chimps.glob("*")]
len(new_chimps)

987

In [16]:
new_gorillas = [file.name for file in path_gorillas.glob("*")]
len(new_gorillas)

232

In [18]:
set(train_df.uniqueName).intersection(set(new_chimps))

set()

In [19]:
set(train_df.uniqueName).intersection(set(new_gorillas))

set()

In [23]:
for name in set(new_chimps).intersection(set(new_gorillas)):
    print(name)
    shutil.copyfile(path_chimps / name, name)

I0337068.JPG
J0460215.JPG
F0604107.JPG


**NOTE:** So those three images are labelled both as a Gorilla and as a Chimpanzee - not addind them to the csv, but will add them to the disks.

In [24]:
mislabels = list(set(new_chimps).intersection(set(new_gorillas)))
mislabels

['I0337068.JPG', 'J0460215.JPG', 'F0604107.JPG']

In [25]:
unique_new_gorillas = [file for file in new_gorillas if file not in mislabels]
unique_new_chimps = [file for file in new_chimps if file not in mislabels]
print(len(unique_new_chimps))
print(len(unique_new_gorillas))

984
229


In [26]:
train_df.head()

Unnamed: 0,uniqueName,species,main_folder,is_valid,sub_folder
0,0000005.jpg,Human,Compressed Camera Trap Images,False,T33
1,0000007.jpg,Human,Compressed Camera Trap Images,False,T33
2,0000009.jpg,Human,Compressed Camera Trap Images,False,T33
3,0000013.jpg,Human,Compressed Camera Trap Images,False,T33
4,0000015.jpg,Human,Compressed Camera Trap Images,False,T33


In [33]:
chimp_df = pd.DataFrame(unique_new_chimps, columns=["uniqueName"])
chimp_df["species"] = "Chimpanzee"
chimp_df["main_folder"] = "extra_data"
chimp_df["sub_folder"] = "not_specified"
chimp_df["is_valid"] = False

chimp_df

Unnamed: 0,uniqueName,species,main_folder,sub_folder,is_valid
0,D0526142.JPG,Chimpanzee,extra_data,not_specified,False
1,H1848415.JPG,Chimpanzee,extra_data,not_specified,False
2,J1445261.JPG,Chimpanzee,extra_data,not_specified,False
3,I1048731.JPG,Chimpanzee,extra_data,not_specified,False
4,J3141639.JPG,Chimpanzee,extra_data,not_specified,False
...,...,...,...,...,...
979,I1759174.JPG,Chimpanzee,extra_data,not_specified,False
980,J0736165.JPG,Chimpanzee,extra_data,not_specified,False
981,C0831269.JPG,Chimpanzee,extra_data,not_specified,False
982,B2562697.JPG,Chimpanzee,extra_data,not_specified,False


In [34]:
gorilla_df = pd.DataFrame(unique_new_gorillas, columns=["uniqueName"])
gorilla_df["species"] = "Gorilla"
gorilla_df["main_folder"] = "extra_data"
gorilla_df["sub_folder"] = "not_specified"
gorilla_df["is_valid"] = False

gorilla_df

Unnamed: 0,uniqueName,species,main_folder,sub_folder,is_valid
0,J3149667.JPG,Gorilla,extra_data,not_specified,False
1,G1658885.JPG,Gorilla,extra_data,not_specified,False
2,L0454020.JPG,Gorilla,extra_data,not_specified,False
3,H1234108.JPG,Gorilla,extra_data,not_specified,False
4,D2278316.JPG,Gorilla,extra_data,not_specified,False
...,...,...,...,...,...
224,K0758088.JPG,Gorilla,extra_data,not_specified,False
225,F2451519.JPG,Gorilla,extra_data,not_specified,False
226,J2559677.JPG,Gorilla,extra_data,not_specified,False
227,I2831434.JPG,Gorilla,extra_data,not_specified,False


In [28]:
train_df.species.value_counts()

Human                     35000
Duiker_Blue               35000
Elephant_African          35000
Blank                     35000
Duiker_Red                35000
Rat_Giant                 34227
Hog_Red_River             26069
Duiker_Yellow_Backed      15436
Porcupine_Brush_Tailed    13694
Guineafowl_Black          12055
Squirrel                  11349
Mandrillus                 9305
Bird                       6649
Buffalo_African            6459
Mongoose                   5518
Chimpanzee                 5260
Monkey                     5038
Chevrotain_Water           4713
Genet                      3303
Rail_Nkulengu              3269
Leopard_African            2142
Guineafowl_Crested         1513
Mongoose_Black_Footed      1475
Rodent                     1358
Gorilla                    1074
Civet_African_Palm          936
Pangolin                    872
Cat_Golden                  406
Name: species, dtype: int64

# Analyze train_df 

In [44]:
no_compressed_monkey_train_df = train_df[(train_df.main_folder != "Compressed Camera Trap Images") |
                                         ((train_df.main_folder == "Compressed Camera Trap Images") &
                                          (train_df.species != "Monkey"))]

# Merge

In [47]:
df = pd.concat([no_compressed_monkey_train_df,chimp_df,gorilla_df], ignore_index=True)
df

Unnamed: 0,uniqueName,species,main_folder,is_valid,sub_folder
0,0000005.jpg,Human,Compressed Camera Trap Images,False,T33
1,0000007.jpg,Human,Compressed Camera Trap Images,False,T33
2,0000009.jpg,Human,Compressed Camera Trap Images,False,T33
3,0000013.jpg,Human,Compressed Camera Trap Images,False,T33
4,0000015.jpg,Human,Compressed Camera Trap Images,False,T33
...,...,...,...,...,...
348149,K0758088.JPG,Gorilla,extra_data,False,not_specified
348150,F2451519.JPG,Gorilla,extra_data,False,not_specified
348151,J2559677.JPG,Gorilla,extra_data,False,not_specified
348152,I2831434.JPG,Gorilla,extra_data,False,not_specified


In [50]:
df.to_csv("gabon_extra_data/train_valid_df_200722.csv", index=False)

In [49]:
len(chimp_df) + len(gorilla_df)

1213