In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense, Conv2D, MaxPool2D, Dropout, Flatten
from keras.losses import categorical_crossentropy
from keras.metrics import Precision, Recall
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator


In [2]:
# Training metadata and its labels are loaded as separate frames; they are
# joined on 'id' further down.
train_features = pd.read_csv('./data/train_features.csv')
train_labels = pd.read_csv('./data/train_labels.csv')

# Metadata for the unlabeled test images.
test_features = pd.read_csv('./data/test_features.csv')

In [3]:
# Display the training metadata frame (id, filepath, site); pandas truncates
# the middle rows in the rendered output.
train_features

Unnamed: 0,id,filepath,site
0,ZJ000000,train_features/ZJ000000.jpg,S0120
1,ZJ000001,train_features/ZJ000001.jpg,S0069
2,ZJ000002,train_features/ZJ000002.jpg,S0009
3,ZJ000003,train_features/ZJ000003.jpg,S0008
4,ZJ000004,train_features/ZJ000004.jpg,S0036
...,...,...,...
16483,ZJ016483,train_features/ZJ016483.jpg,S0093
16484,ZJ016484,train_features/ZJ016484.jpg,S0043
16485,ZJ016485,train_features/ZJ016485.jpg,S0089
16486,ZJ016486,train_features/ZJ016486.jpg,S0095


In [4]:
def merge_animals(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse one-hot label columns into a single 1-based class code.

    Each row of ``df`` must have exactly one non-zero entry. The code assigned
    to a row is the 1-based position of that entry's column, so the mapping
    follows the column order of ``df``. For the column order used in this
    notebook:

        antelope_duiker=1, bird=2, blank=3, civet_genet=4,
        hog=5, leopard=6, monkey_prosimian=7, rodent=8

    Args:
        df (pd.DataFrame): One-hot encoded label columns. Not modified.

    Returns:
        pd.DataFrame: A frame with the same index as ``df`` and a single
        integer column ``'animal_classification'``.

    Raises:
        ValueError: If any row does not have exactly one non-zero entry.
    """
    flags = df.to_numpy() != 0
    hits_per_row = flags.sum(axis=1)

    # Collecting the column index of every non-zero cell only lines up with the
    # rows when each row contributes exactly one hit; otherwise labels would be
    # shifted onto the wrong rows, so reject such input explicitly.
    invalid_rows = int((hits_per_row != 1).sum())
    if invalid_rows:
        raise ValueError(
            f'{invalid_rows} row(s) do not have exactly one non-zero label column'
        )

    if flags.size:
        codes = flags.argmax(axis=1) + 1  # column position -> 1-based class code
    else:
        codes = np.empty(0, dtype=np.intp)  # no rows to classify

    return pd.DataFrame({'animal_classification': codes}, index=df.index)

In [5]:
# Join the image metadata with its labels; both frames share the 'id' key.
train = pd.merge(left=train_features, right=train_labels, on='id')

# Collapse the one-hot label columns into a numeric code (see merge_animals in
# the cell above). The one-hot columns themselves stay in `train`.
label_columns = ['antelope_duiker', 'bird', 'blank', 'civet_genet', 'hog', 'leopard', 'monkey_prosimian', 'rodent']
train['animal_classification'] = merge_animals(train[label_columns])

# Translate the numeric codes (1-8, in label_columns order) back to class names.
code_to_name = dict(enumerate(label_columns, start=1))
train['animal_classification'] = train['animal_classification'].map(code_to_name)

# Split 'filepath' on '/' into its folder part and the bare file name.
temp = train['filepath'].str.split(pat='/', expand=True)
temp = temp.rename(columns={0: 'old_folder_location', 1: 'filename'})

# Attach the split columns and drop the now-redundant 'filepath'.
train = pd.concat([train, temp], axis=1).drop(columns=['filepath'])


In [6]:
# Compare the class balance of candidate sites to find well-balanced ones.
for site_id in ('S0060', 'S0009'):
    site_labels = train.loc[train['site'] == site_id, 'animal_classification']
    print(site_labels.value_counts())

civet_genet         959
antelope_duiker      59
monkey_prosimian     49
blank                23
rodent               22
hog                  20
Name: animal_classification, dtype: int64
monkey_prosimian    190
hog                 188
bird                155
blank                57
antelope_duiker      33
rodent               25
civet_genet          16
Name: animal_classification, dtype: int64


In [7]:
# Hold out every image from four camera sites as the validation set, so that
# validation images come from sites the model never sees during training.
validation_sites = ['S0009', 'S0043', 'S0059', 'S0026']
in_validation = train['site'].isin(validation_sites)

validation_set = train[in_validation]
# Use the complement of the same mask for the training set. The earlier
# `train[~train.isin(validation_set)].dropna()` form also discarded any
# training row containing a NaN and upcast integer columns to float.
train_set = train[~in_validation]

In [10]:
# Share of images per site: validation split first, then training split.
for split in (validation_set, train_set):
    print(split['site'].value_counts(normalize=True))

S0009    0.402424
S0043    0.269091
S0059    0.265455
S0026    0.063030
Name: site, dtype: float64
S0060    0.076291
S0063    0.037539
S0008    0.036460
S0036    0.030732
S0038    0.028912
           ...   
S0143    0.000202
S0078    0.000135
S0079    0.000135
S0178    0.000135
S0102    0.000067
Name: site, Length: 144, dtype: float64


In [11]:
# Image generators for the training and validation splits.
train_path = './data/train_features_img'

# Pass the class list explicitly so both generators use the same
# label -> index mapping even if one split happens to lack a class.
# The order matches the alphabetical order Keras would infer on its own.
class_names = ['antelope_duiker', 'bird', 'blank', 'civet_genet', 'hog', 'leopard', 'monkey_prosimian', 'rodent']

# NOTE(review): pixel values are fed to the network unscaled (no `rescale`);
# consider ImageDataGenerator(rescale=1/255). Left unchanged here so that any
# other generator built from this setup stays consistent.
img_gen = ImageDataGenerator()

# Settings shared by both splits.
flow_args = dict(
    directory=train_path,
    x_col='filename',
    y_col='animal_classification',
    classes=class_names,
    target_size=(256, 256),
    class_mode='categorical',
    batch_size=64,
)

# Validation batches keep dataframe order so predictions can be matched back
# to rows of validation_set; aggregate validation metrics do not depend on order.
val_generator = img_gen.flow_from_dataframe(validation_set, shuffle=False, **flow_args)
train_generator = img_gen.flow_from_dataframe(train_set, **flow_args)

Found 1650 validated image filenames belonging to 8 classes.
Found 14838 validated image filenames belonging to 8 classes.


In [12]:
def _find_history_key(recorded, metric):
    """Return the key of ``recorded`` that holds ``metric``, allowing a ``_<n>`` suffix."""
    if metric in recorded:
        return metric
    prefix = f'{metric}_'
    for key in recorded:
        # Keras can name repeated metric instances 'precision_1', 'recall_2', ...
        # when metrics are re-created in the same session; accept those too.
        if key.startswith(prefix) and key[len(prefix):].isdigit():
            return key
    raise KeyError(f'{metric!r} not found in history; available keys: {sorted(recorded)}')


def plot_metrics(model_fit, metrics=('accuracy', 'precision', 'recall')):
    """Plot per-epoch training and validation curves, one figure per metric.

    Args:
        model_fit: Object with a ``history`` dict mapping metric names to
            per-epoch values (e.g. the return value of ``model.fit``).
        metrics: Base names of the metrics to plot. A name also matches a
            history key carrying a numeric suffix such as ``'precision_1'``.

    Raises:
        KeyError: If a requested metric has no matching training key.
    """
    recorded = model_fit.history
    for metric in metrics:
        train_key = _find_history_key(recorded, metric)
        val_key = f'val_{train_key}'

        plt.plot(recorded[train_key], label='Train')
        # The validation curve exists only when fit() was given validation data.
        if val_key in recorded:
            plt.plot(recorded[val_key], label='Validation')
        plt.ylabel(metric)
        plt.xlabel('Epochs')
        plt.legend()
        plt.show()

In [15]:
# Small baseline CNN: one conv/pool stage followed by a dense classifier head.
model = Sequential([
    Conv2D(filters=16, kernel_size=(3,3), activation='relu'),
    MaxPool2D(pool_size=(2,2)),
    Flatten(),
    Dense(units=100, activation='relu'),
    # Output layer: one unit per animal class. Each image has exactly one
    # label and the loss is categorical cross-entropy, so the outputs must
    # form a probability distribution -> softmax rather than sigmoid.
    Dense(units=8, activation='softmax'),
])

2022-05-15 10:28:39.309714: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
# Name the metrics explicitly so the history keys stay 'recall'/'precision'
# (and 'val_recall'/'val_precision') even when this cell is re-run in the same
# kernel; auto-generated metric names can pick up suffixes such as 'recall_1'.
model.compile(
    loss=categorical_crossentropy,
    optimizer='adam',
    metrics=['accuracy', Recall(name='recall'), Precision(name='precision')],
)

In [None]:
# The generators already yield batches of 64 (set in flow_from_dataframe), and
# Keras documents that `batch_size` must not be passed for generator input,
# so it is not specified here.
history = model.fit(
    train_generator,
    epochs=50,
    validation_data=val_generator
)

In [None]:
# Plot the per-epoch training/validation curves recorded by model.fit above.
plot_metrics(history)