# TEAM ECOLOGISTS
## OpenCV feature extraction from image files (image paths) and from image-pixel CSV files

### Imports

In [1]:
import cv2
import numpy as np
import scipy
from scipy import spatial
#from scipy.misc import imread
import matplotlib.image as mpimg
import pickle
import random
import os
import matplotlib.pyplot as plt
import pandas as pd

### Feature Extractor Functions
<div style='background-color:#FD7575; padding:3px;'>
    <h3>Don't change</h3>
</div>

In [2]:
# Feature extractor
def extract_features(image_path, is_path=True, vector_size=32):
    """Compute a fixed-length KAZE descriptor vector for one image.

    Parameters
    ----------
    image_path : str or pandas.Series
        When ``is_path`` is true, a file path read with ``mpimg.imread``.
        Otherwise a DataFrame row: its last value is dropped (presumably the
        label column) and the remaining values are reshaped to ``(64, 64, 3)``.
    is_path : bool
        Selects between the two input kinds described above.
    vector_size : int
        Maximum number of keypoints kept (strongest response first).

    Returns
    -------
    numpy.ndarray or None
        A 1-D vector of ``vector_size * 64`` values. It is zero-padded when
        fewer keypoints are found and all zeros when no descriptor could be
        computed. ``None`` is returned when OpenCV raises ``cv2.error``.
    """
    if not is_path:
        print("Reading image from pixel")

        # image_path is a row of a DataFrame here.
        # Change the image shape below if the pixel layout of the csv changes.
        img = image_path.values[:-1].reshape(64, 64, 3)
        # float32 rather than float64: OpenCV's KAZE takes 8-bit, 16-bit or
        # 32-bit float images. NOTE(review): float64 appears to be rejected
        # with cv2.error - confirm against the installed OpenCV version.
        image = img.astype(np.float32) / 255
    else:
        # On macOS, normalise Windows separators first if needed:
        # image_path = image_path.replace("\\", "/")
        image = mpimg.imread(image_path)

    # Each descriptor has 64 values, so the final vector has this many entries.
    needed_size = vector_size * 64

    try:
        # Using KAZE because SIFT, ORB and others were moved to an additional
        # module, which makes installation more painful.
        alg = cv2.KAZE_create(threshold=0.0001)
        # Find the image keypoints.
        kps = alg.detect(image)
        # Keep the `vector_size` strongest keypoints (bigger response is
        # better); the number found varies with image size and colour palette.
        kps = sorted(kps, key=lambda x: -x.response)[:vector_size]
        # Compute the descriptor vectors for the kept keypoints.
        kps, dsc = alg.compute(image, kps)
        if dsc is None:
            # No descriptors at all (e.g. no keypoints were detected): report
            # the image and return an all-zero vector instead of crashing on
            # `None.flatten()`, matching the zero-padding used below.
            print(dsc)
            print(image_path)
            return np.zeros(needed_size)
        # Flatten all descriptors into one big vector - our feature vector.
        dsc = dsc.flatten()
        if dsc.size < needed_size:
            # Fewer than `vector_size` descriptors: pad the end with zeros so
            # every feature vector has the same length.
            dsc = np.concatenate([dsc, np.zeros(needed_size - dsc.size)])
    except cv2.error as e:
        print('Error: ', e)
        return None

    return dsc


#EXTRACTOR FROM PATHS FROM CSV
def feature_extractor_from_path_in_csv(dataframe, directory_path, path_string_name, label_string_name, pickled_db_path="features_from_imagepaths_csv.pck"):
    """Extract features for every image listed in a DataFrame and pickle them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with one row per image.
    directory_path : str
        Root directory; each image is read from
        ``<directory_path>/<label>_128/<file name>``.
    path_string_name : str
        Name of the column holding the image file names.
    label_string_name : str
        Name of the column holding the labels (must be strings).
    pickled_db_path : str
        Where the ``{"<label>_<position>": feature_vector}`` dict is pickled.
    """
    paths = dataframe[path_string_name].tolist()
    labels = dataframe[label_string_name].tolist()

    result = {}
    for index, (file_name, label) in enumerate(zip(paths, labels)):
        f = os.path.join(directory_path, label + '_128/' + file_name)
        # Log progress only every 1000 images to keep the output short.
        if index % 1000 == 0:
            print(index)
            print('Extracting features from image: %s label: %s' % (f, label))
        # The position makes the key unique even when labels repeat.
        result[label + '_' + str(index)] = extract_features(f)

    # Save all feature vectors in a pickled file.
    with open(pickled_db_path, 'wb') as fp:
        pickle.dump(result, fp)
    # Report the path actually written, not the default file name.
    print("<------ Conversion completed ------>\n <------ File saved as %s now ------> \n Now convert it into .csv" % pickled_db_path)

def feature_extractor_pixels_in_csv(dataframe, pickled_db_path="features_from_imagepixels_csv.pck"):
    """Extract features from a DataFrame of raw pixel rows and pickle them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One image per row, with a string column named ``'label'``. Each row is
        passed to ``extract_features(row, is_path=False)``.
    pickled_db_path : str
        Where the ``{"<label>_<index>": feature_vector}`` dict is pickled.
    """
    result = {}
    for index, row in dataframe.iterrows():
        # Take the label from the row itself: `index` is the DataFrame index
        # label, which is not a valid list position once the frame has been
        # filtered, shuffled or re-indexed.
        label = row['label']
        print('Extracting features from image: %s label: %s' % (index, label))
        result[label + '_' + str(index)] = extract_features(row, is_path=False)

    # Save all feature vectors in a pickled file.
    with open(pickled_db_path, 'wb') as fp:
        pickle.dump(result, fp)
    # Report the path actually written, not the default file name.
    print("<------ Conversion completed ------>\n <------ File saved as %s now ------> \n Now convert it into .csv" % pickled_db_path)


        
        
#BATCH EXTRACTOR FROM DIRECTORY
def batch_extractor(images_path, pickled_db_path="features.pck"):
    """Extract features from every file in a directory and pickle them.

    Parameters
    ----------
    images_path : str
        Directory whose files are processed in sorted name order.
    pickled_db_path : str
        Where the ``{lower-cased file name: feature_vector}`` dict is pickled.
    """
    candidates = [os.path.join(images_path, p) for p in sorted(os.listdir(images_path))]
    # Only regular files can be images; skip sub-directories rather than
    # handing them to the image reader.
    files = [f for f in candidates if os.path.isfile(f)]

    result = {}
    for f in files:
        print('Extracting features from image %s' % f)
        # basename works with any path separator, unlike splitting on '/'.
        name = os.path.basename(f).lower()
        result[name] = extract_features(f)

    # Save all feature vectors in a pickled file.
    with open(pickled_db_path, 'wb') as fp:
        pickle.dump(result, fp)
    print("Conversion completed - now convert in .csv")

### Matcher Class

<div style='background-color:#FD7575; padding:3px;'>
    <h3>Don't change</h3>
</div>

In [3]:
class Matcher(object):
    """Nearest-neighbour image lookup over a pickled feature database.

    The database is a dict mapping a name to a feature vector, as written by
    the extractor functions in this notebook.
    """

    def __init__(self, pickled_db_path="features.pck"):
        """Load the pickled database and build the name array and matrix.

        Entries whose vector is ``None`` (a failed extraction) are skipped so
        that ``self.matrix`` stays a regular 2-D numeric array. ``self.data``
        still holds the unfiltered dict.
        """
        # SECURITY: pickle.load can execute arbitrary code; only open
        # feature files that were produced by trusted code.
        with open(pickled_db_path, 'rb') as fp:
            self.data = pickle.load(fp)

        names = []
        vectors = []
        for k, v in self.data.items():
            if v is None:
                continue
            names.append(k)
            vectors.append(v)
        self.matrix = np.array(vectors)
        self.names = np.array(names)

    def cos_cdist(self, vector):
        """Return the cosine distance from ``vector`` to every database row."""
        v = vector.reshape(1, -1)
        return scipy.spatial.distance.cdist(self.matrix, v, 'cosine').reshape(-1)

    def match(self, image_path, topn=5):
        """Find the ``topn`` database entries closest to the query image.

        Returns
        -------
        (list, list)
            Names of the nearest entries and their cosine distances, both
            ordered from closest to farthest.

        Raises
        ------
        ValueError
            If no features could be extracted from ``image_path``.
        """
        features = extract_features(image_path)
        if features is None:
            raise ValueError('Could not extract features from image: %s' % (image_path,))
        img_distances = self.cos_cdist(features)
        # Indices of the `topn` smallest distances.
        nearest_ids = np.argsort(img_distances)[:topn].tolist()
        nearest_img_paths = self.names[nearest_ids].tolist()

        return nearest_img_paths, img_distances[nearest_ids].tolist()

### Show and Run Functions

<div style='background-color:#FD7575; padding:3px;'>
    <h3>Don't change</h3>
</div>

<br>

<div style='background-color:#ffe342; padding:3px;'>
    <h5>Use <code>run_batchFeatureExtractor</code> for a whole directory. You have to provide a directory path and all features will be extracted from all files in that directory</h5>
</div>

<br>

<div style='background-color:#ffe342; padding:3px;'>
    <h5>Use <code>run_featureExtractor_from_paths_in_csv</code> for a csv file with image paths as a column. Don't forget to set the name of the csv column that contains the path, the name of the column that contains the label, and the directory path (as a string) where the images are stored.</h5>
</div>

<br>

<div style='background-color:#ffe342; padding:3px;'>
    <h5>Use <code>run_featureExtractor_from_csv_with_images_pixels_as_features</code> for a csv file with image pixels in columns and the last column represents the label of the example with the name 'label'</h5>
</div>

In [4]:
def run_batchFeatureExtractor():
    """Run ``batch_extractor`` on the hard-coded image directory."""
    # Edit this literal to point at the directory of images to process.
    batch_extractor('Data/bee1/')
    
def run_featureExtractor_from_paths_in_csv():
    """Extract features for the images listed in the hard-coded labels CSV.

    The CSV supplies the file name ('image_name') and label ('label') columns;
    the images are looked up under the hard-coded root directory.
    """
    directory_path = '../DATA/TEST/'
    labels_frame = pd.read_csv('../DATA/TEST/test_labels.csv')
    feature_extractor_from_path_in_csv(
        labels_frame,
        directory_path,
        'image_name',
        'label',
    )
    
def run_featureExtractor_from_csv_with_images_pixels_as_features():
    """Extract features from the hard-coded CSV of raw image pixels."""
    # NOTE(review): nrows=2 reads only the first two rows - this looks like a
    # debugging limit; confirm before relying on the output.
    pixel_frame = pd.read_csv('bee_wasps_museum.csv', nrows=2)
    feature_extractor_pixels_in_csv(pixel_frame)

    
    
#IGNORE FOR NOW
def show_img(path):
    """Read the image file at ``path`` and display it with matplotlib."""
    plt.imshow(mpimg.imread(path))
    plt.show()
def run_matcher():
    """Show up to three random query images with their closest matches.

    Query images are sampled from the hard-coded image directory and matched
    against the features stored in 'features.pck'.
    """
    ### HERE WE HAVE TO INPUT PATH OF IMAGES
    images_path = 'Data/bee1/'
    files = [os.path.join(images_path, p) for p in sorted(os.listdir(images_path))]
    # Take 3 random images, or all of them when the directory holds fewer
    # (random.sample raises if asked for more items than exist).
    sample = random.sample(files, min(3, len(files)))

    ma = Matcher('features.pck')

    for s in sample:
        print('Query image ==========================================')
        show_img(s)
        names, match = ma.match(s, topn=3)
        print('Result images ========================================')
        # Iterate over what was actually returned: the database may contain
        # fewer than 3 entries, so a fixed range(3) could index out of bounds.
        for name, distance in zip(names, match):
            # A smaller cosine distance means more similar vectors, so
            # subtract it from 1 to get a match score.
            print('Match %s' % (1 - distance))
            # NOTE(review): batch_extractor stores lower-cased file names, so
            # this path may not exist on a case-sensitive filesystem - confirm.
            show_img(os.path.join(images_path, name))


**Extracting from CSV with paths**

In [5]:
# Extract features for every image listed in the labels CSV and pickle them;
# progress is printed once every 1000 images.
run_featureExtractor_from_paths_in_csv()

0
Extracting features from image: ../DATA/TEST/insect_128/1447696357891.jpg label: insect
1000
Extracting features from image: ../DATA/TEST/bee_128/240165415_d95cc9da5d_n.jpg label: bee
2000
Extracting features from image: ../DATA/TEST/bee_128/31154610876_bfa82d6968_n.jpg label: bee
3000
Extracting features from image: ../DATA/TEST/other_128/072_0007.jpg label: other
4000
Extracting features from image: ../DATA/TEST/insect_128/42820715035_62106ab5ee_n.jpg label: insect
5000
Extracting features from image: ../DATA/TEST/wasp_128/6545894085_3ba2c05f13_n.jpg label: wasp
6000
Extracting features from image: ../DATA/TEST/other_128/087_0007.jpg label: other
7000
Extracting features from image: ../DATA/TEST/butterfly_128/c2c11410-43e9-47b9-b908-ba2c681fb867.jpg label: butterfly
8000
Extracting features from image: ../DATA/TEST/insect_128/39494538771_8a7f707fda_n.jpg label: insect
9000
Extracting features from image: ../DATA/TEST/insect_128/41715571700_1a9f4d13da_n.jpg label: insect
10000
Extra

**Extracting from CSV with pixels**

In [None]:
# Extract features from the pixel CSV and pickle them (this cell has no
# recorded execution count or output in the saved notebook).
run_featureExtractor_from_csv_with_images_pixels_as_features()

****
****
****
****
****

### Load pickle file (file with openCV features)

In [6]:
# Load the {name: feature_vector} dict written by the path-based extractor.
# A `with` block closes the file handle instead of leaking it.
# SECURITY: pickle.load can execute arbitrary code - only load trusted files.
with open("features_from_imagepaths_csv.pck", 'rb') as fp:
    pickle_data = pickle.load(fp)

In [7]:
# Build one row per image: the feature values followed by the label.
# Keys look like "<label>_<position>", so split on the LAST underscore only;
# splitting on the first one would truncate labels that contain underscores.
# Entries whose vector is None (failed extraction) are skipped because they
# cannot form a full-width row.
features_and_lables = []
for key, value in pickle_data.items():
    if value is None:
        continue
    features_and_lables.append(np.append(value, key.rsplit('_', 1)[0], axis=None))

In [8]:
# Column names for the csv: feat_1 .. feat_2048 followed by the label column.
column = ['feat_' + str(number) for number in range(1, 2049)]
column += ['label']

In [9]:
# Assemble the feature rows into a table with the named columns.
new_dataframe_for_openCV_features = pd.DataFrame(
    data=features_and_lables,
    columns=column,
)

In [10]:
# Preview the first five rows (head() defaults to n=5).
new_dataframe_for_openCV_features.head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_2040,feat_2041,feat_2042,feat_2043,feat_2044,feat_2045,feat_2046,feat_2047,feat_2048,label
0,-0.01360799,0.012686276,0.08979333,0.18310963,-0.059490398,-0.05856323,0.14917304,0.14765403,0.048695706,-0.10776488,...,0.18533805,0.012999573,-0.0986951,0.112543255,0.17864294,0.007801774,-0.04314236,0.06073712,0.10852275,insect
1,-0.12430215,-0.0075551295,0.14839983,0.015258294,0.06451725,-0.008546388,0.13460904,0.011955053,-0.051461782,-0.0029106424,...,0.03564827,-0.040600453,0.0181083,0.0513236,0.037484955,-0.1103688,0.021537693,0.11762852,0.045252196,butterfly
2,0.0024279747,-0.011422375,0.019622715,0.032322697,-0.017721262,-0.010654588,0.038415786,0.042495195,-0.001973614,0.0064269104,...,0.13815497,-0.0044013904,-0.05269962,0.13941616,0.13793239,-0.036049575,0.019900396,0.047929484,0.03682033,butterfly
3,-0.00074665213,-0.045294546,0.012390873,0.08399318,0.0107911015,-0.08767704,0.07473835,0.11837611,-0.032467537,-0.12758476,...,0.100029394,-0.092320055,0.027188428,0.13856795,0.090444595,-0.044537976,0.021177402,0.07438172,0.05527107,other
4,-0.021557711,-0.00021338205,0.023613863,0.017408762,-0.07281118,-0.01835224,0.07281118,0.03089315,-0.01958593,-0.029445702,...,0.18385535,0.022855816,0.0021406466,0.13598335,0.11853185,-0.024105359,0.031832285,0.07637017,0.0672026,bee


**Save as CSV**

In [11]:
# Write the feature table to 'test_opencv.csv' in the current working
# directory, without the index column.
# NOTE(review): a later cell reads '../DATA/TEST/test_opencv.csv' - confirm
# the file is moved there, or align the two paths.
new_dataframe_for_openCV_features.to_csv('test_opencv.csv',index=False)

In [13]:
# Reload the exported test features and count the rows per label.
testDFOpencV = pd.read_csv('../DATA/TEST/test_opencv.csv')
testDFOpencV['label'].value_counts()

butterfly    2500
wasp         2500
bee          2500
insect       2500
other        2500
Name: label, dtype: int64

In [14]:
# Load the exported validation features and count the rows per label.
validDFOpencV = pd.read_csv('../DATA/VALID/valid_opencv.csv')
validDFOpencV['label'].value_counts()

wasp         600
insect       600
bee          600
other        600
butterfly    600
Name: label, dtype: int64