In [1]:
import os, sys
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import shutil
from pathlib import Path

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Loading packages

In [3]:
import sys
from pathlib import Path

here_path = Path().resolve()
repo_path = here_path.parents[1]
sys.path.append(str(repo_path))

In [4]:
from py.utils import verifyDir, verifyFile, verifyType

In [5]:
from py.config import Config

cfg = Config()

np.random.seed(cfg.RANDOM_STATE)
cfg.DATA_PATH, cfg.MODEL_PATH

('/media/felipe/DATA19/datasets/', '/media/felipe/DATA19/models/')

In [6]:
QSCORE_PATH=f"{cfg.DATA_PATH}pp1/Qscores/"
IMAGES_PATH = f"{cfg.DATA_PATH}pp1/images/"
FEATURES_PATH = f"{cfg.MODEL_PATH}pp1/{cfg.YEAR_STUDIED}/features/"

In [7]:
verifyDir(FEATURES_PATH)

### Loading data

In [8]:
%%time
data_df = pd.read_csv(f"{QSCORE_PATH}scores.csv", sep=";", low_memory=False)
data_df["image_path"] = f"{IMAGES_PATH}{cfg.YEAR_STUDIED}/" + data_df["image_path"]
data_df.sort_values(by=[cfg.PERCEPTION_METRIC], ascending=False, inplace=True)
data_df

CPU times: user 11.8 ms, sys: 7.95 ms, total: 19.7 ms
Wall time: 18.3 ms


Unnamed: 0,image_id,safety,safety_err,uniquely,uniquely_err,wealthy,wealthy_err,lat,long,city,Heading,Pitch,image_path,continent,country
1130,4340,8.350865,0.621166,4.528970,0.668607,7.245005,0.577930,42.3354,-71.1407,Boston,85.0,-3.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
891,4196,8.327766,0.556107,5.022737,0.833578,7.498167,0.656081,40.6780,-73.9703,New York,329.0,1.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
1117,4189,8.327241,0.602638,7.108436,0.738665,7.594670,0.634063,40.6948,-73.9712,New York,33.0,11.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
370,3527,8.245561,0.456143,7.760654,0.836348,8.142057,0.693277,40.6795,-73.9731,New York,324.0,3.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
303,4727,8.236001,0.510115,7.601103,0.631219,9.164302,0.627555,42.3417,-71.0792,Boston,105.0,6.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,3520,0.762055,0.460119,2.167670,0.572258,1.766562,0.470312,40.6307,-74.0047,New York,346.0,-1.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
301,4725,0.560845,0.582538,2.764210,0.757413,0.966390,0.523086,42.3812,-71.0890,Boston,15.0,-3.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
2276,4904,0.517857,0.527475,2.454082,0.681900,1.576652,0.381375,42.3888,-71.0872,Boston,8.0,-2.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA
1188,4162,0.465477,0.462276,1.970528,0.661715,1.627882,0.514609,40.7096,-73.9278,New York,28.0,4.0,/media/felipe/DATA19/datasets/pp1/images/2011/...,North America,USA


### Feature Extraction

In [9]:
features_dict = {"image_id": data_df["image_id"].tolist(), 
                 "image_path": data_df["image_path"].tolist(),
                 "gist": [],
                 "sift": [],
                 "fisher": [],
                }

#### GIST

In [10]:
from py.features.gist import extract_gist_features

In [11]:
%%time
gist_features = []
for img_path in tqdm(features_dict["image_path"]):
    desc = extract_gist_features(img_path)
    if desc is not None:
        gist_features.append(desc)
    else:
        gist_features.append(None)
        print(f"Not Extracted {len(desc)} descriptors from {img_path}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2456/2456 [04:14<00:00,  9.64it/s]

CPU times: user 4min 14s, sys: 338 ms, total: 4min 15s
Wall time: 4min 14s





In [12]:
features_dict["gist"] = np.array(gist_features).tolist()

In [13]:
print(f"GIST shape: {gist_features[0].shape}")
print(f"GIST norm: {np.linalg.norm(gist_features):.4f}")

GIST shape: (256,)
GIST norm: 0.6049


#### SIFT

In [14]:
from py.features.sift import extract_sift_descriptors

In [15]:
%%time
sift_descriptors = []
for img_path in tqdm(features_dict["image_path"]):
    desc = extract_sift_descriptors(img_path)
    if desc is not None:
        sift_descriptors.append(desc)
        #print(f"Extracted {len(desc)} descriptors from {img_path}")
    else:
        sift_descriptors.append(None)
        print(f"Not Extracted {len(desc)} descriptors from {img_path}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2456/2456 [00:18<00:00, 135.29it/s]

CPU times: user 1min 40s, sys: 24.8 s, total: 2min 5s
Wall time: 18.2 s





In [16]:
features_dict["sift"] = sift_descriptors

In [17]:
print(f"SIFT shape: {sift_descriptors[0].shape}")

SIFT shape: (1447, 128)


In [18]:
shapes = [a.shape for a in sift_descriptors]
unique_shapes = sorted(set(shapes))
print("Num arrays:", len(sift_descriptors))
print("Unique shapes:", unique_shapes[:20], "..." if len(unique_shapes) > 20 else "")
print("How many unique shapes:", len(unique_shapes))

Num arrays: 2456
Unique shapes: [(3, 128), (96, 128), (132, 128), (133, 128), (137, 128), (151, 128), (155, 128), (168, 128), (169, 128), (174, 128), (182, 128), (189, 128), (191, 128), (195, 128), (204, 128), (207, 128), (211, 128), (216, 128), (218, 128), (221, 128)] ...
How many unique shapes: 1027


### Fisher

In [19]:
from py.features.fisher import FisherVectorEncoder

In [20]:
%%time
fv_encoder = FisherVectorEncoder(n_components=32)
fv_encoder.fit(features_dict["sift"])
print(f"Trained GMM with {fv_encoder.n_components} components")

Trained GMM with 32 components
CPU times: user 1h 50min 20s, sys: 1min 43s, total: 1h 52min 3s
Wall time: 10min 22s


In [21]:
%%time
fisher_vectors = []
for descriptor in tqdm(features_dict["sift"]):
    try:
        fisher_vec = fv_encoder.encode(descriptor)
        fisher_vectors.append(fisher_vec)
    except Exception as e:
        fisher_vectors.append(None)
        print(f"Error {e} extracting ...")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2456/2456 [00:25<00:00, 98.11it/s]

CPU times: user 7min 13s, sys: 84.2 ms, total: 7min 13s
Wall time: 25 s





In [22]:
features_dict["fisher"] = np.array(fisher_vectors).tolist()

In [23]:
print(f"Fisher Vector shape: {len(fisher_vectors[0])}")
print(f"Fisher Vector norm: {np.linalg.norm(fisher_vectors):.4f}")

Fisher Vector shape: 8192
Fisher Vector norm: 49.5580


### Saving features

In [24]:
%%time
import pickle

with open(f"{FEATURES_PATH}features.pkl", "wb") as f:
    pickle.dump(features_dict, f)

CPU times: user 2.07 s, sys: 557 ms, total: 2.62 s
Wall time: 904 ms
