In [1]:
import glob
import pandas as pd
from os import listdir
from datetime import datetime

In [2]:
# Mount Google Drive so that files stored on Drive are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Clone the project repository into ./ID2223_Project, relative to the notebook's working directory.
!git clone https://github.com/AdrianHRedhe/ID2223_Project.git

Cloning into 'ID2223_Project'...
remote: Enumerating objects: 28882, done.[K
remote: Counting objects: 100% (152/152), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 28882 (delta 39), reused 144 (delta 31), pack-reused 28730[K
Receiving objects: 100% (28882/28882), 356.04 MiB | 40.46 MiB/s, done.
Resolving deltas: 100% (48/48), done.
Updating files: 100% (28774/28774), done.


In [None]:
#rm -r ID2223_Project

Turn an image into a row of a dataframe depending on its filename

In [4]:
def image_path_to_df_row(path):
    """Build one metadata row (a dict) for an image from its file path.

    The path is expected to look like
    ``.../<dataset_name>/<dataset_shard>/<file name>`` where the file name is
    ``<new_order_idx>_<rotation_nr>_<picture_nr>_<taken_at>_<scraped_at>.jpg``
    and ``taken_at`` is formatted as ``'%B %Y'`` (for example ``May 2019``).

    Args:
        path: '/'-separated path to the image file.

    Returns:
        dict with the keys ``new_order_idx``, ``rotation_nr``, ``picture_nr``
        (ints), ``taken_at`` (datetime), ``scraped_at`` (str),
        ``path_to_image`` (the unmodified ``path``), ``dataset_shard``,
        ``dataset_type``, ``dataset_version`` and ``buffer_meters`` (str).

    Raises:
        ValueError: if the file name does not hold exactly five '_'-separated
            fields, if a numeric field is not an integer, if ``taken_at`` does
            not match ``'%B %Y'``, or if the path / dataset directory name has
            too few components.
    """
    path_parts = path.split('/')
    fname = path_parts[-1]

    # Remove only the '.jpg' extension. str.strip('.jpg') would instead strip any
    # of the characters '.', 'j', 'p', 'g' from BOTH ends of the name and could
    # eat into the first or last field (e.g. 'backup.jpg' -> 'backu').
    extension = '.jpg'
    stem = fname[:-len(extension)] if fname.endswith(extension) else fname

    new_order_idx, rotation_nr, picture_nr, taken_at, scraped_at = stem.split('_')
    taken_at_dt = datetime.strptime(taken_at, '%B %Y')

    df_row = {'new_order_idx': int(new_order_idx),
              'rotation_nr': int(rotation_nr),
              'picture_nr': int(picture_nr),
              'taken_at': taken_at_dt,
              'scraped_at': scraped_at,
              'path_to_image': path
              }

    # The two parent directories carry extra information on the dataset.
    dataset_name, dataset_partition = path_parts[-3:-1]
    df_row['dataset_shard'] = dataset_partition

    # Type and version are the 2nd and 3rd '_'-separated parts of the dataset
    # directory name; the buffer is its last part.
    dataset_name_parts = dataset_name.split('_')
    dataset_type, dataset_version = dataset_name_parts[1:3]

    df_row['dataset_type'] = dataset_type
    df_row['dataset_version'] = dataset_version
    df_row['buffer_meters'] = dataset_name_parts[-1]
    return df_row

In [5]:
# Alternative image root on Google Drive, kept for reference:
#images_path_ = '/content/drive/MyDrive/ID2223 - Project/Data/cropped_images/'
images_path_ = 'ID2223_Project/data/images/'

# Without recursive=True each '**' matches a single directory level, so this
# collects the .jpg files located exactly two folders below the image root.
images_paths_ = glob.glob(f'{images_path_}/**/**/*.jpg')

# One metadata dict per image path, gathered into a single frame.
images_df_rows = list(map(image_path_to_df_row, images_paths_))
images_df = pd.DataFrame(images_df_rows)

In [6]:
new_order_path_ = '/content/drive/MyDrive/ID2223 - Project/Data/buffer_files/'

# Parse every entry of the buffer-files folder as CSV and stack the frames.
new_order_csvs = [
    pd.read_csv(f'{new_order_path_}{fname}')
    for fname in listdir(new_order_path_)
]
new_order_df = pd.concat(new_order_csvs)

# Keep only the columns that are joined onto the image metadata.
new_order_df = new_order_df[[
    'new_order_idx',
    'sorted_idx',
    'Location_idx',
    'Google Location',
    'PanoID',
]]

Here we want to ensure that all the locations are accounted for.

In [7]:
# Default (inner) join on new_order_idx: rows without a match on the other side are dropped.
combined_df = images_df.merge(new_order_df, on='new_order_idx')

We need to format the image dataframe a bit, to make sure
that we know which of the images should be marked as query images.

In [8]:
def add_query_img_column(combined_df):
    """Flag the most recently taken image of every (location, rotation) pair.

    Args:
        combined_df: DataFrame with at least the columns ``new_order_idx``,
            ``rotation_nr``, ``picture_nr`` and ``taken_at``.

    Returns:
        A sorted copy of ``combined_df`` (by ``new_order_idx``, ``rotation_nr``
        and then newest ``taken_at`` first) with an added boolean column
        ``is_query_image``. The input frame is not modified.
    """
    group_cols = ['new_order_idx', 'rotation_nr']

    # A single multi-key sort with explicit directions; chaining two separate
    # sorts only works if the second one happens to preserve the first order.
    sorted_df = combined_df.sort_values(group_cols + ['taken_at'],
                                        ascending=[True, True, False])

    # After sorting, the first row of each (location, rotation) group is the
    # most recent picture of that group.
    query_rows = sorted_df.drop_duplicates(subset=group_cols, keep='first')

    # The key must include rotation_nr: keying on (location, picture) alone
    # flags a picture for every rotation as soon as it is the newest one for
    # any single rotation of that location.
    key_cols = group_cols + ['picture_nr']
    query_keys = set(zip(*(query_rows[col] for col in key_cols)))

    # Set membership keeps this linear in the number of rows.
    sorted_df['is_query_image'] = [
        key in query_keys
        for key in zip(*(sorted_df[col] for col in key_cols))
    ]
    return sorted_df

# Rebind combined_df to the sorted copy that carries the new `is_query_image` column.
combined_df = add_query_img_column(combined_df)

Next up: some of the runs were interrupted and therefore might have fewer images than ideal. Other locations simply did not have enough Time Machine photos.

To ensure that this still works well with our models, let's remove all the images that have fewer than 4 non-query images per rotation and location.
In essence, this means having 5 images (1 query image plus 4 non-query images) at every location.

In [9]:
def remove_img_if_less_than_n_images(combined_df, min_photos_per_location, remove_early_stop=True, last_rotation_nr=3):
    """Keep only the locations that have enough distinct pictures.

    The number of distinct ``picture_nr`` values is counted per
    (``new_order_idx``, ``rotation_nr``) pair. A location (``new_order_idx``)
    is kept, with ALL of its rows, when

    * ``remove_early_stop`` is True and its rotation ``last_rotation_nr`` has at
      least ``min_photos_per_location`` distinct pictures, or
    * ``remove_early_stop`` is False and at least one of its rotations does.

    NOTE(review): the other rotations of a kept location are not checked
    individually; this presumably relies on rotations being scraped in order
    so that a complete last rotation implies complete earlier ones. Confirm
    against the scraper.

    Args:
        combined_df: DataFrame with the columns ``new_order_idx``,
            ``rotation_nr`` and ``picture_nr``.
        min_photos_per_location: minimum number of distinct pictures required.
        remove_early_stop: if True, additionally require that the qualifying
            rotation is ``last_rotation_nr``, which drops locations whose run
            terminated before the last rotation was complete.
        last_rotation_nr: rotation number treated as the last one of a run
            (defaults to 3, the previously hard-coded value).

    Returns:
        A new DataFrame containing only the rows of the kept locations.
    """
    # Distinct pictures per (location, rotation); dropna=False counts a missing
    # picture_nr as its own value, like len(Series.unique()) does.
    pictures_per_rotation = (
        combined_df
        .groupby(['new_order_idx', 'rotation_nr'])['picture_nr']
        .nunique(dropna=False)
        .reset_index()
    )

    has_enough_images = pictures_per_rotation[
        pictures_per_rotation['picture_nr'] >= min_photos_per_location
    ]

    # If the program terminated early there might not be enough images for
    # every rotation. These locations should be removed.
    if remove_early_stop:
        has_enough_images = has_enough_images[has_enough_images['rotation_nr'] == last_rotation_nr]

    # Vectorised membership test instead of a Python-level `in` on a list for
    # every row; the positional mask avoids any index alignment.
    keep_mask = combined_df['new_order_idx'].isin(has_enough_images['new_order_idx']).to_numpy()
    return combined_df[keep_mask]

# 5 = one query image plus four non-query images.
min_photos_per_location = 5
combined_df = remove_img_if_less_than_n_images(combined_df, min_photos_per_location)

In [10]:
# Make sure all columns are written in snake_case.
# Rename by name rather than by position: assigning a full list to `.columns`
# silently attaches labels to the wrong columns if their order ever changes.
# Renaming is also a no-op when the cell is run a second time.
combined_df = combined_df.rename(columns={
    'Location_idx': 'location_idx',
    'Google Location': 'google_location',
    'PanoID': 'panoid',
})

In [11]:
#combined_df.to_csv('/content/drive/MyDrive/ID2223 - Project/current_images_metadata.csv',index=False)

Let's take a look at how many images we have in each of the datasets.

In [12]:
# Count of non-null new_order_idx values per dataset type and version.
combined_df.groupby(['dataset_type','dataset_version'])['new_order_idx'].count().reset_index()

Unnamed: 0,dataset_type,dataset_version,new_order_idx
0,Test,v1,1652
1,Test,v2,7804
2,Training,v1,5998
3,Training,v2,3964
4,Validation,v1,2540
5,Validation,v2,3502


Upload this to Hopsworks feature store

Had to downgrade urllib3 due to dependency conflicts with hopsworks

In [14]:
pip install -q hopsworks urllib3==1.26.0 #fastapi kaleido python-multipart uvicorn MarkupSafe>=2.1.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m832.2 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.7/136.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.3/170.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.4/83.4 kB[0m [31m9.9 MB/

In [15]:
import os
from getpass import getpass

import hopsworks

# Never hard-code the API key in the notebook: it ends up in version control and
# in every shared copy. Read it from the HOPSWORKS_API_KEY environment variable
# and fall back to an interactive prompt when the variable is not set.
# Any key that was ever committed to this notebook should be revoked and
# regenerated in the Hopsworks project settings.
hopsworks_api_key = os.environ.get('HOPSWORKS_API_KEY') or getpass('Hopsworks API key: ')
project = hopsworks.login(api_key_value=hopsworks_api_key)
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.





Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/193686
Connected. Call `.close()` to terminate connection gracefully.


In [16]:
# Fetch version 1 of the image metadata feature group, or define it if it does
# not exist yet. Rows are keyed by `path_to_image`.
# The description is built from adjacent string literals: a backslash
# continuation inside one literal would embed the next line's indentation
# whitespace in the stored text.
image_meta_data_fg = fs.get_or_create_feature_group(
    name='image_metadata_fg',
    description=(
        'Contains metadata such as when a photo was taken and at what location '
        'as well as the path to the photo in git. '
        'it also contains splits for training / test / validation'
    ),
    version=1,
    primary_key=['path_to_image'],
    online_enabled=True,
)

### Here we can choose to overwrite or to insert only the missing rows. Either way, given how Hopsworks works, it should never add rows that already exist, based on the `path_to_image` primary key.

In [17]:
# Insert the rows of combined_df into the feature group.
# To overwrite instead, add `overwrite=True` to the call (left disabled here).
image_meta_data_fg.insert(combined_df)#, overwrite=True)

Uploading Dataframe: 0.00% |          | Rows 0/25460 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: image_metadata_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/193686/jobs/named/image_metadata_fg_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x7f3cece0fa60>, None)

In [None]:
# Read the feature group back into a DataFrame, passing the "use_hive" read option.
df_fg = image_meta_data_fg.read(read_options={"use_hive": True})



Finished: Reading data from Hopsworks, using Hive (1.72s) 


In [None]:
# Rows of combined_df whose primary key (path_to_image) is not yet present in
# the feature group. A whole-row symmetric difference via concat +
# drop_duplicates(keep=False) would also return rows that exist only in the
# feature group, and it depends on column order and dtypes round-tripping
# exactly through the store.
already_stored = combined_df['path_to_image'].isin(df_fg['path_to_image'])
df_missing_from_the_fg = combined_df[~already_stored.to_numpy()].reset_index(drop=True)

In [None]:
# Show the rows flagged as missing from the feature group.
df_missing_from_the_fg

Unnamed: 0,new_order_idx,rotation_nr,picture_nr,taken_at,scraped_at,path_to_image,sorted_idx,location_idx,google_location,panoid,is_query_image


In [None]:
# Insert only when at least one row is flagged as missing; otherwise no insert is issued.
if len(df_missing_from_the_fg) > 0:
  image_meta_data_fg.insert(df_missing_from_the_fg)

Uploading Dataframe: 0.00% |          | Rows 0/62 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: geolocalisation_image_metadata_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/193686/jobs/named/geolocalisation_image_metadata_fg_1_offline_fg_materialization/executions
