In [None]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import shutil
from pathlib import Path

In [None]:
import warnings
warnings.filterwarnings('ignore')

### Loading packages

In [None]:
# Put the directory two levels above the working directory on sys.path
# (presumably so the project's `py` package imported in the next cells
# resolves — confirm against the repository layout).
# `sys` and `Path` are already imported in the first cell.
here_path = Path().resolve()        # absolute current working directory
repo_path = here_path.parents[1]    # its grandparent

# Guarded so that re-running this cell does not stack duplicate entries.
if str(repo_path) not in sys.path:
    sys.path.append(str(repo_path))

In [None]:
from py.utils import verifyDir, verifyFile, verifyType

In [None]:
from py.config import Config

# Project configuration object; it supplies the seed and the base paths
# read throughout this notebook.
cfg = Config()

# Seed NumPy's global random generator from the configured value.
np.random.seed(cfg.RANDOM_STATE)

# Show the configured data and model locations as the cell output.
(cfg.DATA_PATH, cfg.MODEL_PATH)

In [None]:
# Input locations for the "pp1" data, both built on cfg.DATA_PATH.
# DATA_PATH is interpolated without a separator, so it is assumed to end
# with "/" — TODO confirm in py.config.
QSCORE_PATH = f"{cfg.DATA_PATH}pp1/Qscores/"   # score CSV files
IMAGES_PATH = f"{cfg.DATA_PATH}pp1/images/"    # image files

### Loading data

In [None]:
from py.datasets import PlacePulse

# Dataset helper; this notebook uses its `localize_point(lat, long)` method
# to derive continent / country / city for each row.
pp = PlacePulse()

In [None]:
# Score columns analysed in this notebook (names as they are after the
# column rename applied when the raw scores are loaded).
metrics = [
    "safety",
    "uniquely",
    "wealthy",
]

In [None]:
# Read the raw score table and switch to the column names used in this notebook.
data_df = pd.read_csv(f"{QSCORE_PATH}scores_raw.csv").rename(
    columns={
        'ID': 'image_id',
        'QS_Upperclass': 'wealthy',
        'Error_QS_Upperclass': 'wealthy_err',
        'QS_Unique': 'uniquely',
        'Error_QS_Unique': 'uniquely_err',
        'QS_Safer': 'safety',
        'Error_QS_Safer': 'safety_err',
        'City': 'city',
        'Lat': 'lat',
        "Lon": "long",
        'File_Location': 'image_path',
    }
)

# Use the shorter spelling for New York.
data_df["city"] = data_df["city"].replace('New York City', 'New York')

# Derive continent / country / city from each row's coordinates.
# NOTE(review): this overwrites the "city" column edited just above, so the
# replace only matters if localize_point depends on it — confirm.
data_df[['continent', 'country', 'city']] = data_df.apply(
    lambda row: pp.localize_point(row['lat'], row['long']),
    axis=1,
    result_type='expand',
)

# Missing scores are stored as 0.0.
data_df[metrics] = data_df[metrics].fillna(0.0)

# Rebuild the image path as "<city>/<image_id>.jpg", replacing the value
# that came from the file.
data_df["image_path"] = data_df.apply(
    lambda row: f'{row["city"]}/{row["image_id"]}.jpg',
    axis=1,
)

### Mapping same locations

In [None]:
def verify_image_path(img_list):
    """Count the missing entries in ``img_list``.

    Args:
        img_list: iterable of scalar values (lists of image paths in this
            notebook).

    Returns:
        int: number of elements for which ``pd.isna`` is true.
    """
    return sum(1 for img in img_list if pd.isna(img))

In [None]:
# Location-only view of the scores; the image id is relabelled as
# "location_id" for the grouping done next.
location_df = data_df[
    ["image_id", "lat", "long", "city", "country", "continent", "image_path"]
].rename(columns={"image_id": "location_id"})

In [None]:
# One row per distinct (lat, long, city, country, continent); the ids and
# image paths that share that location are collected into lists.
# NOTE(review): rows with a missing value in any grouping key are likely
# dropped by the pivot — confirm no locations are lost here.
location_keys = ["lat", "long", "city", "country", "continent"]
repetitions_df = pd.pivot_table(
    location_df,
    index=location_keys,
    values=["location_id", "image_path"],
    aggfunc={"location_id": list, "image_path": list},
).reset_index()

# Number of ids per location, and the first id as the location's representative.
repetitions_df["num_locations"] = repetitions_df["location_id"].apply(len)
repetitions_df["image_id"] = repetitions_df["location_id"].apply(lambda ids: ids[0])

# Most-repeated locations first.
repetitions_df = repetitions_df.sort_values(by="num_locations", ascending=False)

# Move "image_id" (currently the last column) to the front.
repetitions_df = repetitions_df[["image_id", *repetitions_df.columns[:-1]]].copy()
repetitions_df

In [None]:
# Save the per-location grouping (semicolon-separated, index not written).
repetitions_df.to_csv(
    f"{cfg.DATA_PATH}pp1/repeated_locations.csv",
    sep=";",
    index=False,
)

In [None]:
# Per location, how many of the grouped image paths are missing.
repetitions_df["num_image_nan"] = repetitions_df["image_path"].apply(verify_image_path)

In [None]:
# Locations where some, but not all, of the grouped image paths are missing.
repetitions_df.loc[
    ~(
        (repetitions_df["num_locations"] == repetitions_df["num_image_nan"])
        | (repetitions_df["num_image_nan"] == 0)
    )
]

In [None]:
# Locations with exactly one missing image path.
repetitions_df.loc[repetitions_df["num_image_nan"] == 1]

#### Identify images with repeated locations

In [None]:
# Frequency table: how many locations carry 1, 2, 3, ... ids.
estado_df = repetitions_df["num_locations"].value_counts().reset_index()
# The two resulting columns are addressed by position rather than by label.
value_col, count_col = estado_df.columns[0], estado_df.columns[1]

fig, ax = plt.subplots(figsize=(16, 8))

# Bars ordered from the most to the least frequent value.
sns_fig = sns.barplot(
    data=estado_df,
    x=value_col,
    y=count_col,
    order=estado_df.sort_values(count_col, ascending=False)[value_col],
    color="steelblue",
    ax=ax,
)

# Title and axis labels.
sns_fig.set_title("Number of different Ids per image", fontsize=30)
sns_fig.set_xlabel('Number of different ids', fontsize=20)
sns_fig.set_ylabel("Number of images", fontsize=20)

# Tick label size on both axes (x labels kept horizontal).
sns_fig.tick_params(axis='x', rotation=0, labelsize=20)
sns_fig.tick_params(axis='y', labelsize=20)

# Write each bar's count above it.
_ = sns_fig.bar_label(sns_fig.containers[0], fmt='%0.0f', fontsize=15, rotation=0)

ax.grid(True)
plt.show()

#### Number of locations with more than one image ID per city

In [None]:
# Per city, count the locations that carry more than one id.
estado_df = (
    repetitions_df[repetitions_df["num_locations"] > 1]["city"]
    .value_counts()
    .reset_index()
)
# The two resulting columns are addressed by position rather than by label.
city_col, count_col = estado_df.columns[0], estado_df.columns[1]

fig, ax = plt.subplots(figsize=(28, 12))

sns_fig = sns.barplot(
    data=estado_df,
    x=city_col,
    y=count_col,
    color="steelblue",
    ax=ax,
)

# Title and axis labels (the x axis is left unlabelled).
sns_fig.set_title("Number of locations with more than 1 image-IDs per city", fontsize=45)
sns_fig.set_xlabel('', fontsize=20)
sns_fig.set_ylabel("Number of images", fontsize=40)

# City names are drawn vertically; both axes use the same tick label size.
sns_fig.tick_params(axis='x', rotation=90, labelsize=30)
sns_fig.tick_params(axis='y', labelsize=30)

# Write each bar's count above it.
_ = sns_fig.bar_label(sns_fig.containers[0], fmt='%0.0f', fontsize=15, rotation=0)

ax.grid(True)
plt.show()

#### Mapping and deleting repetitions

In [None]:
# Map every extra id of a repeated location to that location's first id.
# If an id shows up under several locations, its first mapping is kept.
# NOTE(review): locations_dict is not read by any cell visible here.
locations_dict = {}
multi_id_groups = repetitions_df.loc[repetitions_df["num_locations"] > 1, "location_id"]
for id_group in multi_id_groups:
    representative, *duplicates = id_group
    for duplicate_id in duplicates:
        locations_dict.setdefault(duplicate_id, representative)

In [None]:
# Keep only the rows whose image id is the representative of a location,
# i.e. drop the other ids that point at the same place.
data_df = data_df.loc[data_df["image_id"].isin(repetitions_df["image_id"])].copy()
data_df

In [None]:
# Save the de-duplicated scores (semicolon-separated, index not written).
data_df.to_csv(
    f"{QSCORE_PATH}scores.csv",
    sep=";",
    index=False,
)

### Aggregate statistics

In [None]:
# Distinct city labels, in order of first appearance in data_df.
cities = (
    data_df["city"]
    .unique()
    .tolist()
)

In [None]:
# Per-city summary: country, continent, number of images, and the mean and
# standard deviation of every metric.
# Column order: city, country, continent, num_images, then <metric>_mean /
# <metric>_std for each metric in `metrics`.
agg_stats = {"city": [], "country": [], "continent": [], "num_images": []}
for metric in metrics:
    agg_stats[f"{metric}_mean"] = []
    agg_stats[f"{metric}_std"] = []

for city in cities:
    # Slice once per city and reuse it for all metrics; the slice is only
    # read, so no copy is needed.
    city_df = data_df[data_df["city"] == city]
    if city_df.empty:
        # A missing (NaN) city label compares unequal to every row, so the
        # slice is empty; skip it rather than fail on the lookups below.
        continue

    agg_stats["city"].append(city)
    # Country and continent are taken from the city's first row.
    agg_stats["country"].append(city_df["country"].iloc[0])
    agg_stats["continent"].append(city_df["continent"].iloc[0])
    agg_stats["num_images"].append(len(city_df))
    for metric in metrics:
        agg_stats[f"{metric}_mean"].append(city_df[metric].mean())
        agg_stats[f"{metric}_std"].append(city_df[metric].std())

agg_stats_df = pd.DataFrame(agg_stats)
agg_stats_df

In [None]:
# Save the per-city summary (index not written).
# NOTE(review): this file uses the default comma separator, while the other
# CSV exports in this notebook pass sep=";" — confirm that is intended.
agg_stats_df.to_csv(
    f"{QSCORE_PATH}aggregate_statistics.csv",
    index=False,
)

### Renaming images

In [None]:
# Year labels as strings.
# NOTE(review): `years` is not referenced by any cell visible here.
years = [
    "2011",
    "2013",
    "2019",
]

In [None]:
import re

# File names of the form "id_<digits>_<digits>_<digits>.jpg" (any letter
# case). Only the first digit run — the image id — is captured; the whole
# name must match.
pattern = re.compile(
    r"^id_(\d+)_\d+_\d+\.jpg$",
    flags=re.I,
)

In [None]:
# Rename every "id_<id>_<n>_<n>.jpg" file under IMAGES_PATH to "<id>.jpg".
# Files whose path contains "New York City" are additionally moved to the
# same path with "New York" in its place.
for dirpath, dirnames, filenames in os.walk(IMAGES_PATH):
    for fname in filenames:
        m = pattern.match(fname)
        if m is None:
            # Not in the expected naming scheme; leave the file alone.
            continue

        image_id = m.group(1)
        new_name = f"{image_id}.jpg"

        old_path = os.path.join(dirpath, fname)
        new_path = os.path.join(dirpath, new_name)

        if "New York City" in new_path:
            new_path = new_path.replace("New York City", "New York")
            # Take the parent directory with os.path.dirname so this works
            # whatever separator os.path.join produced.
            # verifyDir presumably creates the folder when it is missing —
            # confirm in py.utils.
            verifyDir(os.path.dirname(new_path))

        # No old_path == new_path check is needed: a matched name always
        # starts with "id_", while the new name starts with a digit.

        # Several source files can map to the same id (e.g. multiple
        # resolutions); keep whichever is already in place.
        if verifyFile(new_path):
            print(f"SKIP (exists): {new_path}  <- {old_path}")
            continue

        os.rename(old_path, new_path)