In [1]:
import os, sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import shutil
from pathlib import Path

In [2]:
import warnings
# Silence every warning raised through the `warnings` module for the rest of
# the session so that cell outputs stay compact.
# NOTE(review): this also hides warnings that may point at real problems.
warnings.filterwarnings('ignore')

### Loading packages

In [3]:
import sys
from pathlib import Path

# Make project-local packages importable: the repository root is taken to be
# two levels above the current working directory.
here_path = Path().resolve()
repo_path = here_path.parents[1]

# Only append when missing so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if str(repo_path) not in sys.path:
    sys.path.append(str(repo_path))

In [4]:
from py.utils import verifyDir, verifyFile, verifyType

In [5]:
from py.config import Config

# Project settings object. This notebook reads RANDOM_STATE, DATA_PATH,
# MODEL_PATH, SCORING_METHOD and PLACE_LEVEL from it.
cfg = Config()

# Seed NumPy's global random generator with the configured value.
np.random.seed(cfg.RANDOM_STATE)
# Last expression: display the configured data and model roots.
cfg.DATA_PATH, cfg.MODEL_PATH

('/media/felipe/DATA19/datasets/', '/media/felipe/DATA19/models/')

In [6]:
# Directory layout under the dataset root. The pieces are concatenated as plain
# strings, so cfg.DATA_PATH must already end with a path separator.
# Scores are written per scoring method and place level.
QSCORE_PATH=f"{cfg.DATA_PATH}pp2/{cfg.SCORING_METHOD}/{cfg.PLACE_LEVEL}/"
# Root folder that is scanned for image files.
IMAGES_PATH = f"{cfg.DATA_PATH}pp2/images/"
# Folder holding the zipped pairwise-comparison table.
COMPARISONS_PATH=f"{cfg.DATA_PATH}pp2/comparisons/"

In [7]:
# Check the score output directory before anything is written into it.
# NOTE(review): verifyDir is presumed to create the directory when it is
# missing - confirm against py.utils.
verifyDir(QSCORE_PATH)

### Loading data

In [8]:
%%time
comparisons_df = pd.read_csv(zipfile.ZipFile(f"{COMPARISONS_PATH}comparisons.zip").open("comparisons.csv"), sep=";", low_memory=False)
comparisons_df

CPU times: user 6.53 s, sys: 406 ms, total: 6.94 s
Wall time: 6.94 s


Unnamed: 0,vote_id,left_id,right_id,winner,category,voter_uniqueid,timestamp,left_lat,left_long,left_continent,left_country,left_city,left_image_path,right_lat,right_long,right_continent,right_country,right_city,right_image_path,user_id
0,5815426b02a8dc05afdf2b74,50e5f7d4d7c3df413b00056a,50f42c0dfdc9f065f00017bc,left,safety,8f5fe1b8602349f7bdf8a27e5ae75a48,2016-10-29 20:44:27.420000,22.310524,114.170637,Asia,China,Hong Kong,Hong Kong/50e5f7d4d7c3df413b00056a.JPG,52.302266,20.989567,Europe,Poland,Warsaw,Warsaw/50f42c0dfdc9f065f00017bc.JPG,user_0
1,51f2e453ad93a763cc1d2e9c,50e5f7d4d7c3df413b00056a,50f562ddfdc9f065f0005af3,right,safety,3a666b3c1ca84ccfa0cdd3c7423228f2,2013-07-26 17:04:19.551000,22.310524,114.170637,Asia,China,Hong Kong,Hong Kong/50e5f7d4d7c3df413b00056a.JPG,1.285515,103.810072,Asia,Singapore,Singapore,Singapore/50f562ddfdc9f065f0005af3.JPG,user_1
2,546fd72802a8dc4bc0758a3b,50e5f7d4d7c3df413b00056a,50f5eb24fdc9f065f000804a,right,safety,b544eec9acb74a2f8a7f464560fe6242,2014-11-21 19:22:00.323000,22.310524,114.170637,Asia,China,Hong Kong,Hong Kong/50e5f7d4d7c3df413b00056a.JPG,-22.807149,-43.413760,South America,Brasil,Rio De Janeiro,Rio De Janeiro/50f5eb24fdc9f065f000804a.JPG,user_2
3,537e474402a8dc0677005191,50e5f7d4d7c3df413b00056a,50f5eb4afdc9f065f00081bb,equal,beautiful,2035397a7a80404ab026246c9e42d991,2014-05-22 14:51:48.615000,22.310524,114.170637,Asia,China,Hong Kong,Hong Kong/50e5f7d4d7c3df413b00056a.JPG,-22.805310,-43.427501,South America,Brasil,Rio De Janeiro,Rio De Janeiro/50f5eb4afdc9f065f00081bb.JPG,user_3
4,523376dead93a760f8daa56b,50e5f7d4d7c3df413b00056a,513cbda1fdc9f035870010d9,left,safety,42e405acdb5f439a94f37df2121f0b1a,2013-09-13 16:34:38.724000,22.310524,114.170637,Asia,China,Hong Kong,Hong Kong/50e5f7d4d7c3df413b00056a.JPG,29.838246,-95.312139,North America,USA,Houston,Houston/513cbda1fdc9f035870010d9.JPG,user_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1546016,51f80ed7ad93a780c8788fc6,5185cd21fdc9f03fd5000cdf,513e6465fdc9f0358700b812,left,wealthy,eb9cb07d831a4888898ee8eb8649b52f,2013-07-30 15:07:03.074000,45.479466,-73.662663,North America,Canada,Montreal,Montreal/5185cd21fdc9f03fd5000cdf.JPG,35.058854,135.756810,Asia,Japan,Kyoto,Kyoto/513e6465fdc9f0358700b812.JPG,user_3350
1546017,592c770302a8dc05afdfa4f3,5185cd21fdc9f03fd5000cdf,513e6c96fdc9f0358700c1ac,left,safety,479c83d12ecd4c2db4cc46c56bc353b3,2017-05-29 15:31:15.835000,45.479466,-73.662663,North America,Canada,Montreal,Montreal/5185cd21fdc9f03fd5000cdf.JPG,35.594405,139.765548,Asia,Japan,Tokyo,Tokyo/513e6c96fdc9f0358700c1ac.JPG,user_94518
1546018,543e49d002a8dc4bbe75863a,5185cd21fdc9f03fd5000cdf,51408cabfdc9f0358700eb8a,right,lively,cf6fd73f3bab40c597ad5d57f24ea025,2014-10-15 06:17:52.460000,45.479466,-73.662663,North America,Canada,Montreal,Montreal/5185cd21fdc9f03fd5000cdf.JPG,38.760147,-9.105329,Europe,Portugal,Lisbon,Lisbon/51408cabfdc9f0358700eb8a.JPG,user_4255
1546019,586ba57302a8dc05afdf5c3b,5185cd21fdc9f03fd5000cdf,51409d77fdc9f049260012ff,left,safety,8459e27a9f104cf19d84a3d1bb9d8971,2017-01-03 08:21:55.946000,45.479466,-73.662663,North America,Canada,Montreal,Montreal/5185cd21fdc9f03fd5000cdf.JPG,41.913943,12.425267,Europe,Italy,Rome,Rome/51409d77fdc9f049260012ff.JPG,user_28490


#### Calculate Scores

In [9]:
from py.datasets.comparisons import Comparisons

# Wrap the raw comparison table in the project's scoring helper, configured
# with the scoring method and place level taken from cfg.
comp = Comparisons(comparisons_df, method_name=cfg.SCORING_METHOD, place_level=cfg.PLACE_LEVEL)

In [10]:
# Metrics reported by the Comparisons helper; the cells below iterate over
# them to build one score column per metric.
# NOTE(review): presumably the distinct values of the `category` column -
# confirm in py.datasets.comparisons.
metrics = comp.get_metrics()
metrics

<StringArray>
['safety', 'beautiful', 'wealthy', 'lively', 'boring', 'depressing']
Length: 6, dtype: str

In [11]:
# Identification/location columns plus every score column a scoring method may
# emit; the scoring loop keeps whichever of these a result table contains.
columns_to_keep= ["image_id", "lat", "long", "city", "country", "continent", "AHPweight", "AHPScore", "EloRating", "EloScore", "Qscore"]

In [12]:
%%time
summary_df = pd.DataFrame()

for metric in tqdm(metrics):
    verifyDir(f"{QSCORE_PATH}{metric}/")

    columns_change = {"AHPweight": f"{metric}_AHPweight", 
                      "AHPScore": f"{metric}",
                      "EloRating": f"{metric}_EloRating", 
                      "EloScore": f"{metric}",
                      "Qscore": metric,
                     }

    name_file = ""
    if "ahp" in cfg.SCORING_METHOD.lower():
        name_file = "weights"
    elif "elo" in cfg.SCORING_METHOD.lower():
        name_file = "ratings"
    else:
        name_file = "matches"

    if verifyFile(f"{QSCORE_PATH}{metric}/{name_file}.csv"):
        results_df = pd.read_csv(f"{QSCORE_PATH}{metric}/{name_file}.csv", sep=";", low_memory=False)

    else:
        if "ahp" in cfg.SCORING_METHOD.lower():
            comp.calculate(metric=metric, method="dict")
        else:
            comp.calculate(metric=metric)
        
        if "ahp" in cfg.SCORING_METHOD.lower() or "elo" in cfg.SCORING_METHOD.lower():
            comp.normalize(min_range=0, max_range=10)
        results_df = comp.get_scores()
        results_df.to_csv(f"{QSCORE_PATH}{metric}/{name_file}.csv", sep=";", index=False)

    results_df = results_df[[c for c in columns_to_keep if c in results_df.columns]].copy()
    results_df.rename(columns=columns_change, inplace=True)
    if len(summary_df)==0:
        summary_df = results_df.copy()
    else:
        summary_df = pd.merge(summary_df, results_df, how="outer", on=["image_id", "lat", "long", "city", "country", "continent"])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.01it/s]

CPU times: user 1.82 s, sys: 173 ms, total: 2 s
Wall time: 1.99 s





#### Adding image paths

In [13]:
base_path = Path(IMAGES_PATH)
# Sorted "<folder>/<file>.JPG" paths relative to the image root. as_posix()
# keeps forward slashes on every platform, so stored paths stay consistent.
img_path_list = np.sort([p.relative_to(base_path).as_posix() for p in base_path.glob('*/*.JPG')])
# Map image id -> relative path. The id is the file name without its
# extension; Path.stem strips only the final suffix, whatever its letter case,
# instead of removing the text ".JPG" wherever it occurs in the name.
image_path_dict = {Path(path).stem: path for path in img_path_list}

In [14]:
%%time
summary_df["image_path"] = summary_df["image_id"].map(image_path_dict)
summary_df = summary_df[~summary_df["image_path"].isna()].copy()
summary_df.fillna(0., inplace=True)
summary_df

CPU times: user 39.3 ms, sys: 964 μs, total: 40.2 ms
Wall time: 39.3 ms


Unnamed: 0,image_id,lat,long,city,country,continent,safety,beautiful,wealthy,lively,boring,depressing,image_path
0,50e5f7d4d7c3df413b00056a,22.310524,114.170637,Hong Kong,China,Asia,4.135536,1.574074,2.962963,4.199346,5.000000,0.000000,Hong Kong/50e5f7d4d7c3df413b00056a.JPG
1,50e5f7d4d7c3df413b00056b,22.274799,114.192828,Hong Kong,China,Asia,3.657277,2.229437,5.277778,5.555556,7.777778,3.333333,Hong Kong/50e5f7d4d7c3df413b00056b.JPG
2,50e5f7d4d7c3df413b00056c,22.291117,114.147373,Hong Kong,China,Asia,4.514946,3.333333,3.333333,4.746693,3.611111,0.000000,Hong Kong/50e5f7d4d7c3df413b00056c.JPG
3,50e5f7d4d7c3df413b00056d,22.314273,114.177176,Hong Kong,China,Asia,4.852448,3.333333,5.083333,3.333333,8.333333,3.466270,Hong Kong/50e5f7d4d7c3df413b00056d.JPG
4,50e5f7d4d7c3df413b00056e,22.332412,114.204790,Hong Kong,China,Asia,4.919651,2.129630,3.680556,4.343857,2.500000,4.444444,Hong Kong/50e5f7d4d7c3df413b00056e.JPG
...,...,...,...,...,...,...,...,...,...,...,...,...,...
108433,5185d533fdc9f03fd500148d,45.635489,-73.514755,Montreal,Canada,North America,5.952381,5.833333,2.333333,3.452862,3.333333,9.166667,Montreal/5185d533fdc9f03fd500148d.JPG
108434,5185d533fdc9f03fd500148e,45.581265,-73.511511,Montreal,Canada,North America,4.797811,3.251984,2.777778,2.148291,7.222222,7.777778,Montreal/5185d533fdc9f03fd500148e.JPG
108435,5185d534fdc9f03fd500148f,45.591299,-73.520681,Montreal,Canada,North America,4.993214,6.888889,7.777778,6.176471,5.833333,2.611111,Montreal/5185d534fdc9f03fd500148f.JPG
108436,5185d534fdc9f03fd5001490,45.470656,-73.631901,Montreal,Canada,North America,4.551622,3.333333,5.000000,5.008547,3.611111,2.888889,Montreal/5185d534fdc9f03fd5001490.JPG


In [15]:
# Persist the per-image score table as a ';'-separated CSV, without the index.
summary_df.to_csv(f"{QSCORE_PATH}scores.csv", sep=";", index=False)

#### Aggregate information

In [16]:
# Per-city statistics: number of images plus the mean and the sample standard
# deviation of every metric, written out as
# city, country, continent, num_images, <metric>_mean, <metric>_std, ...
#
# A single groupby replaces two near-identical pivot tables that were merged
# back together, and the metric columns come from `metrics` instead of a
# second hard-coded list. Columns whose values are all NaN (e.g. the std when
# every city has a single image) are kept rather than silently dropped.
group_keys = ["city", "country", "continent"]
per_city = summary_df.groupby(group_keys)

aggregate_df = per_city.size().rename("num_images").to_frame()
for metric in metrics:
    # Assignments align on the (city, country, continent) group index.
    aggregate_df[f"{metric}_mean"] = per_city[metric].mean()
    aggregate_df[f"{metric}_std"] = per_city[metric].std()

aggregate_df = aggregate_df.reset_index()
aggregate_df.to_csv(f"{QSCORE_PATH}aggregate_statistics.csv", sep=";", index=False)