In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import shutil
from pathlib import Path

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Loading packages

In [3]:
import sys
from pathlib import Path

# Put the directory two levels above the working directory on sys.path.
# NOTE(review): this is assumed to be the repository root that holds the
# project-local `py` package imported by the following cells -- confirm.
here_path = Path().resolve()      # absolute path of the current working directory
repo_path = here_path.parents[1]  # parents[0] is the parent, parents[1] the grandparent

# Only append when missing so that re-running the cell stays idempotent
# instead of stacking duplicate entries on sys.path.
if str(repo_path) not in sys.path:
    sys.path.append(str(repo_path))

In [4]:
from py.utils import verifyDir, verifyFile, verifyType

In [5]:
from py.config import Config

# Project settings object; this notebook reads the random seed and the
# data/model root folders from it.
cfg = Config()

# Seed NumPy's global random generator with the configured value.
np.random.seed(seed=cfg.RANDOM_STATE)

# Cell output: the configured data and model roots.
(cfg.DATA_PATH, cfg.MODEL_PATH)

('/media/felipe/DATA19/datasets/', '/media/felipe/DATA19/models/')

In [6]:
# Folders used by this notebook, all located under "<DATA_PATH>pp2/".
# The f-strings concatenate directly, so cfg.DATA_PATH is expected to end
# with a path separator (the displayed value above does).
RAW_PATH = f"{cfg.DATA_PATH}pp2/raw_data/"    # input .tsv files read below
IMAGES_PATH = f"{cfg.DATA_PATH}pp2/images/"   # defined here; not used in the cells shown
QRANKS_PATH = f"{cfg.DATA_PATH}pp2/Qranks/"   # output folder for ranks.csv / ranks.zip

In [7]:
# Check the output folder before the cells below write into it.
# NOTE(review): verifyDir is a project helper from py.utils; presumably it
# creates the directory when it is missing -- confirm against its source.
verifyDir(QRANKS_PATH)

### Loading data

In [8]:
from py.datasets import PlacePulse

# Project dataset helper. In this notebook it is only used for
# `pp.map_category`, which is applied to each study question to obtain
# the value stored in the `category` column.
pp = PlacePulse()

### Studies/categories evaluated

In [9]:
# Build the study lookup table in one non-mutating chain so the cell is
# idempotent and never sorts a derived frame in place (the pattern that
# triggers pandas' chained-assignment warning, hidden here by the global
# warning filter).
studies_df = (
    pd.read_csv(f"{RAW_PATH}studies.tsv", sep="\t")
    .rename(columns={"_id": "study_id"})
    # translate each study question into its category name
    .assign(category=lambda frame: frame["study_question"].apply(pp.map_category))
    # keep only the columns needed downstream (all other columns are dropped)
    .loc[:, ["study_id", "category", "num_votes"]]
    # most-voted study first; the original row index is kept
    .sort_values(by="num_votes", ascending=False)
)
studies_df

Unnamed: 0,study_id,category,num_votes
0,50a68a51fdc9f05596000002,safety,511037
1,50f62c41a84ea7c5fdd2e454,lively,367476
5,5217c351ad93a7d3e7b07a64,beautiful,220656
3,50f62cb7a84ea7c5fdd2e458,wealthy,174784
4,50f62ccfa84ea7c5fdd2e459,depressing,149361
2,50f62c68a84ea7c5fdd2e456,boring,144068


### Rank Scores (Qranks)

In [10]:
# Long-format score table: one row per (image, category) pair.
score_columns = {
    "trueskill.score": "trueskill_score",
    "trueskill.stds.-1": "trueskill_std",
}
qranks_df = (
    pd.read_csv(f"{RAW_PATH}qscores.tsv", sep="\t")
    .rename(columns=score_columns)
    # attach the category name of the study each score belongs to
    .merge(studies_df, how="left", on="study_id")
    .loc[:, ["location_id", "trueskill_score", "trueskill_std", "category"]]
    # the location identifier is used as the image identifier from here on
    .rename(columns={"location_id": "image_id"})
    .sort_values(by=["image_id"])
)
qranks_df

Unnamed: 0,image_id,trueskill_score,trueskill_std,category
520452,50e5f7d4d7c3df413b00056a,22.570491,4.284321,lively
168533,50e5f7d4d7c3df413b00056a,14.955635,4.544466,beautiful
502999,50e5f7d4d7c3df413b00056a,22.358495,2.185529,safety
6799,50e5f7d4d7c3df413b00056a,22.862591,5.960932,boring
77268,50e5f7d4d7c3df413b00056a,25.000000,8.333333,depressing
...,...,...,...,...
304035,5185d53bfdc9f03fd5001493,25.000000,6.474365,beautiful
616297,5185d53bfdc9f03fd5001493,36.794676,5.254008,wealthy
658298,5185d53bfdc9f03fd5001493,28.359978,2.181525,safety
650198,5185d53bfdc9f03fd5001493,15.820457,6.117387,boring


In [11]:
# Wide format: one row per image, one column per (metric, category) pair.
# No `values`/`aggfunc` are given, so pandas aggregates every remaining
# column with its default aggregation (mean).
pp2_qranks_df = qranks_df.pivot_table(
    index=["image_id"],
    columns=["category"],
).reset_index()

# Flatten the two-level column labels:
#   ("trueskill_score", "safety") -> "safety_score"
#   ("image_id", "")              -> "image_id"
pp2_qranks_df.columns = [
    f"{category}_{metric.split('_')[1]}" if metric and category else f"{metric}{category}"
    for metric, category in pp2_qranks_df.columns
]
pp2_qranks_df

Unnamed: 0,image_id,beautiful_score,boring_score,depressing_score,lively_score,safety_score,wealthy_score,beautiful_std,boring_std,depressing_std,lively_std,safety_std,wealthy_std
0,50e5f7d4d7c3df413b00056a,14.955635,22.862591,25.000000,22.570491,22.358495,16.266914,4.544466,5.960932,8.333333,4.284321,2.185529,5.952272
1,50e5f7d4d7c3df413b00056b,18.821060,31.596729,27.417381,19.954589,22.081561,25.150958,4.798056,6.812248,5.827463,4.279072,2.855162,4.524901
2,50e5f7d4d7c3df413b00056c,25.000000,20.212300,25.000000,21.347405,21.393349,28.458999,6.474365,5.369342,8.333333,2.183312,2.970363,6.104728
3,50e5f7d4d7c3df413b00056d,25.116229,33.286969,21.246639,22.884136,26.230766,27.252281,5.937892,6.389028,3.849299,6.216208,2.998062,4.136642
4,50e5f7d4d7c3df413b00056e,14.690601,18.047294,26.400204,23.688506,18.583226,27.192922,5.604868,6.365379,5.148536,2.799108,2.939962,4.226316
...,...,...,...,...,...,...,...,...,...,...,...,...,...
111385,5185d534fdc9f03fd500148f,28.947155,27.510418,19.020828,27.953605,28.836854,29.769550,5.350052,5.152094,4.756786,4.767030,2.220648,7.128595
111386,5185d534fdc9f03fd5001490,24.016375,25.489629,17.737607,28.114697,20.593552,28.298277,5.915084,4.998984,6.108786,3.844311,3.688897,5.625583
111387,5185d536fdc9f03fd5001491,21.326749,23.638018,28.730627,21.966041,26.230962,20.230450,4.279181,5.850875,3.993591,3.472348,3.747897,7.128595
111388,5185d539fdc9f03fd5001492,24.630742,21.024050,20.230450,31.295953,26.025764,25.687452,4.877001,4.901234,7.128595,2.825604,3.944938,5.590527


### Saving final file

In [12]:
%%time
# Write the wide per-image table as a semicolon-separated CSV without the
# row index.
pp2_qranks_df.to_csv(
    path_or_buf=f"{QRANKS_PATH}ranks.csv",
    sep=";",
    index=False,
)

CPU times: user 780 ms, sys: 11.2 ms, total: 792 ms
Wall time: 1.02 s


In [13]:
%%time
# Compress ranks.csv into an LZMA zip archive next to it.
# QRANKS_PATH already ends with "/", so os.path.join is used to avoid the
# "//" that a manual "/" would add and to address the same path the CSV
# was written to.
zip_path = os.path.join(QRANKS_PATH, "ranks.zip")
csv_path = os.path.join(QRANKS_PATH, "ranks.csv")

# The context manager closes the archive on exit; no explicit close() call
# is needed inside the block.
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_LZMA) as zipObj:
    # arcname stores the entry as "ranks.csv" rather than under csv_path
    zipObj.write(csv_path, arcname="ranks.csv")

CPU times: user 11.9 s, sys: 47.7 ms, total: 11.9 s
Wall time: 12 s
