```
Copyright 2021 Arjun Subramonian, Mike McKenna
Adapted from notebook by Twitter, Inc.
SPDX-License-Identifier: Apache-2.0
```

## Collect query data from Tumblr

Run notebook: tumblr-data/tumblr_query.ipynb

In [1]:
import sys
import json
from pathlib import Path
import pandas as pd
import os

In [2]:
HOME_DIR = Path("../").expanduser()
sys.path.append(str(HOME_DIR / "src"))
data_dir = HOME_DIR / Path("./data/")
data_dir.exists()

True

In [3]:
TUMBLR_IMAGE_ALL_DATA_DIR = '../../tumblr-scraper/out_all'
TUMBLR_IMAGE_ALL_EXCLUDE = '../../tumblr-scraper/to_exclude_all.json'
TUMBLR_IMAGE_ALL_EXCLUDE = json.load(open(TUMBLR_IMAGE_ALL_EXCLUDE))

TUMBLR_IMAGE_ENBY_DATA_DIR = '../../tumblr-scraper/out_enby'
TUMBLR_IMAGE_ENBY_EXCLUDE = '../../tumblr-scraper/to_exclude_enby.json'
TUMBLR_IMAGE_ENBY_EXCLUDE = json.load(open(TUMBLR_IMAGE_ENBY_EXCLUDE))

In [4]:
# Remove images without faces
# for filename in TUMBLR_IMAGE_ALL_EXCLUDE:
#     full_filename = TUMBLR_IMAGE_ALL_DATA_DIR + "/" + filename
#     !rm {full_filename}

In [5]:
# Remove images without faces
# for filename in TUMBLR_IMAGE_ENBY_EXCLUDE:
#     full_filename = TUMBLR_IMAGE_ENBY_DATA_DIR + "/" + filename
#     !rm {full_filename}

In [6]:
# Populate json with tumblr images
tumblr_data = {
  "head": {
    "vars": [
      "human",
      "image",
      "sex_or_gender",
      "ethnic_group",
      "occupation",
      "loc_aid"
    ]
  },
  "results": {
    "bindings": []
  }
}

for directory, exclude in [(TUMBLR_IMAGE_ALL_DATA_DIR, TUMBLR_IMAGE_ALL_EXCLUDE), (TUMBLR_IMAGE_ENBY_DATA_DIR, TUMBLR_IMAGE_ENBY_EXCLUDE)]:
    for filename in os.listdir(directory):
        if filename in exclude:
            continue
            
        filename_without_ext = filename.split('.')[0]
        tumblr_data["results"]["bindings"] += [
            {
                "human": {
                  "type": "uri",
                  "value": filename_without_ext
                },
                "image": {
                  "type": "uri",
                  "value": filename
                },
                "sex_or_gender": {
                  "type": "uri",
                  "value": "all" if directory == TUMBLR_IMAGE_ALL_DATA_DIR else "enby"
                },
                "ethnic_group": {
                  "type": "uri",
                  "value": "na"
                },
                "url": filename
            }
        ]
        

# Create dataset with scraped tumblr images
with open(data_dir / "./dataset.json", 'w') as fp:
    json.dump(tumblr_data, fp)

In [7]:
with open(data_dir / "./dataset.json") as fp:
    tumblr_data = json.load(fp)

len(tumblr_data["results"]["bindings"])

882

In [8]:
tumblr_data["results"]["bindings"][0]

{'human': {'type': 'uri', 'value': 'f6131d7d4797a5930853a85e273ab780c612a7c3'},
 'image': {'type': 'uri',
  'value': 'f6131d7d4797a5930853a85e273ab780c612a7c3.jpg'},
 'sex_or_gender': {'type': 'uri', 'value': 'all'},
 'ethnic_group': {'type': 'uri', 'value': 'na'},
 'url': 'f6131d7d4797a5930853a85e273ab780c612a7c3.jpg'}

In [9]:
tumblr_data["results"].keys()

dict_keys(['bindings'])

In [10]:
tumblr_data["results"]["bindings"][0].keys()

dict_keys(['human', 'image', 'sex_or_gender', 'ethnic_group', 'url'])

In [11]:
tumblr_data["results"]["bindings"][0]["human"]["value"].rsplit("/", 1)

['f6131d7d4797a5930853a85e273ab780c612a7c3']

In [12]:
REQUIRED_COLS = [
    "human",
    "image",
    "sex_or_gender",
    "ethnic_group",
]


def parse_row(row):
    data = {}
    for c in REQUIRED_COLS:
        value = row[c]["value"]
        if row[c]["type"] == "uri":
            value = value.rsplit("/", 1)[-1]
        data[c] = value
    url = row["url"]
    extension = Path(url.rsplit("/", 1)[-1]).suffix
    local_path = f"{data['human']}{extension}"
    data["url"] = url
    data["local_path"] = local_path
    return data

In [13]:
parse_row(tumblr_data["results"]["bindings"][0])

{'human': 'f6131d7d4797a5930853a85e273ab780c612a7c3',
 'image': 'f6131d7d4797a5930853a85e273ab780c612a7c3.jpg',
 'sex_or_gender': 'all',
 'ethnic_group': 'na',
 'url': 'f6131d7d4797a5930853a85e273ab780c612a7c3.jpg',
 'local_path': 'f6131d7d4797a5930853a85e273ab780c612a7c3.jpg'}

In [14]:
df = pd.DataFrame([parse_row(row) for row in tumblr_data["results"]["bindings"]])
df.head()

Unnamed: 0,human,image,sex_or_gender,ethnic_group,url,local_path
0,f6131d7d4797a5930853a85e273ab780c612a7c3,f6131d7d4797a5930853a85e273ab780c612a7c3.jpg,all,na,f6131d7d4797a5930853a85e273ab780c612a7c3.jpg,f6131d7d4797a5930853a85e273ab780c612a7c3.jpg
1,69fee9fa618dcbd6b46e1f7cfb9a20d2fde1ce51,69fee9fa618dcbd6b46e1f7cfb9a20d2fde1ce51.jpg,all,na,69fee9fa618dcbd6b46e1f7cfb9a20d2fde1ce51.jpg,69fee9fa618dcbd6b46e1f7cfb9a20d2fde1ce51.jpg
2,da317181505d7bebdf46d843f8328e0ceb5697f3,da317181505d7bebdf46d843f8328e0ceb5697f3.jpg,all,na,da317181505d7bebdf46d843f8328e0ceb5697f3.jpg,da317181505d7bebdf46d843f8328e0ceb5697f3.jpg
3,3803a5d80814f5b54e1f6562cae369a414a9657a,3803a5d80814f5b54e1f6562cae369a414a9657a.jpg,all,na,3803a5d80814f5b54e1f6562cae369a414a9657a.jpg,3803a5d80814f5b54e1f6562cae369a414a9657a.jpg
4,e0f5f893f8695e3dbd5809b796d7685366b43af2,e0f5f893f8695e3dbd5809b796d7685366b43af2.jpg,all,na,e0f5f893f8695e3dbd5809b796d7685366b43af2.jpg,e0f5f893f8695e3dbd5809b796d7685366b43af2.jpg


# Gather images for all rows in `df`

Put the required images for each tumblr id in `df` into the `OUTPUT_DIR` using the file name specified via the column `local_path`

In [15]:
OUTPUT_DIR = Path(data_dir / "./images/")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

!rm -rf {OUTPUT_DIR / "*"}

for filename in os.listdir(TUMBLR_IMAGE_ALL_DATA_DIR):
    full_filename = TUMBLR_IMAGE_ALL_DATA_DIR + "/" + filename
    !cp {full_filename} {OUTPUT_DIR}
    
for filename in os.listdir(TUMBLR_IMAGE_ENBY_DATA_DIR):
    full_filename = TUMBLR_IMAGE_ENBY_DATA_DIR + "/" + filename
    !cp {full_filename} {OUTPUT_DIR}

df["file_exists"] = df["local_path"].apply(lambda x: (OUTPUT_DIR / x).exists())
df.file_exists.value_counts()

zsh:1: no matches found: ../data/images/*


True    882
Name: file_exists, dtype: int64

In [16]:
# df.file_exists.value_counts()[False]

## After putting all images in the folder run the next cell to update the dataframe with file status

In [17]:
df["file_exists"] = df["local_path"].apply(lambda x: (OUTPUT_DIR / x).exists())
# df.file_exists.value_counts()[False]

In [18]:
len(list(OUTPUT_DIR.glob("./*")))

882

In [19]:
df.file_exists.value_counts()

True    882
Name: file_exists, dtype: int64

In [20]:
df["ethnic_group"].value_counts()

na    882
Name: ethnic_group, dtype: int64

In [21]:
df.to_csv(data_dir / "./dataset.tsv", sep="\t", index=False)