In [332]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import torch
from torch import nn

%matplotlib inline

In [333]:
import praw

# Reddit API credentials are read from the environment so that no secret is stored in
# the notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting the kernel;
# a missing variable raises KeyError here instead of failing later with an opaque auth error.
# NOTE(review): the client secret that used to be hardcoded in this cell should be treated
# as leaked and regenerated in the Reddit app settings.
user_agent = "Scraper 1.0 by /u/arnavg13"
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=user_agent,
)

In [None]:
# Scrape the hot posts of r/progresspics. For every post whose URL ends in something that
# looks like a file name, download the file and record three parallel lists:
#   file_names[k] - last path segment of the post URL
#   raw_labels[k] - post title
#   metadata[k]   - post flair text (may be None)
# NOTE: files are saved as images/_<i>.jpg where i is the position of the post in the hot
# listing, NOT the position k in the lists above (skipped posts leave gaps in i).
os.makedirs("images", exist_ok=True)  # open(..., "wb") below fails if the folder is missing

subreddit = reddit.subreddit("progresspics")
posts = subreddit.hot(limit=500)
file_names, raw_labels, metadata = [], [], []
for i, post in enumerate(posts):
    url = post.url
    # str.split always returns at least one element, so the last segment always exists.
    file_name = url.split("/")[-1]
    if "." not in file_name:
        # No extension in the last segment: not a direct file link, skip the post.
        continue
    try:
        # A timeout keeps one stalled download from hanging the whole cell.
        r = requests.get(url, timeout=30)
        # Do not save an HTTP error page as if it were an image.
        r.raise_for_status()
    except requests.RequestException:
        # Skip posts whose file cannot be fetched so the lists only describe saved files.
        continue
    with open("images/_" + str(i) + ".jpg", "wb") as f:
        f.write(r.content)
    file_names.append(file_name)
    raw_labels.append(post.title)
    metadata.append(post.link_flair_text)

Our first task is to get the exact heights from each person's annotations on the subreddit. We do this in an automated fashion for every example using the get_height function below. 

In [219]:
# Heights above this many feet are assumed to be centimetres and are rejected.
_MAX_FEET = 7

# Digits for the feet, then optionally a straight or curly apostrophe followed by the
# inches digits; anything after the inches digits (closing quote, etc.) is ignored.
_FEET_INCHES_RE = re.compile(r"([0-9]+)(?:['’]([0-9]*).*)?")


def _parse_feet_inches(token):
    """Convert a token such as 5'11", 5’4” or 6' to inches; return None if it is not a valid height."""
    if not token:
        return None
    match = _FEET_INCHES_RE.fullmatch(token)
    if match is None:
        # Covers non-numeric feet such as "5ft4", "170cm" or "~5'4".
        return None
    feet = int(match.group(1))
    # A missing or empty inches part (e.g. "5" or "5'") means zero inches.
    inches = int(match.group(2) or 0)
    if feet > _MAX_FEET or inches > 11:
        return None
    return feet * 12 + inches


def get_height(i, raw_labels, metadata):
    """
    Using both the annotations and the flair text, we gather enough information to access 
    the exact height of each individual in inches. If there is no valid information in either
    datum, we will return None to indicate that this datapoint is not useful and needs to be removed. 

    Parameters
    ----------
    i : int
        Index of the post in both lists.
    raw_labels : list of str
        Post titles; the height is read from the third "/"-separated field of the
        first whitespace-separated token (e.g. "F/25/5'4").
    metadata : list of (str or None)
        Flair texts; when present and not 'n/a', the height is read from the second
        whitespace-separated token.

    Returns
    -------
    int or None
        Height in inches, or None when neither the flair nor the title yields a
        parseable feet/inches value. Malformed input returns None instead of raising.
    """
    label = raw_labels[i]
    flair_text = metadata[i]
    if flair_text and flair_text != 'n/a':
        flair_tokens = flair_text.split()
        if len(flair_tokens) > 1:
            height = _parse_feet_inches(flair_tokens[1])
            if height is not None:
                return height
    # No usable flair: fall back to the title annotation.
    title_tokens = label.split()
    if not title_tokens:
        return None
    fields = title_tokens[0].split("/")
    if len(fields) < 3:
        return None
    return _parse_feet_inches(fields[2])

Next, we need to get the weights, which is a bit more difficult. The weights are the most important labels: height is ignorable, but weight is the feature we need in order to determine BMI, or even simply to predict the weight itself. Every weight annotation has the format "[w1 > w2 = w1 - w2]", with units and other text in between. Only two parts of such a label are really important: 1) the first weight mentioned and 2) the second weight mentioned. Once we have these, we store them as one tuple per person, holding both of their weights. For now, we only keep posts that list exactly two weights.

In [289]:
def get_weights(i, raw_labels):
    """
    Extract the first and second weight from the "[w1 > w2 = ...]" bracket of a post title.

    Parameters
    ----------
    i : int
        Index of the post in raw_labels.
    raw_labels : list of str
        Post titles.

    Returns
    -------
    tuple of (float, float) or None
        (first_weight, second_weight), or None when the title has no usable bracket.
        A bracket is rejected when it mentions kilograms/pounds spelled out, contains
        '?', '<', a dash, more than one '>', or when either weight is not a number of
        at most three characters that is >= 100. Malformed input returns None instead
        of raising.
    """
    label = raw_labels[i]
    if 'KG' in label or "pounds" in label:
        return None
    # [^\[\]] keeps every match inside a single pair of brackets, so an extra tag such
    # as "[NSFW]" elsewhere in the title no longer gets merged into the weight bracket.
    groups = re.findall(r"\[([^\[\]]+)\]", label)
    # Use the first bracket that actually contains a "before > after" arrow.
    bracket_content = next((group for group in groups if '>' in group), None)
    if bracket_content is None:
        return None
    if bracket_content.count(">") != 1 or '?' in bracket_content or 'kg' in bracket_content:
        return None
    if '<' in bracket_content:
        return None
    if '-' in bracket_content or '–' in bracket_content:
        return None
    # Strip units and filler words so only the numbers and separators remain.
    bracket_content = bracket_content.replace("lbs", "")
    bracket_content = bracket_content.replace(" ", "").replace("llb", "")
    bracket_content = bracket_content.replace("lb", "").replace("Ibs", "")
    bracket_content = bracket_content.replace("lost", "").replace("+", "")
    bracket_content = bracket_content.replace("#", "").replace("bs", "")
    head, _, _ = bracket_content.partition("=")
    parts = head.split(">")
    if len(parts) != 2:
        # The '>' came after the '=', so there is no "w1 > w2" pair before it.
        return None
    weights = []
    for text in parts:
        try:
            value = float(text)
        except ValueError:
            # Leftover non-numeric text (e.g. "about210").
            return None
        # Keep only plausible three-character weights of at least 100.
        if value < 100 or len(text) > 3:
            return None
        weights.append(value)
    return (weights[0], weights[1])

The following function is needed for filtering: if the height or the weight of a datapoint is None, we need to remove that datapoint from our dataset.

In [304]:
def remove_value(i):
    """Drop entry i from the module-level raw_labels, metadata and file_names lists, in that order."""
    for store in (raw_labels, metadata, file_names):
        store.pop(i)

Finally, let's build up three datastores -- one of the "heights", one of the "first_weight", and one of the "second_weight." With all three of these, the ith entry corresponds to one individual. 

In [305]:
def height_and_weight_labels():
    """
    Parse every post in the module-level raw_labels / metadata lists.

    Returns three lists of equal length (heights, first weights, second weights);
    a post contributes one entry to each list only when both get_height and
    get_weights return a truthy value for it.
    """
    records = []
    for idx in range(len(metadata)):
        parsed_height = get_height(idx, raw_labels, metadata)
        parsed_weights = get_weights(idx, raw_labels)
        if not (parsed_height and parsed_weights):
            continue
        records.append((parsed_height, parsed_weights[0], parsed_weights[1]))
    heights = [record[0] for record in records]
    weights_1 = [record[1] for record in records]
    weights_2 = [record[2] for record in records]
    return heights, weights_1, weights_2

In [306]:
# Parse all scraped posts once; entry k of each of the three lists comes from the same post.
heights, weights_1, weights_2 = height_and_weight_labels()

In [309]:
# Preview the first ten parsed heights (feet * 12 + inches).
heights[:10]

[62, 65, 68, 61, 67, 64, 63, 61, 61, 74]

In [310]:
# Preview the first ten "first weight" values (the number before the '>' in each title).
weights_1[:10]

[210.0, 218.0, 170.0, 158.0, 189.0, 205.0, 210.0, 145.0, 245.0, 280.0]

In [311]:
# Preview the first ten "second weight" values (the number after the '>' in each title).
weights_2[:10]

[124.0, 159.0, 142.0, 130.0, 158.0, 164.0, 120.0, 115.0, 155.0, 218.0]

The more difficult component now is to actually gather the corresponding images. We already have the weights stored as tuples, so each person has more than one weight label. As such, we need to match each tuple element to its corresponding image. Sadly, each image we have collected is just that -- a single image. Semantically, however, it contains multiple sub-images. The task, then, is to crop out these sub-images, treat them individually per person, and store each one with its corresponding label.

In [322]:
from PIL import Image
import glob
for filename in glob.glob('images/*.jpg'): #assuming gif
    im=Image.open(filename)