In [1]:
import concurrent
import concurrent.futures
import glob
import json
import os
import re
import sys
from itertools import chain, repeat
from urllib.parse import parse_qs
from urllib.parse import urlparse

import numpy as np


# Directory that holds the metadata JSON files; replace the placeholder before running.
metadata_dir = '<PATH TO METADATA>'
# Turn the directory into a glob pattern matching every .json file directly inside it.
metadata_dir += '/*.json'

# Keys copied from each JSON record and kept in memory; every other key is dropped
# by get_metadata.
fields = ['published_datetime','id','author','comments','liked_by','makes', 'remixed_from', 'customizer', 'description','category', 'title', 'tags']

def get_metadata(filepath,fields):
    """Load one metadata JSON file and keep only the requested keys of each record.

    Args:
        filepath: Path of a JSON file whose top-level value is a list of
            objects (one object per thing).
        fields: Collection of key names to keep from every object.

    Returns:
        A list with one dict per record, each restricted to the keys found in
        ``fields``. An empty list is returned when the file cannot be read,
        is not valid JSON, or does not have the expected list-of-objects
        shape, so one bad file never aborts a bulk load.
    """
    try:
        # The context manager closes the handle even when parsing fails.
        with open(filepath) as handle:
            records = json.load(handle)
        return [
            {field: value for field, value in record.items() if field in fields}
            for record in records
        ]
    except (OSError, ValueError, TypeError, AttributeError):
        # OSError: unreadable file. ValueError: invalid JSON or bad encoding.
        # TypeError/AttributeError: top-level value is not a list of objects.
        # Deliberately best-effort, but KeyboardInterrupt/SystemExit are no
        # longer swallowed the way the former bare `except:` did.
        return []

files = glob.glob(metadata_dir)
print("Reading From {} Files...".format(len(files)))
# Parse the files in parallel worker processes. This needs the
# `concurrent.futures` submodule to be imported explicitly; a bare
# `import concurrent` does not load it. executor.map yields results in the
# same order as `files`, so no index bookkeeping is required.
with concurrent.futures.ProcessPoolExecutor() as executor:
    per_file_records = list(executor.map(get_metadata, files, repeat(fields)))
# Flatten the list of per-file record lists into one list of record dicts.
data = list(chain.from_iterable(per_file_records))
print("{} Things Read".format(len(data)))



Reading From 2937 Files...
1017687 Things Read


In [2]:
#gather which things are customized
# Ids of customizer templates: things whose 'customizer' field is present and truthy.
# Kept as a set for the lookups below; `customizers` itself stays a list as before.
customizer_ids = {int(thing['id']) for thing in data if thing.get('customizer')}
customizers = list(customizer_ids)

# Ids of things remixed from at least one customizer template. Each id is
# recorded once (first-seen order) even when a thing lists several customizer
# parents, so the reported count is a count of distinct things.
customized = list(dict.fromkeys(
    int(thing['id'])
    for thing in data
    if any(
        int(parent['thing_id']) in customizer_ids
        for parent in (thing.get('remixed_from') or ())
    )
))
print("Found {} Customizers and {} Customized Things".format(len(customizers),len(customized)))

Found 9698 Customizers and 439690 Customized Things


In [3]:
# Character and word counts of every thing's description text.
num_chars = []
num_words = []
for thing in data:
    if 'description' not in thing:
        continue
    # Each description entry is a dict of text fragments: join the values of
    # every entry, then join the entries, into one space-separated string.
    sections = [' '.join(section.values()) for section in thing['description']]
    description = ' '.join(sections)
    num_chars.append(len(description))
    num_words.append(len(description.split()))

In [4]:
# Median description length: first in characters, then in words.
for lengths in (num_chars, num_words):
    print(np.median(lengths))

547.0
57.0


0.5013491417768188

In [6]:
#now get info on categories

In [7]:
# Build, in one pass over the things:
#   user_ints      : user -> list of top-level categories the user interacted with
#                    (as author, liker, maker or commenter; repeats are kept)
#   user_creations : author -> list of ids of the things they published
#   categories     : sorted list of every top-level category seen
# Things without a category are skipped entirely.
category_set = set()
user_ints = {} #user -> list of interaction categories
user_creations = {}#user -> list of published things
for thing in data:
    raw_category = thing.get('category')
    if raw_category is None:
        continue
    # Category strings look like '/fashion/earrings'; element 1 of the split
    # is the top-level category ('fashion').
    category = raw_category.split('/')[1]

    users = []
    if 'author' in thing:
        author = thing['author']
        users.append(author)
        user_creations.setdefault(author, []).append(int(thing['id']))

    users.extend(entry['user'] for entry in thing.get('liked_by', ()))
    users.extend(entry['user'] for entry in thing.get('makes', ()))
    users.extend(
        comment['author']
        for comment in thing.get('comments', ())
        if 'author' in comment
    )

    #add to user_ints
    for user in users:
        user_ints.setdefault(user, []).append(category)

    category_set.add(category)

# Materialise the unique categories once, in a deterministic order, instead of
# rebuilding a list from a set for every record.
categories = sorted(category_set)

In [8]:
#bin users based on Flath et al.
# Three bins over every user that appears in user_ints:
#   nodesign_users   : published no designs
#   customizer_users : at least half of their published designs are customized
#   pi_users         : fewer than half of their published designs are customized
# A set makes each membership test O(1); scanning the `customized` list for
# every published thing is quadratic on the full dataset.
customized_ids = set(customized)

pi_users = []
customizer_users = []
nodesign_users = []
for user in user_ints:
    things = user_creations.get(user)
    if not things:
        #No designs published (also guards the division below against an empty list)
        nodesign_users.append(user)
        continue

    customized_count = sum(1 for thing_id in things if thing_id in customized_ids)
    customized_ratio = customized_count / len(things)
    if customized_ratio < 0.5:
        pi_users.append(user)
    else:
        customizer_users.append(user)

In [None]:
# Turn the three user bins into sets so later membership tests are constant time.
pi_users, customizer_users, nodesign_users = (
    set(group) for group in (pi_users, customizer_users, nodesign_users)
)

In [None]:
%%time
category_participation = []
for category in categories:
    num_users = 0
    num_pi = 0
    num_cust = 0
    num_nd = 0
    for user in user_ints.keys():
        for c in user_ints[user]:
            if c == category:
                num_users += 1
                #add based on user type
                if user in pi_users:
                    num_pi +=1
                elif user in customizer_users:
                    num_cust +=1
                elif user in nodesign_users:
                    num_nd +=1
                
                break
    category_participation.append((category,num_users,(num_pi,num_cust,num_nd)))
    print("\t{}:\t{}%\t({})".format(category,float(num_users)/len(user_ints.keys())*100.0,num_users))
    print("\t\t{}, {}, {}".format(num_pi,num_cust,num_nd))

	art:	28.023642713952096%	(135598)
	fashion:	21.828590323847312%	(105622)
	models:	24.894496455659578%	(120457)
	3d-printing:	45.698431396862794%	(221121)
	toys-and-games:	27.234381135428936%	(131779)
	hobby:	36.18430570194474%	(175085)
	household:	37.30526794386922%	(180509)
	tools:	19.099344865356397%	(92416)
	learning:	10.247173827680989%	(49583)
	gadgets:	29.132204931076526%	(140962)
CPU times: user 16h 22min 26s, sys: 7min 46s, total: 16h 30min 13s
Wall time: 22h 49min 34s


In [19]:
# Report category participation from least to most popular category:
#   line 1: share of all users, and the absolute user count
#   line 2: absolute counts per bin (pi, customizer, no-design)
#   line 3: the same counts as a percentage of each bin's size
total_users = len(user_ints)
group_sizes = (len(pi_users), len(customizer_users), len(nodesign_users))
for cp in sorted(category_participation,key=lambda x:x[1]):
    category, num_users, by_group = cp[0], cp[1], cp[2]
    # The original call passed an unused fourth argument (cp[2]); dropped here.
    print("\t{}:\t{}%\t({})".format(category,float(num_users)/total_users*100.0,num_users))
    print("\t\t{},\t{},\t{}".format(by_group[0],by_group[1],by_group[2]))
    # An empty bin reports 0.0% instead of raising ZeroDivisionError.
    shares = [
        count/size*100.0 if size else 0.0
        for count, size in zip(by_group, group_sizes)
    ]
    print("\t\t{}%,\t{}%,\t{}%".format(shares[0],shares[1],shares[2]))

	learning:	10.247173827680989%	(49583)
		15263,	8579,	25741
		0.13134320651940073%,	0.09531374988889876%,	0.09270857719111847%
	tools:	19.099344865356397%	(92416)
		30092,	19432,	42892
		0.25895169826258313%,	0.21589192071815838%,	0.15447947993012912%
	fashion:	21.828590323847312%	(105622)
		25418,	30349,	49855
		0.2187303690827575%,	0.33718113945427075%,	0.17955736435504493%
	models:	24.894496455659578%	(120457)
		33991,	15267,	71199
		0.29250389391344755%,	0.16961825615500845%,	0.2564297419459401%
	toys-and-games:	27.234381135428936%	(131779)
		36337,	20860,	74582
		0.31269200650563217%,	0.23175717713980978%,	0.2686139273558913%
	art:	28.023642713952096%	(135598)
		34841,	34450,	66307
		0.2998184274613405%,	0.38274375611056793%,	0.23881075435342422%
	gadgets:	29.132204931076526%	(140962)
		40609,	25518,	74835
		0.34945399158398377%,	0.2835081326104346%,	0.2695251301075075%
	hobby:	36.18430570194474%	(175085)
		53883,	24791,	96411
		0.4636811896013149%,	0.27543107279352946%,	0.3472330

In [22]:
#print category distribution of designs
# Count things per top-level category (things without a category are skipped),
# then print category, count and share of all things, ascending by count.
category_count = {}
for thing in data:
    raw_category = thing.get('category')
    if raw_category is None:
        continue
    category = raw_category.split('/')[1]
    category_count[category] = category_count.get(category, 0) + 1

cc = sorted(category_count.items(), key=lambda pair: pair[1])
total_things = len(data)
for category, count in cc:
    print("{}\t{}\t{}".format(category, count, count/total_things))

learning	16469	0.01618277525408107
models	43745	0.04298472909647072
tools	55462	0.05449809224250678
gadgets	61460	0.06039184936036326
toys-and-games	66555	0.065398300263244
fashion	110465	0.1085451617245774
hobby	111073	0.10914259492358652
art	123367	0.1212229300364454
3d-printing	184697	0.1814870387457047
household	191418	0.18809123040777764


In [35]:
#tally designs per top-level category, split by origin (printed by a later cell)
# A set gives O(1) membership; testing against the `customized` list scans it
# once per thing.
customized_ids = set(customized)

category_count = {} #category -> [customized, non-customized remix, original]
for thing in data:
    raw_category = thing.get('category')
    if raw_category is None:
        continue
    category = raw_category.split('/')[1]
    counts = category_count.setdefault(category, [0,0,0])

    if int(thing['id']) in customized_ids:
        counts[0] += 1
    elif thing.get('remixed_from'):
        # non-empty remix list that is not customizer-derived; a missing,
        # null or empty 'remixed_from' falls through to "original"
        counts[1] += 1
    else:
        counts[2] += 1

TypeError: Can't convert 'list' object to str implicitly

In [43]:
# Print, per category (ascending by total number of designs), the share of all
# things that are customized, non-customized remixes, and originals.
cc = sorted(category_count.items(), key=lambda entry: sum(entry[1]))
print("category\tcutomized, non-customized remix, orginal")
print("--------------------------------------------------")
total_things = len(data)
for category, counts in cc:
    c1, c2, c3 = counts
    print("{}\t{}, {}, {}".format(category, c1/total_things, c2/total_things, c3/total_things))

category	cutomized, non-customized remix, orginal
--------------------------------------------------
learning	0.00564613677879348, 0.0010356818943348986, 0.00950095658095269
models	0.005746364058890405, 0.004751952221066005, 0.032486412816514314
tools	0.029971887230553204, 0.0030254881903768053, 0.02150071682157677
gadgets	0.023747969660612742, 0.006088315955691682, 0.030555563744058833
toys-and-games	0.023320529789611147, 0.007454158302110571, 0.03462361217152229
fashion	0.08601367611063127, 0.0028476338992244176, 0.019683851714721718
hobby	0.022794827879298842, 0.011303082381911137, 0.07504468466237654
art	0.08053163693748668, 0.004832527093300789, 0.035858766005657926
3d-printing	0.02431101114586312, 0.027639146417316917, 0.12953688118252468
household	0.11671663291365617, 0.007762701105546204, 0.06361189638857527


In [None]:
# For every category, count the users with at least one interaction in it.
# One pass over the users replaces a rescan of every user's interaction list
# per category; the counts are the same.
total_users = len(user_ints)
users_per_category = {category: 0 for category in categories}
for interactions in user_ints.values():
    # set() collapses repeats so each user counts once per category.
    for category in set(interactions):
        if category in users_per_category:
            users_per_category[category] += 1

category_participation = []
for category in categories:
    num_users = users_per_category[category]
    category_participation.append((category,num_users))
    print("\t{}:\t{}%\t({})".format(category,float(num_users)/total_users*100.0,num_users))

In [9]:
# Rebuild user_ints and categories using the FULL category path
# (e.g. '/fashion/earrings') instead of only the top-level category.
#   user_ints  : user -> list of category paths the user interacted with
#                (as author, liker, maker or commenter; repeats are kept)
#   categories : sorted list of every category path seen
# Things without a category are skipped entirely.
category_set = set()
user_ints = {} #user -> list of interaction categories
for thing in data:
    category = thing.get('category')
    if category is None:
        continue

    users = []
    if 'author' in thing:
        users.append(thing['author'])
    users.extend(entry['user'] for entry in thing.get('liked_by', ()))
    users.extend(entry['user'] for entry in thing.get('makes', ()))
    users.extend(
        comment['author']
        for comment in thing.get('comments', ())
        if 'author' in comment
    )

    #add to user_ints
    for user in users:
        user_ints.setdefault(user, []).append(category)

    category_set.add(category)

# Materialise the unique categories once, in a deterministic order, instead of
# rebuilding a list from a set for every record.
categories = sorted(category_set)

In [13]:
# Count, per category, the users with at least one interaction in it, then
# print the categories from least to most popular. One pass over the users
# replaces a rescan of every user's interaction list per category.
total_users = len(user_ints)
users_per_category = {category: 0 for category in categories}
for interactions in user_ints.values():
    # set() collapses repeats so each user counts once per category.
    for category in set(interactions):
        if category in users_per_category:
            users_per_category[category] += 1

category_participation = [(category, users_per_category[category]) for category in categories]
for cp in sorted(category_participation,key=lambda x:x[1]):
    print("\t{}:\t{}%\t({})".format(cp[0],float(cp[1])/total_users*100.0,cp[1]))

	/fashion/earrings:	1.0525554384442102%	(5093)
	/fashion/glasses:	1.225742451484903%	(5931)
	/models/model-furniture:	1.3392026784053568%	(6480)
	/toys-and-games/chess:	1.4687829375658752%	(7107)
	/toys-and-games/playsets:	1.5696364726062788%	(7595)
	/models/food-and-drink:	1.8118502903672475%	(8767)
	/toys-and-games/dice:	2.048277429888193%	(9911)
	/learning/math:	2.1733110132886932%	(10516)
	/fashion/bracelets:	2.383078099489532%	(11531)
	/learning/biology:	2.42255151176969%	(11722)
	/learning/physics-and-astronomy:	2.4804182941699215%	(12002)
	/learning:	2.6116518899704464%	(12637)
	/gadgets/tablet:	2.711265422530845%	(13119)
	/art/coins-and-badges:	2.7325521317709303%	(13222)
	/fashion:	2.891479116291566%	(13991)
	/toys-and-games/puzzles:	2.8916857833715666%	(13992)
	/models/model-robots:	2.9505859011718023%	(14277)
	/fashion/rings:	3.101866203732407%	(15009)
	/household/pets:	3.106826213652427%	(15033)
	/art/interactive-art:	3.2240064480128963%	(15600)
	/gadgets/audio:	3.504660342