In [41]:
import os
import json
from renetti.ws.spiders.types import ScrapedEquipment
from typing import List, Tuple, Optional, TypedDict, Dict
from collections import defaultdict
from nltk.util import ngrams
import numpy as np

In [20]:
class Equipment(TypedDict):
    name: str
    image_links: List[str]
    mpn: Optional[List[str]]
    description: Optional[str]
    brands: Optional[List[str]]
    categories: Optional[List[str]]
    skus: Optional[List[str]]


class NGramFrequencyProp(TypedDict):
    one_grams: Dict[str, float]
    two_grams: Dict[str, float]
    three_grams: Dict[str, float]
    four_grams: Dict[str, float]

class EquipmentForAnalysis(TypedDict):
    name: str
    image_links: List[str]
    mpn: Optional[List[str]]
    description: Optional[str]
    brands: Optional[List[str]]
    categories: Optional[List[str]]
    skus: Optional[List[str]]
    one_grams: List[str]
    two_grams: List[str]
    three_grams: List[str]
    four_grams: List[str]
    n_gram_freq: Optional[NGramFrequencyProp]
    grouping_category: Optional[str]


In [21]:
scraped_data_file_paths = [
    f"{root}/scraped_data.json" for root, dirs, file_paths in os.walk("../files")
][1:] # remove root file dir

In [22]:
equipment_data = defaultdict(list)
equipment_data_missing_fields = defaultdict(list)

for file_path in scraped_data_file_paths:
    with open(file=file_path) as f:
        data: List[ScrapedEquipment] = json.load(f)
        for equipment_obj in data:
            equipment_obj['name'] = equipment_obj['name'].replace("/", " ").replace("  ", " ").replace("  ", " ").replace(" - ", " ").lower()


            brands = equipment_obj["brands"]
            name = equipment_obj['name']

            if len(brands) != 1:
                equipment_data_missing_fields[name.title()].append(equipment_obj)
            else:
                brand = brands[0]
                unique_name = f"({brand.title()}) {name.title()}"
                equipment_data[unique_name].append(equipment_obj)

In [23]:
def merge_duplicates(objs: List[ScrapedEquipment]) -> Equipment:
    name = None
    image_links = set()
    mpn = set()
    description = None
    brands = set()
    categories = set()
    skus = set()

    for obj in objs:
        if name is None:
            name = obj["name"]

        image_links.update(obj["image_links"])
        if obj["mpn"]:
            mpn.add(obj["mpn"])

        brands.update([b.lower() for b in obj["brands"]] or [])
        categories.update([c.lower() for c in obj["categories"]] or [])
        skus.update(obj["skus"] or [])

        if not description or (obj["description"] and len(description) < len(obj["description"])):
            description = obj["description"]

    return {
        'name': name,
        'image_links': list(image_links),
        'mpn': list(map(str.title, mpn)),
        'description': description,
        'brands': list(map(str.title, brands)),
        'categories': list(map(str.title, categories)),
        'skus': list(skus)
    }

In [24]:
dupped_equipment = {k:v for k, v in equipment_data.items() if len(v) > 1}
print(len(dupped_equipment))

249


In [25]:
de_deupped_equipment = [merge_duplicates(objs=objs) for key, objs in equipment_data.items()]

In [26]:
with open("../files/cleaned_scraped_equipment.json", "w") as f:
    json.dump(de_deupped_equipment, f, indent=3)

In [27]:
def generate_n_grams(name: str) -> Tuple:
    tokens = name.split(" ")
    return [
        list(ngrams(tokens, n=n))
        for n in range(1, 5)
    ]

In [28]:
n_grams = defaultdict(lambda: defaultdict(float))
equipments_for_analysis: List[EquipmentForAnalysis] = []
for obj in de_deupped_equipment:
    one_grams, two_grams, three_grams, four_grams = generate_n_grams(name=obj["name"])
    equipment_for_analysis: EquipmentForAnalysis = {
        'name': obj['name'],
        'image_links': obj['image_links'],
        'mpn': obj['mpn'],
        'description': obj['description'],
        'brands': obj['brands'],
        'categories': obj['categories'],
        'skus': obj['skus'],
        'one_grams': one_grams,
        'two_grams': two_grams,
        'three_grams': three_grams,
        'four_grams': four_grams,
    }
    for gram in one_grams:
        n_grams['one_grams'][gram] += 1
    for gram in two_grams:
        n_grams['two_grams'][gram] += 1
    for gram in three_grams:
        n_grams['three_grams'][gram] += 1
    for gram in four_grams:
        n_grams['one_grams'][gram] += 1
    equipments_for_analysis.append(equipment_for_analysis)

In [29]:
for equip in equipments_for_analysis:
    equip['n_gram_freq'] = {
        'one_grams': {v:n_grams['one_grams'][v]/len(n_grams['one_grams']) for v in equip['one_grams']},
        'two_grams': {v:n_grams['two_grams'][v]/len(n_grams['two_grams']) for v in equip['two_grams']},
        'three_grams': {v:n_grams['three_grams'][v]/len(n_grams['three_grams']) for v in equip['three_grams']},
        'four_grams': {v:n_grams['four_grams'][v]/len(n_grams['four_grams']) for v in equip['four_grams']},
    }

In [90]:
maxes = []
for equip in equipments_for_analysis:
    category = None


    gram_max = None
    gram_max_score = 0

    one_grams = equip['n_gram_freq']['one_grams']
    two_grams = equip['n_gram_freq']['two_grams']
    three_grams = equip['n_gram_freq']['three_grams']
    four_grams = equip['n_gram_freq']['four_grams']

    # for one_gram, score in one_grams.items():
    #     if score > gram_max_score:
    #         gram_max_score = score
    #         gram_max = one_gram

    for two_gram, score in two_grams.items():
        if score > gram_max_score:
            gram_max_score = score
            gram_max = two_gram

    for three_gram, score in three_grams.items():
        if score > gram_max_score:
            gram_max_score = score
            gram_max = three_gram

    for four_gram, score in four_grams.items():
        if score > gram_max_score:
            gram_max_score = score
            gram_max = four_gram

    if gram_max and gram_max_score >= 0.0011117287381878821  and gram_max_score <= 0.0070409486751899205:
        maxes.append(gram_max_score)
        equip['grouping_category'] = ' '.join(gram_max)
    else:
        equip['grouping_category'] = None

In [93]:
min_thres, max_thres = np.percentile(maxes, 20), np.percentile(maxes, 80)
min_thres, max_thres

(np.float64(0.0018528812303131369), np.float64(0.005743931813970725))

In [91]:
len([e['grouping_category'] for e in equipments_for_analysis])

3589

In [92]:
[(e['grouping_category'], e['name']) for e in equipments_for_analysis]

[(None, 'inner outer thigh cl-3800'),
 ('pec fly', 'pec fly rear delt cl-3309'),
 (None, 'abdominals cl-3601'),
 (None, 'leg press cl-3403'),
 (None, 'chest press cl-3301'),
 (None, 'shoulder press cl-3501'),
 (None, 'leg curl cl-3402'),
 ('lateral raise', 'lateral raise cl-3502'),
 (None, 'mid row cl-3203'),
 (None, 'lat pulldown cl-3201'),
 (None, 'leg extension cl-3401'),
 ('biceps curl', 'biceps curl cl-3102'),
 ('triceps extension', 'triceps extension cl-3103'),
 (None, 'standing prone leg curl cl-3408'),
 (None, 'back hyper cf-3663'),
 ('flat bench', 'flat bench cf-3163'),
 ('incline bench', 'flat incline bench cf-3160'),
 (None, 'kneeling leg curl cf-3411'),
 ('incline bench', 'olympic incline bench with storage cf-3172-a'),
 ('barbell rack', 'barbell rack cf-3465'),
 (None, 'glute thrust cf-3416'),
 ('olympic plate', '4-sided olympic plate tree cf-3444'),
 (None, '2-tier horizontal beauty bell rack cf-3462-2'),
 ('decline bench', 'super flat decline bench cf-3162'),
 (None, 'ha

In [57]:
[e['name'] for e in equipments_for_analysis]

['inner outer thigh cl-3800',
 'pec fly rear delt cl-3309',
 'abdominals cl-3601',
 'leg press cl-3403',
 'chest press cl-3301',
 'shoulder press cl-3501',
 'leg curl cl-3402',
 'lateral raise cl-3502',
 'mid row cl-3203',
 'lat pulldown cl-3201',
 'leg extension cl-3401',
 'biceps curl cl-3102',
 'triceps extension cl-3103',
 'standing prone leg curl cl-3408',
 'back hyper cf-3663',
 'flat bench cf-3163',
 'flat incline bench cf-3160',
 'kneeling leg curl cf-3411',
 'olympic incline bench with storage cf-3172-a',
 'barbell rack cf-3465',
 'glute thrust cf-3416',
 '4-sided olympic plate tree cf-3444',
 '2-tier horizontal beauty bell rack cf-3462-2',
 'super flat decline bench cf-3162',
 'half rack cf-3365',
 'incline leverage row cf-3661-a',
 'dual action smith cf-3754',
 'utility stool cf-3950',
 'olympic plate tree cf-3443',
 '3-tier horizontal beauty bell rack cf-3462-3',
 'power squat cf-3359',
 'utility bench cf-3960',
 'military press cf-3860',
 '3-way olympic bench cf-2179-b',
 