In [11]:
import os
import json
from renetti.ws.spiders.types import ScrapedEquipment
from typing import List, Tuple, Optional, TypedDict, Dict
from collections import defaultdict
from nltk.util import ngrams

In [12]:
class Equipment(TypedDict):
    name: str
    image_links: List[str]
    mpn: Optional[List[str]]
    description: Optional[str]
    brands: Optional[List[str]]
    categories: Optional[List[str]]
    skus: Optional[List[str]]


class NGramFrequencyProp(TypedDict):
    one_grams: Dict[str, float]
    two_grams: Dict[str, float]
    three_grams: Dict[str, float]
    four_grams: Dict[str, float]

class EquipmentForAnalysis(TypedDict):
    name: str
    image_links: List[str]
    mpn: Optional[List[str]]
    description: Optional[str]
    brands: Optional[List[str]]
    categories: Optional[List[str]]
    skus: Optional[List[str]]
    one_grams: List[str]
    two_grams: List[str]
    three_grams: List[str]
    four_grams: List[str]
    n_gram_freq: Optional[NGramFrequencyProp]
    grouping_category: Optional[str]


In [13]:
scraped_data_file_paths = [
    f"{root}/scraped_data.json" for root, dirs, file_paths in os.walk("../files")
][1:] # remove root file dir

In [14]:
equipment_data = defaultdict(list)
equipment_data_missing_fields = defaultdict(list)

for file_path in scraped_data_file_paths:
    with open(file=file_path) as f:
        data: List[ScrapedEquipment] = json.load(f)
        for equipment_obj in data:
            brands = equipment_obj["brands"]
            name = equipment_obj['name'].lower()

            if len(brands) != 1:
                equipment_data_missing_fields[name.title()].append(equipment_obj)
            else:
                brand = brands[0]
                unique_name = f"({brand.title()}) {name.title()}"
                equipment_data[unique_name].append(equipment_obj)

In [15]:
def merge_duplicates(objs: List[ScrapedEquipment]) -> Equipment:
    name = None
    image_links = set()
    mpn = set()
    description = None
    brands = set()
    categories = set()
    skus = set()

    for obj in objs:
        if name is None:
            name = obj["name"]

        image_links.update(obj["image_links"])
        if obj["mpn"]:
            mpn.add(obj["mpn"])

        brands.update([b.lower() for b in obj["brands"]] or [])
        categories.update([c.lower() for c in obj["categories"]] or [])
        skus.update(obj["skus"] or [])

        if not description or (obj["description"] and len(description) < len(obj["description"])):
            description = obj["description"]

    return {
        'name': name,
        'image_links': list(image_links),
        'mpn': list(map(str.title, mpn)),
        'description': description,
        'brands': list(map(str.title, brands)),
        'categories': list(map(str.title, categories)),
        'skus': list(skus)
    }

In [16]:
dupped_equipment = {k:v for k, v in equipment_data.items() if len(v) > 1}
print(len(dupped_equipment))

248


In [17]:
de_deupped_equipment = [merge_duplicates(objs=objs) for key, objs in equipment_data.items()]

In [18]:
with open("../files/cleaned_scraped_equipment.json", "w") as f:
    json.dump(de_deupped_equipment, f, indent=3)

In [19]:
def generate_n_grams(name: str) -> Tuple:
    tokens = name.split(" ")
    return [
        list(ngrams(tokens, n=n))
        for n in range(1, 5)
    ]

In [20]:
n_grams = defaultdict(lambda: defaultdict(float))
equipments_for_analysis: List[EquipmentForAnalysis] = []
for obj in de_deupped_equipment:
    one_grams, two_grams, three_grams, four_grams = generate_n_grams(name=obj["name"])
    equipment_for_analysis: EquipmentForAnalysis = {
        'name': obj['name'],
        'image_links': obj['image_links'],
        'mpn': obj['mpn'],
        'description': obj['description'],
        'brands': obj['brands'],
        'categories': obj['categories'],
        'skus': obj['skus'],
        'one_grams': one_grams,
        'two_grams': two_grams,
        'three_grams': three_grams,
        'four_grams': four_grams,
    }
    for gram in one_grams:
        n_grams['one_grams'][gram] += 1
    for gram in two_grams:
        n_grams['two_grams'][gram] += 1
    for gram in three_grams:
        n_grams['three_grams'][gram] += 1
    for gram in four_grams:
        n_grams['one_grams'][gram] += 1
    equipments_for_analysis.append(equipment_for_analysis)

In [21]:
for equip in equipments_for_analysis:
    equip['n_gram_freq'] = {
        'one_grams': {v:n_grams['one_grams'][v]/len(n_grams['one_grams']) for v in equip['one_grams']},
        'two_grams': {v:n_grams['two_grams'][v]/len(n_grams['two_grams']) for v in equip['two_grams']},
        'three_grams': {v:n_grams['three_grams'][v]/len(n_grams['three_grams']) for v in equip['three_grams']},
        'four_grams': {v:n_grams['four_grams'][v]/len(n_grams['four_grams']) for v in equip['four_grams']},
    }

In [22]:
for equip in equipments_for_analysis:
    category = None

    two_gram_max = None
    three_gram_max = None
    two_gram_max_score = 0
    three_gram_max_score = 0

    two_grams = equip['n_gram_freq']['two_grams']
    three_grams = equip['n_gram_freq']['three_grams']
    if len(two_grams) == 0:
        equip['grouping_category'] = None
        continue

    for two_gram, score in two_grams.items():
        if score > two_gram_max_score:
            two_gram_max_score = score
            two_gram_max = two_gram

    for three_gram, score in three_grams.items():
        if score > three_gram_max_score:
            three_gram_max_score = score
            three_gram_max = two_gram

    if two_gram_max_score > three_gram_max_score:
        equip['grouping_category'] = ' '.join(two_gram_max)
    else:
        equip['grouping_category'] = ' '.join(three_gram_max)

In [24]:
[e['grouping_category'] for e in equipments_for_analysis]

['Outer Thigh',
 'Pec Fly',
 'Abdominals CL-3601',
 'Leg Press',
 'Chest Press',
 'Shoulder Press',
 'Leg Curl',
 'Lateral Raise',
 'Mid Row',
 'Lat Pulldown',
 'Leg Extension',
 'Biceps Curl',
 'Triceps Extension',
 'Leg Curl',
 'Back Hyper',
 'Flat Bench',
 'Flat/Incline Bench',
 'Leg Curl',
 'Incline Bench',
 'Barbell Rack',
 'Glute Thrust',
 'Olympic Plate',
 '2-Tier Horizontal',
 'Super Flat/Decline',
 'Half Rack',
 'Incline Leverage',
 'Dual Action',
 'Utility Stool',
 'Olympic Plate',
 '3-Tier Horizontal',
 'Power Squat',
 'Utility Bench',
 'Military Press',
 'Olympic Bench',
 'Fitness Tree',
 'Squat Rack',
 'Super Flat/Incline/Decline',
 'Dumbbell Rack',
 'Adjustable Decline',
 'Decline Bench',
 'Dumbbell Rack',
 'Preacher Curl',
 'Leg Press',
 'Flat Bench',
 'Accessory Rack',
 'Power Cage',
 'Dual Angle',
 'Dumbbell Rack',
 'Hack Squat',
 '/ Dip',
 'Preacher Curl',
 'HOIST LeMond',
 'REVMASTER PRO',
 'HOIST LeMond',
 'HOIST LeMond',
 'Inner/Outer Thigh',
 'Leg Press/Calf',
 'C