In [1]:
import string
import os
import json
from renetti.ws.spiders.types import ScrapedEquipment
from typing import List, Tuple, Optional, TypedDict, Dict
from collections import defaultdict
from nltk.util import ngrams
from nltk.corpus import stopwords
import numpy as np
import webcolors
import html
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [2]:
lemmatizer = WordNetLemmatizer()

In [3]:
[lemmatizer.lemmatize(s, pos=wordnet.NOUN) for s in "prowler biceps sleds give".split(" ")]

['prowler', 'biceps', 'sled', 'give']

In [4]:
stop_words = set(stopwords.words('english'))

In [5]:
equipment_brands = {
    'mts',
    "monster lite",
    "monster",
    "r2",
 'abs company',
 'airex',
 'apus sports',
 'atlantis',
 'attack fitness',
 'bandbell',
 'plate loaded',
 'bear komplex',
 'bison designs',
 'chalk pot',
 'concept 2',
 'elite iso',
 'curve runner',
 'cybex',
 'drax',
 'dynepic sports',
 'eleiko',
 'escape fitness',
 'exigo',
 'future',
 'ghost',
 'glutebuilder®',
 'elite commercial',
 'goat tape',
 'gym gear',
 'half human',
 'hammer strength',
 'hatton boxing',
 'hoist',
 'hoist fitness',
 'hoist lemond series',
 'hybrid',
 'impulse',
 'indigo fitness',
 'inspire fitness',
 'home use',
 'jacobs ladder',
 'jerkfit',
 'jordan',
 'jordan fitness',
 'kabuki strength',
 'life fitness',
 'lifeline usa',
 'alpha commercial',
 'lionheart lifting',
 'matrix fitness',
 'mutant metals',
 'nautilus',
 'noble-pro',
 'nohrd',
 'premium line',
 'octane fitness',
 'oso',
 'per4m',
 'physical company',
 'pioneer',
 'precor',
 'primal strength',
 'primo',
 'proactive',
 'pulse fitness',
 'reebok',
 'rogue fitness',
 'rumbleroller',
 'schwinn',
 'scifit',
 'spirit',
 "welliv",
 "welliv pro",
 'spirit fitness',
 'spud inc',
 'stairmaster',
 'star trac',
 'stil-fit',
 'stroops',
 'super training products',
 'technogym',
 'throwdown',
 'torque usa',
 'true fitness',
 'uksf',
 'unbranded',
 'wolverson fitness',
 'woodway',
 'ziva',
 "club series",
 "circuit series",
 "club line"
 "prestera",
 "muscle",
 "axiom series", "club line", "dual series", "dual use", "dual series", 'elite commercial', 'elite series', 'integrity series', 'insignia series', 'performance series', 'pro series', 'series', 'signature series', 'console', 'touch', 'screen',
 "primal pro", "primal performance", "discover", 'hd athletic',
 'hd elite', 'signature', 'premium', "indoor", "wall mounted", "inclusive", 'plate-loaded', 'light commercial', 'iso lateral'
}

In [6]:
class Equipment(TypedDict):
    name: str
    image_links: List[str]
    mpn: Optional[List[str]]
    description: Optional[str]
    brands: Optional[List[str]]
    categories: Optional[List[str]]
    skus: Optional[List[str]]


class NGramFrequencyProp(TypedDict):
    one_grams: Dict[str, float]
    two_grams: Dict[str, float]
    three_grams: Dict[str, float]
    four_grams: Dict[str, float]

class EquipmentForAnalysis(TypedDict):
    name: str
    image_links: List[str]
    mpn: Optional[List[str]]
    description: Optional[str]
    brands: Optional[List[str]]
    categories: Optional[List[str]]
    skus: Optional[List[str]]
    one_grams: List[str]
    two_grams: List[str]
    three_grams: List[str]
    four_grams: List[str]
    n_gram_freq: Optional[NGramFrequencyProp]
    grouping_category: Optional[str]


In [7]:
scraped_data_file_paths = [
    f"{root}/scraped_data.json" for root, dirs, file_paths in os.walk("../files")
][1:] # remove root file dir

In [8]:
colors = set(webcolors.names() + ['graphite', 'zinc', 'color', 'colour'])

In [9]:
def is_a_number(token: str) -> bool:
    try:
        float(token)
        return True
    except ValueError:
        return False

In [10]:
def clean_equipment_name(name: str) -> str:
    # Decode HTML entities
    name = html.unescape(name)

    name = name.lower()
    name = name.replace('"', "").replace("'", "")

    # Remove punctuation
    pattern = f"[{re.escape(string.punctuation.replace("-", ""))}]"
    name = re.sub(pattern, "", name)

    # Remove weight/size units
    pattern = r"\b\d*\s*(kg|lbs?|cm|in|mm|ft)\b|\b\d+\s*(kg|lbs?|cm|in|mm|ft)\b"
    name = re.sub(pattern, "", name)

    # Remove unwanted characters, including isolated or numeric-bound hyphens
    name = re.sub(r"(?<![a-zA-Z0-9])-|-(?![a-zA-Z0-9])", "", name)
    name = re.sub(r'\s+', ' ', name).strip()

    # Remove brand names
    for brand in equipment_brands:
        name = name.replace(brand.lower(), "")

    # Tokenize, remove stop words, punctuation, colors, and numbers
    name_tokens = name.split(" ")
    filtered_tokens = [
        lemmatizer.lemmatize(token, pos=wordnet.NOUN) for token in name_tokens
        if (
            token not in stop_words and
            token not in colors and
            not is_a_number(token)
        )
    ]

    final_name = ' '.join(filtered_tokens)
    return final_name.replace(" - ", "-").strip().replace("   ", " ").replace("  ", " ").replace("triceps", "tricep").replace("biceps", "bicep").replace('elite commercial', '')


In [11]:
clean_equipment_name(
    "'Hello my name is Dan & with that I give you me()-(ppp) red blue green graphite 2.0 999 2.1311 life fitness 10kg kg &#x2b; colour color, balls hols 8ft 20mm / a-a 5-stack \"/" +
    "mm 25mm 25 MM"
)

'hello name dan give me-ppp ball hols a-a 5-stack'

In [12]:
equipment_data = defaultdict(list)
equipment_data_missing_fields = defaultdict(list)

for file_path in scraped_data_file_paths:
    with open(file=file_path) as f:
        data: List[ScrapedEquipment] = json.load(f)
        for equipment_obj in data:
            equipment_obj['name'] = clean_equipment_name(equipment_obj['name'])

            brands = equipment_obj["brands"]
            name = equipment_obj['name']

            if len(brands) != 1:
                equipment_data_missing_fields[name.title()].append(equipment_obj)
            else:
                brand = brands[0]
                unique_name = f"({brand.title()}) {name.title()}"
                equipment_data[unique_name].append(equipment_obj)

In [13]:
def merge_duplicates(objs: List[ScrapedEquipment]) -> Equipment:
    name = None
    image_links = set()
    mpn = set()
    description = None
    brands = set()
    categories = set()
    skus = set()

    for obj in objs:
        if name is None:
            name = obj["name"]

        image_links.update(obj["image_links"])
        if obj["mpn"]:
            mpn.add(obj["mpn"])

        brands.update([b.lower() for b in obj["brands"]] or [])
        categories.update([c.lower() for c in obj["categories"]] or [])
        skus.update(obj["skus"] or [])

        if not description or (obj["description"] and len(description) < len(obj["description"])):
            description = obj["description"]

    return {
        'name': name,
        'image_links': list(image_links),
        'mpn': list(map(str.title, mpn)),
        'description': description,
        'brands': list(map(str.title, brands)),
        'categories': list(map(str.title, categories)),
        'skus': list(skus)
    }

In [14]:
dupped_equipment = {k:v for k, v in equipment_data.items() if len(v) > 1}
print(len(dupped_equipment))

481


In [15]:
de_deupped_equipment = [merge_duplicates(objs=objs) for key, objs in equipment_data.items()]

In [16]:
with open("../files/cleaned_scraped_equipment.json", "w") as f:
    json.dump(de_deupped_equipment, f, indent=3)

In [17]:
def generate_n_grams(name: str) -> Tuple:
    tokens = name.split(" ")
    return [
        list(ngrams(tokens, n=n))
        for n in range(1, 5)
    ]

In [18]:
n_grams = defaultdict(lambda: defaultdict(float))
equipments_for_analysis: List[EquipmentForAnalysis] = []
for obj in de_deupped_equipment:
    one_grams, two_grams, three_grams, four_grams = generate_n_grams(name=obj["name"])
    equipment_for_analysis: EquipmentForAnalysis = {
        'name': obj['name'],
        'image_links': obj['image_links'],
        'mpn': obj['mpn'],
        'description': obj['description'],
        'brands': obj['brands'],
        'categories': obj['categories'],
        'skus': obj['skus'],
        'one_grams': one_grams,
        'two_grams': two_grams,
        'three_grams': three_grams,
        'four_grams': four_grams,
    }
    for gram in one_grams:
        n_grams['one_grams'][gram] += 1
    for gram in two_grams:
        n_grams['two_grams'][gram] += 1
    for gram in three_grams:
        n_grams['three_grams'][gram] += 1
    for gram in four_grams:
        n_grams['one_grams'][gram] += 1
    equipments_for_analysis.append(equipment_for_analysis)

In [19]:
for equip in equipments_for_analysis:
    equip['n_gram_freq'] = {
        'one_grams': {v:n_grams['one_grams'][v]/len(n_grams['one_grams']) for v in equip['one_grams']},
        'two_grams': {v:n_grams['two_grams'][v]/len(n_grams['two_grams']) for v in equip['two_grams']},
        'three_grams': {v:n_grams['three_grams'][v]/len(n_grams['three_grams']) for v in equip['three_grams']},
        'four_grams': {v:n_grams['four_grams'][v]/len(n_grams['four_grams']) for v in equip['four_grams']},
    }

In [20]:
scores = []
for equip in equipments_for_analysis:
    category = None

    two_grams = equip['n_gram_freq']['two_grams']
    three_grams = equip['n_gram_freq']['three_grams']
    four_grams = equip['n_gram_freq']['four_grams']

    for two_gram, score in two_grams.items():
        scores.append(score)

    for three_gram, score in three_grams.items():
        scores.append(score)

    for four_gram, score in four_grams.items():
        scores.append(score)

min_thres, max_thres = float(np.percentile(scores, 90)), float(np.percentile(scores, 100))
min_thres, max_thres

(0.003845167905665214, 0.021789284798769546)

In [21]:
maxes = []
for equip in equipments_for_analysis:
    category = None


    gram_max = None
    gram_max_score = 0

    two_grams = equip['n_gram_freq']['two_grams']
    three_grams = equip['n_gram_freq']['three_grams']
    four_grams = equip['n_gram_freq']['four_grams']

    for two_gram, score in two_grams.items():
        if score > gram_max_score:
            gram_max_score = score
            gram_max = two_gram

    for three_gram, score in three_grams.items():
        if score > gram_max_score:
            gram_max_score = score
            gram_max = three_gram

    for four_gram, score in four_grams.items():
        if score > gram_max_score:
            gram_max_score = score
            gram_max = four_gram

    if gram_max and gram_max_score >= min_thres:
        maxes.append(gram_max_score)
        equip['grouping_category'] = ' '.join(gram_max)
    else:
        equip['grouping_category'] = None

In [22]:
len([e['grouping_category'] for e in equipments_for_analysis])

3279

In [23]:
[(e['grouping_category'], e['name']) for e in equipments_for_analysis if 'xf sle' in e['name']]

[(None, 'xf sled prowler heavy'), (None, 'xf sled prowler light')]

In [24]:
len({e['grouping_category'] for e in equipments_for_analysis})

46

In [25]:
{(e['grouping_category']) for e in equipments_for_analysis}

{None,
 'abdominal crunch',
 'adjustable bench',
 'adjustable pulley',
 'back extension',
 'barbell rack',
 'bench press',
 'bicep curl',
 'bumper plate',
 'chest press',
 'chin dip',
 'cross trainer',
 'curl bar',
 'curl bench',
 'decline bench',
 'dumbbell rack',
 'ez curl',
 'flat bench',
 'functional trainer',
 'half rack',
 'hyper extension',
 'incline bench',
 'lat pulldown',
 'leg curl',
 'leg extension',
 'leg press',
 'low row',
 'olympic bar',
 'olympic bench',
 'olympic plate',
 'pec fly',
 'pin select',
 'plate storage',
 'power rack',
 'preacher curl',
 'pull-up bar',
 'rear delt',
 'recumbent bike',
 'seated row',
 'shoulder press',
 'smith machine',
 'squat rack',
 'squat stand',
 'storage rack',
 'tricep extension',
 'upright bike'}

In [26]:
category = defaultdict(int)
for equip in equipments_for_analysis:
    category[equip['grouping_category']] += 1

In [27]:
sorted_data = sorted(category.items(), key=lambda item: item[1], reverse=True)
sorted_data

[(None, 2033),
 ('leg curl', 84),
 ('chest press', 77),
 ('leg press', 66),
 ('lat pulldown', 65),
 ('shoulder press', 55),
 ('dumbbell rack', 50),
 ('leg extension', 42),
 ('bicep curl', 39),
 ('functional trainer', 35),
 ('half rack', 32),
 ('flat bench', 31),
 ('olympic bar', 31),
 ('power rack', 30),
 ('bumper plate', 27),
 ('low row', 27),
 ('adjustable pulley', 27),
 ('seated row', 27),
 ('incline bench', 26),
 ('decline bench', 26),
 ('cross trainer', 26),
 ('preacher curl', 24),
 ('abdominal crunch', 24),
 ('recumbent bike', 24),
 ('pec fly', 23),
 ('olympic bench', 23),
 ('pull-up bar', 23),
 ('adjustable bench', 23),
 ('smith machine', 23),
 ('squat stand', 22),
 ('back extension', 22),
 ('upright bike', 22),
 ('chin dip', 21),
 ('tricep extension', 18),
 ('curl bar', 18),
 ('barbell rack', 15),
 ('olympic plate', 15),
 ('squat rack', 15),
 ('hyper extension', 15),
 ('storage rack', 14),
 ('plate storage', 11),
 ('pin select', 9),
 ('bench press', 8),
 ('ez curl', 5),
 ('rear

In [28]:
# Things I could improve
    # Single and Triple n-gram selection
    # There's very few of these. E.g. Treadmill or "plate loaded row" (We did remove plate loaded but this is an example)
    # We should come up with a mechanism of including these to improve categories later on|