<a href="https://colab.research.google.com/github/Damjanv1/Sideprojects/blob/main/Workouts_Wayback_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install libraries needed
pip install requests beautifulsoup4 pandas



# Workouts (WOD)

In [None]:
####### this is the code that ran
### code to scraoe data for timestamp 20180210001310 (return and change to get other workouts)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

class WorkoutScraper:
    def __init__(self):
        self.base_url = "https://web.archive.org/web/"
        self.workouts = []
        self.categories = {
            '94': 'Strength WODs',
            '95': 'Skill WODs',
            '96': 'Flexibility WODs',
            '97': 'MetCon WODs'
        }
        self.timestamps = [
            "20180210001310",
            "20180818133346",
            "20130806050456",
            "20130801000000",
            "20130701000000"
        ]

    def get_valid_timestamp(self, focus_tid):
        """Find a working timestamp for this category"""
        for timestamp in self.timestamps:
            url = f"{self.base_url}{timestamp}/http://gymnasticswod.com/wod?field_focus_tid={focus_tid}"
            try:
                print(f"Testing timestamp {timestamp} for category {focus_tid}...")
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    print(f"Found working timestamp: {timestamp}")
                    return timestamp
                time.sleep(5)  # Delay between timestamp tests
            except requests.RequestException:
                continue
        return self.timestamps[0]  # Default to first timestamp if none work

    def get_page_content(self, url, retry_count=3):
        """Get page content with error handling"""
        for attempt in range(retry_count):
            try:
                print(f"Fetching: {url}")
                response = requests.get(url, timeout=15)
                response.raise_for_status()
                if response.status_code == 200:
                    return response.text
            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt + 1 == retry_count:
                    print(f"Failed to fetch {url}")
                    return None
                delay = (2 ** attempt) * 5 + random.uniform(1, 5)
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)
        return None

    def clean_workout_text(self, text):
        """Clean up the workout text"""
        text = text.replace('Workout', '')
        text = text.replace('WorkoutFor', 'For')
        text = text.split('Add new comment')[0]
        text = text.split('Post your experience')[0]
        lines = [line.strip() for line in text.split('\n')]
        lines = [line for line in lines if line and not line.isspace()]
        return '\n'.join(lines)

    def parse_workout(self, soup, category):
        """Parse workout information from the page"""
        workouts = []
        sections = soup.find_all(['h2', 'h3'])

        for section in sections:
            if any(day in section.text.lower() for day in ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']):
                date = section.text.strip()
                workout_content = None
                current = section.find_next_sibling()

                while current and not workout_content:
                    if 'Workout' in current.text:
                        workout_content = current.text
                    current = current.find_next_sibling()

                if workout_content:
                    clean_content = self.clean_workout_text(workout_content)

                    workout_type = ''
                    if 'FOR QUALITY' in clean_content.upper():
                        workout_type = 'FOR QUALITY'
                    elif 'ROUNDS' in clean_content.upper():
                        workout_type = 'ROUNDS'
                    elif 'AMRAP' in clean_content.upper():
                        workout_type = 'AMRAP'
                    elif 'FOR TIME' in clean_content.upper():
                        workout_type = 'FOR TIME'

                    workout = {
                        'date': date,
                        'category': category,
                        'workout_type': workout_type,
                        'workout': clean_content
                    }

                    workouts.append(workout)
                    print(f"Found workout for {date}")

        return workouts

    def scrape_category(self, focus_tid, category_name):
        """Scrape all pages for a given category"""
        timestamp = self.get_valid_timestamp(focus_tid)
        page = 0
        category_count = 0

        while True:
            base_url = f"http://gymnasticswod.com/wod?field_focus_tid={focus_tid}"
            if page > 0:
                base_url += f"&sort_by=created&sort_order=DESC&page={page}"

            url = f"{self.base_url}{timestamp}/{base_url}"
            print(f"\nProcessing {category_name} - Page {page}")

            content = self.get_page_content(url)
            if not content:
                break

            soup = BeautifulSoup(content, 'html.parser')
            page_workouts = self.parse_workout(soup, category_name)

            if not page_workouts:
                print(f"No more workouts found on page {page}")
                break

            self.workouts.extend(page_workouts)
            category_count += len(page_workouts)
            print(f"Found {len(page_workouts)} workouts on page {page}")
            print(f"Total {category_name} workouts: {category_count}")

            next_page = soup.find('a', text='next ›')
            if not next_page:
                print("No more pages found")
                break

            page += 1
            delay = random.uniform(8, 12)
            print(f"Waiting {delay:.1f} seconds before next page...")
            time.sleep(delay)

    def scrape_all_categories(self):
        """Scrape workouts from all categories"""
        for focus_tid, category_name in self.categories.items():
            print(f"\nScraping {category_name}...")
            initial_count = len(self.workouts)
            self.scrape_category(focus_tid, category_name)
            final_count = len(self.workouts)
            print(f"Added {final_count - initial_count} {category_name}")

            delay = random.uniform(15, 20)
            print(f"Waiting {delay:.1f} seconds before next category...")
            time.sleep(delay)

    def save_to_csv(self, filename='workouts.csv'):
        """Save workouts to CSV file"""
        df = pd.DataFrame(self.workouts)
        df.to_csv(filename, index=False)
        print(f"\nSaved {len(self.workouts)} workouts to {filename}")
        print("\nWorkouts by category:")
        print(df['category'].value_counts())
        print("\nWorkout types:")
        print(df['workout_type'].value_counts())

def main():
    scraper = WorkoutScraper()
    try:
        print("Starting full workout scrape...")
        scraper.scrape_all_categories()
        scraper.save_to_csv()

    except KeyboardInterrupt:
        print("\nScraping interrupted by user!")
        if scraper.workouts:
            scraper.save_to_csv('workouts_partial.csv')
            print("Partial results saved!")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        if scraper.workouts:
            scraper.save_to_csv('workouts_error.csv')
            print("Partial results saved due to error!")

if __name__ == "__main__":
    main()

Starting full workout scrape...

Scraping Strength WODs...
Testing timestamp 20180210001310 for category 94...
Found working timestamp: 20180210001310

Processing Strength WODs - Page 0
Fetching: https://web.archive.org/web/20180210001310/http://gymnasticswod.com/wod?field_focus_tid=94
Found workout for Tue 12/29
Found workout for Fri 12/25
Found workout for Mon 12/21
Found workout for Thu 12/17
Found workout for Sun 12/13
Found workout for Wed 12/09
Found workout for Sat 12/05
Found workout for Tue 12/01
Found workout for Fri 11/27
Found workout for Mon 11/23
Found 10 workouts on page 0
Total Strength WODs workouts: 10
Waiting 8.1 seconds before next page...


  next_page = soup.find('a', text='next ›')



Processing Strength WODs - Page 1
Fetching: https://web.archive.org/web/20180210001310/http://gymnasticswod.com/wod?field_focus_tid=94&sort_by=created&sort_order=DESC&page=1
Found workout for Thu 11/19
Found workout for Sun 11/15
Found workout for Wed 11/11
Found workout for Sat 11/07
Found workout for Tue 11/03
Found workout for Fri 10/30
Found workout for Mon 10/25
Found workout for Thu 10/22
Found workout for Sun 10/18
Found workout for Wed 10/14
Found 10 workouts on page 1
Total Strength WODs workouts: 20
Waiting 8.1 seconds before next page...

Processing Strength WODs - Page 2
Fetching: https://web.archive.org/web/20180210001310/http://gymnasticswod.com/wod?field_focus_tid=94&sort_by=created&sort_order=DESC&page=2
Found workout for Sat 10/10
Found workout for Tue 10/06
Found workout for Fri 10/02
Found workout for Mon 09/28
Found workout for Thu 09/24
Found workout for Sun 09/20
Found workout for Wed 09/16
Found workout for Sat 09/12
Found workout for Tue 09/08
Found workout for

Check the quality of the data produced by the scraping script

In [None]:
import os
import pandas as pd

# First, let's check if the file exists in the current directory
print("Checking current directory...")
print(f"Current working directory: {os.getcwd()}")
print("\nFiles in current directory:")
for file in os.listdir():
    if file.endswith('.csv'):
        print(f"Found CSV: {file} (Size: {os.path.getsize(file)} bytes)")

        # If we find workouts.csv, let's look at its contents
        if file == 'workouts.csv':
            df = pd.read_csv(file)
            print(f"\nWorkouts.csv contains {len(df)} rows")
            print("\nColumns:", df.columns.tolist())
            print("\nFirst few rows:")
            print(df.head())

            # Show distribution of workout categories
            print("\nWorkouts by category:")
            print(df['category'].value_counts())

Checking current directory...
Current working directory: /content

Files in current directory:
Found CSV: workouts.csv (Size: 126414 bytes)

Workouts.csv contains 616 rows

Columns: ['date', 'category', 'workout_type', 'workout']

First few rows:
        date       category workout_type  \
0  Tue 12/29  Strength WODs  FOR QUALITY   
1  Fri 12/25  Strength WODs  FOR QUALITY   
2  Mon 12/21  Strength WODs  FOR QUALITY   
3  Thu 12/17  Strength WODs  FOR QUALITY   
4  Sun 12/13  Strength WODs  FOR QUALITY   

                                             workout  
0  FOR QUALITY: 20 Meter Handstand Walk Laterals ...  
1  For QUALITYPerform In Any Order:30 Pull Up in ...  
2  For QUALITY3 ROUNDS: 6 V Outs  6  Ring Straigh...  
3  FOR QUALITY: Accumulate 100 Meter Handstand Walks  
4  For QUALITY3 ROUNDS: 20  Back Leg Swings  Left...  

Workouts by category:
category
Strength WODs       170
Flexibility WODs    170
Skill WODs          147
MetCon WODs         129
Name: count, dtype: int64
Foun

In [None]:
## more code to view the results and see wether everything is as expected before moving to G Sheets
import pandas as pd

def check_data():
    print("Reading workouts.csv...")
    df = pd.read_csv('workouts.csv')

    print(f"\nTotal workouts: {len(df)}")

    print("\nWorkouts by category:")
    print(df['category'].value_counts())

    print("\nWorkout types:")
    print(df['workout_type'].value_counts())

    print("\nDate range:")
    print(f"Earliest: {df['date'].min()}")
    print(f"Latest: {df['date'].max()}")

    print("\nSample workout from each category:")
    for category in df['category'].unique():
        sample = df[df['category'] == category].iloc[0]
        print(f"\n{category}:")
        print(f"Date: {sample['date']}")
        print(f"Type: {sample['workout_type']}")
        print("Workout preview:")
        print(sample['workout'][:200], "...")

check_data()

Reading workouts.csv...

Total workouts: 616

Workouts by category:
category
Strength WODs       170
Flexibility WODs    170
Skill WODs          147
MetCon WODs         129
Name: count, dtype: int64

Workout types:
workout_type
FOR QUALITY    161
ROUNDS          72
FOR TIME        50
AMRAP           19
Name: count, dtype: int64

Date range:
Earliest: Fri 01/02
Latest: Wed 12/31

Sample workout from each category:

Strength WODs:
Date: Tue 12/29
Type: FOR QUALITY
Workout preview:
FOR QUALITY: 20 Meter Handstand Walk Laterals  Accumulate 3 Minutes Of L Seats  Accumulate 3 Minutes Of Supermans  Accumulate 3 Minutes Of L Hangs ...

Skill WODs:
Date: Mon 12/28
Type: nan
Workout preview:
PRACTICE for 10 minutes:Roll to Handstand Push Up Progression Pt.1  Roll to Handstand Push Up Progression Pt.2  Roll to Handstand Push Up Progression Pt.3 ...

Flexibility WODs:
Date: Thu 12/31
Type: nan
Workout preview:
FLEXIBILITY:Happy New Year! Accumulate 5 Minutes Of   Bridge Ups  Hold, Every Time You B

code to run to get into G sheets

In [None]:
## code used to get the workouts.csv reformatted and inserted into G SHeets
def upload_to_sheets():
    print("Setting up Google Sheets access...")
    drive.mount('/content/drive', force_remount=True)

    auth.authenticate_user()
    creds, _ = default()
    gc = gspread.authorize(creds)

    print("Reading and cleaning workout data...")
    df = pd.read_csv('workouts.csv')

    # Clean NaN values
    df['workout_type'] = df['workout_type'].fillna('Unspecified')
    df['workout'] = df['workout'].fillna('')

    print("\nCreating main spreadsheet...")
    workbook = gc.create('Gymnastics WODs Database')

    print("Creating Overview sheet...")
    overview = workbook.sheet1
    overview.update_title('Overview')

    overview_data = [
        ['Gymnastics WODs Database'],
        [''],
        ['Total Workouts:', str(len(df))],
        ['Date Range:', f"{df['date'].min()} to {df['date'].max()}"],
        [''],
        ['Workouts by Category'],
        ['Category', 'Count']
    ]

    category_counts = df['category'].value_counts()
    for category, count in category_counts.items():
        overview_data.append([category, str(count)])

    overview_data.extend([
        [''],
        ['Workout Types Distribution'],
        ['Type', 'Count']
    ])

    type_counts = df['workout_type'].value_counts()
    for wtype, count in type_counts.items():
        overview_data.append([wtype, str(count)])

    print("Updating Overview sheet...")
    overview.update('A1', overview_data)

    print("\nCreating All Workouts sheet...")
    all_workouts = workbook.add_worksheet(title='All Workouts', rows=len(df)+1, cols=4)

    # Prepare headers and data
    headers = ['Date', 'Category', 'Type', 'Workout']

    # Convert DataFrame to list of lists
    all_data = [headers]
    all_data.extend(df[['date', 'category', 'workout_type', 'workout']].fillna('').values.tolist())

    print(f"Uploading data to All Workouts sheet...")
    all_workouts.update('A1', all_data)

    # Create category sheets
    for category in df['category'].unique():
        print(f"\nCreating sheet for {category}...")
        category_df = df[df['category'] == category]

        sheet_name = category.replace(' ', '_')
        category_sheet = workbook.add_worksheet(title=sheet_name, rows=len(category_df)+1, cols=3)

        # Prepare category data
        category_data = [['Date', 'Type', 'Workout']]
        category_data.extend(category_df[['date', 'workout_type', 'workout']].fillna('').values.tolist())

        print(f"Uploading data to {category} sheet...")
        category_sheet.update('A1', category_data)

    print("\nSpreadsheet creation complete!")
    print(f"Spreadsheet URL: {workbook.url}")

    # Share the spreadsheet
    workbook.share(None, perm_type='anyone', role='reader')

    return workbook.url

try:
    print("Starting Google Sheets upload...")
    spreadsheet_url = upload_to_sheets()
    print(f"\nYour spreadsheet is available at: {spreadsheet_url}")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    traceback.print_exc()


Starting Google Sheets upload...
Setting up Google Sheets access...
Mounted at /content/drive
Reading and cleaning workout data...

Creating main spreadsheet...
Creating Overview sheet...
Updating Overview sheet...


  overview.update('A1', overview_data)



Creating All Workouts sheet...
Uploading data to All Workouts sheet...


  all_workouts.update('A1', all_data)



Creating sheet for Strength WODs...
Uploading data to Strength WODs sheet...


  category_sheet.update('A1', category_data)



Creating sheet for Skill WODs...
Uploading data to Skill WODs sheet...


  category_sheet.update('A1', category_data)



Creating sheet for Flexibility WODs...
Uploading data to Flexibility WODs sheet...


  category_sheet.update('A1', category_data)



Creating sheet for MetCon WODs...
Uploading data to MetCon WODs sheet...


  category_sheet.update('A1', category_data)



Spreadsheet creation complete!
Spreadsheet URL: https://docs.google.com/spreadsheets/d/1uQPeoOtOE5Yt3iKXuWyGQwnfCEo6h74LXYoUxuKnt5w

Your spreadsheet is available at: https://docs.google.com/spreadsheets/d/1uQPeoOtOE5Yt3iKXuWyGQwnfCEo6h74LXYoUxuKnt5w


# Movements

In [None]:
# first set of scripts was for the workouts - this script is for the exercise instructions and corresponding videos
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from urllib.parse import urljoin

class MovementScraper:
    def __init__(self):
        self.wayback_base = "https://web.archive.org"
        self.base_url = f"{self.wayback_base}/web/20180204210343/http://gymnasticswod.com/library"
        self.movements = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def get_page_content(self, url, retry_count=3):
        """Get page content with error handling"""
        # Ensure we have a full URL
        if not url.startswith('http'):
            url = urljoin(self.wayback_base, url)

        for attempt in range(retry_count):
            try:
                print(f"Fetching: {url}")
                response = requests.get(url, headers=self.headers, timeout=15)
                response.raise_for_status()
                if response.status_code == 200:
                    return response.text
            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt + 1 == retry_count:
                    print(f"Failed to fetch {url}")
                    return None
                delay = (2 ** attempt) * 5 + random.uniform(1, 5)
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)
        return None

    def extract_video_source(self, movement_url):
        """Extract video source from movement page"""
        content = self.get_page_content(movement_url)
        if not content:
            return None

        soup = BeautifulSoup(content, 'html.parser')
        video_iframe = soup.find('iframe')
        if video_iframe and 'src' in video_iframe.attrs:
            return video_iframe['src']
        return None

    def parse_metadata(self, metadata_text):
        """Parse category, apparatus, and movement from metadata string"""
        metadata = {
            'category': '',
            'apparatus': '',
            'movement_type': ''
        }

        if not metadata_text:
            return metadata

        try:
            parts = metadata_text.split('|')
            for part in parts:
                part = part.strip()
                if 'Category:' in part:
                    metadata['category'] = part.split('Category:')[1].strip()
                elif 'Apparatus:' in part:
                    metadata['apparatus'] = part.split('Apparatus:')[1].strip()
                elif 'Movement:' in part:
                    metadata['movement_type'] = part.split('Movement:')[1].strip()
        except Exception as e:
            print(f"Error parsing metadata: {str(e)}")
            print(f"Metadata text: {metadata_text}")

        return metadata

    def parse_movement(self, movement_row):
        """Parse individual movement information"""
        try:
            # Find the movement title
            title = movement_row.find('a')
            if not title:
                print("No title link found")
                return None

            movement_title = title.text.strip()
            movement_url = title['href']
            print(f"Found movement: {movement_title}")
            print(f"URL: {movement_url}")

            # Extract instructions
            instructions = []
            instruction_list = movement_row.find('ul')
            if instruction_list:
                instructions = [li.text.strip() for li in instruction_list.find_all('li')]

            # Get metadata
            metadata_div = movement_row.find(lambda tag: tag.name == 'div' and
                                          tag.text and
                                          ('Category:' in tag.text or
                                           'Apparatus:' in tag.text or
                                           'Movement:' in tag.text))
            metadata = self.parse_metadata(metadata_div.text if metadata_div else '')

            # Get video source
            video_src = self.extract_video_source(movement_url)

            movement_data = {
                'title': movement_title,
                'instructions': '\n'.join(instructions),
                'movement_url': movement_url,
                'video_src': video_src,
                'category': metadata['category'],
                'apparatus': metadata['apparatus'],
                'movement_type': metadata['movement_type']
            }

            print(f"Successfully parsed movement: {movement_title}")
            return movement_data

        except Exception as e:
            print(f"Error parsing movement: {str(e)}")
            return None

    def scrape_movements(self, max_pages=None):
        """Scrape all movements from the library"""
        page = 0

        while True:
            if max_pages is not None and page >= max_pages:
                print(f"Reached maximum number of pages ({max_pages})")
                break

            url = self.base_url
            if page > 0:
                url += f"?page={page}"

            print(f"\nProcessing page {page}")
            content = self.get_page_content(url)

            if not content:
                break

            soup = BeautifulSoup(content, 'html.parser')

            # Find the view-content container
            view_content = soup.find('div', class_='view-content')
            if not view_content:
                print("No view-content container found")
                break

            # Find all movement rows
            movement_rows = view_content.find_all('li', class_=lambda x: x and 'views-row' in x)

            print(f"\nFound {len(movement_rows)} potential movement rows")

            if not movement_rows:
                print("No movements found on this page")
                break

            movements_added = 0
            for row in movement_rows:
                movement_data = self.parse_movement(row)
                if movement_data:
                    self.movements.append(movement_data)
                    movements_added += 1

            print(f"Added {movements_added} movements from page {page}")
            print(f"Total movements scraped: {len(self.movements)}")

            # Check for next page
            next_page = soup.find('a', string='next ›')
            if not next_page:
                print("No more pages found")
                break

            page += 1
            delay = random.uniform(8, 12)
            print(f"Waiting {delay:.1f} seconds before next page...")
            time.sleep(delay)

    def save_to_csv(self, filename='movements.csv'):
        """Save movements to CSV file"""
        if not self.movements:
            print("No movements to save!")
            return

        df = pd.DataFrame(self.movements)
        df.to_csv(filename, index=False)
        print(f"\nSaved {len(self.movements)} movements to {filename}")

        if 'category' in df.columns:
            print("\nMovements by category:")
            print(df['category'].value_counts())

        if 'apparatus' in df.columns:
            print("\nMovements by apparatus:")
            print(df['apparatus'].value_counts())

def main():
    scraper = MovementScraper()
    try:
        print("Starting movement library scrape...")
        # Start with just 2 pages for testing
        scraper.scrape_movements(max_pages=2)
        scraper.save_to_csv()

    except KeyboardInterrupt:
        print("\nScraping interrupted by user!")
        if scraper.movements:
            scraper.save_to_csv('movements_partial.csv')
            print("Partial results saved!")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        if scraper.movements:
            scraper.save_to_csv('movements_error.csv')
            print("Partial results saved due to error!")

if __name__ == "__main__":
    main()

Starting movement library scrape...

Processing page 0
Fetching: https://web.archive.org/web/20180204210343/http://gymnasticswod.com/library

Found 10 potential movement rows
Found movement: Back Leg Swing
URL: /web/20180204210343/http://gymnasticswod.com/content/back-leg-swing
Fetching: https://web.archive.org/web/20180204210343/http://gymnasticswod.com/content/back-leg-swing
Successfully parsed movement: Back Leg Swing
Found movement: Back Lever
URL: /web/20180204210343/http://gymnasticswod.com/content/back-lever
Fetching: https://web.archive.org/web/20180204210343/http://gymnasticswod.com/content/back-lever
Successfully parsed movement: Back Lever
Found movement: Back Tuck
URL: /web/20180204210343/http://gymnasticswod.com/content/back-tuck
Fetching: https://web.archive.org/web/20180204210343/http://gymnasticswod.com/content/back-tuck
Successfully parsed movement: Back Tuck
Found movement: Back Uprise from Dip Swing
URL: /web/20180204210343/http://gymnasticswod.com/content/back-upris

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
from urllib.parse import urljoin

class MovementScraper:
    def __init__(self):
        self.wayback_base = "https://web.archive.org"
        self.base_url = f"{self.wayback_base}/web/20180202184709/http://gymnasticswod.com/library"
        self.movements = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive'
        }
        self.session = requests.Session()
        self.last_request_time = 0
        self.min_request_delay = 15  # Increased base delay

    def wait_between_requests(self):
        """Ensure minimum delay between requests with some randomization"""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_request_delay:
            wait_time = self.min_request_delay - elapsed + random.uniform(1, 5)
            time.sleep(wait_time)
        self.last_request_time = time.time()

    def get_page_content(self, url, retry_count=5, initial_delay=30):
        """Get page content with improved error handling and rate limiting"""
        if not url.startswith('http'):
            url = urljoin(self.wayback_base, url)

        for attempt in range(retry_count):
            try:
                self.wait_between_requests()
                print(f"Fetching: {url}")
                response = self.session.get(url, headers=self.headers, timeout=30)

                if response.status_code == 429:  # Too Many Requests
                    delay = initial_delay * (attempt + 1)  # Increase delay with each attempt
                    print(f"Rate limited. Waiting {delay} seconds...")
                    time.sleep(delay)
                    continue

                response.raise_for_status()
                if response.status_code == 200:
                    return response.text

            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt + 1 == retry_count:
                    print(f"Failed to fetch {url}")
                    return None

                delay = initial_delay * (2 ** attempt)  # Exponential backoff
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)

        return None

    def extract_video_source(self, movement_url):
        """Extract video source from movement detail page"""
        content = self.get_page_content(movement_url)
        if not content:
            return ''

        soup = BeautifulSoup(content, 'html.parser')

        # Find all iframes
        iframes = soup.find_all('iframe')
        print(f"Found {len(iframes)} iframes")

        for iframe in iframes:
            src = iframe.get('src', '')
            if not src or 'donate' in src:  # Skip donation iframe
                continue

            # Make sure we have a full URL
            if not src.startswith('http'):
                src = urljoin(self.wayback_base, src)

            # First try to get small size, then medium if small isn't available
            if 'size=small' in src:
                print(f"Found small video source: {src}")
                return src
            elif 'size=medium' in src:
                print(f"Found medium video source: {src}")
                return src

        return ''

    def parse_metadata(self, metadata_text):
        """Parse category, apparatus, and movement from metadata string"""
        metadata = {
            'category': '',
            'apparatus': '',
            'movement_type': ''
        }

        if not metadata_text:
            return metadata

        try:
            parts = metadata_text.split('|')
            for part in parts:
                part = part.strip()
                if 'Category:' in part:
                    metadata['category'] = part.split('Category:')[1].strip()
                elif 'Apparatus:' in part:
                    metadata['apparatus'] = part.split('Apparatus:')[1].strip()
                elif 'Movement:' in part:
                    metadata['movement_type'] = part.split('Movement:')[1].strip()
        except Exception as e:
            print(f"Error parsing metadata: {str(e)}")

        return metadata

    def parse_movement(self, movement_row):
        """Parse individual movement information"""
        try:
            # Find the movement title
            title = movement_row.find('a')
            if not title:
                return None

            movement_title = title.text.strip()
            movement_url = title['href']
            print(f"\nFound movement: {movement_title}")

            # Extract instructions
            instructions = []
            instruction_list = movement_row.find('ul')
            if instruction_list:
                instructions = [li.text.strip() for li in instruction_list.find_all('li')]
            else:
                # Check for text in field-item
                field_item = movement_row.find('div', class_='field-item')
                if field_item:
                    text = field_item.get_text(strip=True)
                    if text:
                        instructions = [text]

            # Get metadata
            metadata_div = movement_row.find(lambda tag: tag.name == 'div' and
                                          tag.text and
                                          ('Category:' in tag.text or
                                           'Apparatus:' in tag.text or
                                           'Movement:' in tag.text))

            metadata = self.parse_metadata(metadata_div.text if metadata_div else '')

            # Get video source
            full_url = urljoin(self.wayback_base, movement_url)
            video_src = self.extract_video_source(full_url)

            movement_data = {
                'title': movement_title,
                'instructions': '\n'.join(instructions),
                'movement_url': movement_url,
                'category': metadata['category'],
                'apparatus': metadata['apparatus'],
                'movement_type': metadata['movement_type'],
                'video_src': video_src
            }

            print(f"Successfully parsed movement: {movement_title}")
            return movement_data

        except Exception as e:
            print(f"Error parsing movement: {str(e)}")
            return None

    def scrape_movements(self, max_pages=None, start_page=0):
        """Scrape all movements with improved error handling"""
        page = start_page
        empty_page_count = 0
        consecutive_error_count = 0

        while True:
            if max_pages is not None and page >= start_page + max_pages:
                print(f"Reached maximum number of pages ({max_pages})")
                break

            url = self.base_url
            if page > 0:
                url += f"?page={page}"

            print(f"\nProcessing page {page}")
            content = self.get_page_content(url)

            if not content:
                consecutive_error_count += 1
                if consecutive_error_count >= 3:
                    print("Too many consecutive errors, stopping scrape")
                    break
                page += 1
                continue
            else:
                consecutive_error_count = 0

            soup = BeautifulSoup(content, 'html.parser')

            # Find all movement rows
            movement_rows = soup.find_all('li', class_=lambda x: x and 'views-row' in x)

            print(f"\nFound {len(movement_rows)} potential movement rows")

            if not movement_rows:
                empty_page_count += 1
                print(f"No movements found on page {page} (Empty page count: {empty_page_count})")
                if empty_page_count >= 3:
                    print("Found 3 consecutive empty pages, ending scrape")
                    break
                page += 1
                continue
            else:
                empty_page_count = 0

            movements_added = 0
            for row in movement_rows:
                movement_data = self.parse_movement(row)
                if movement_data:
                    self.movements.append(movement_data)
                    movements_added += 1
                    self.save_progress()

            print(f"Added {movements_added} movements from page {page}")
            print(f"Total movements scraped: {len(self.movements)}")

            page += 1
            delay = random.uniform(15, 20)  # Increased delay between pages
            print(f"Waiting {delay:.1f} seconds before next page...")
            time.sleep(delay)

    def save_progress(self, filename='movements_progress.json'):
        """Save progress to file"""
        with open(filename, 'w') as f:
            json.dump(self.movements, f)
        print(f"Progress saved to {filename}")

    def load_progress(self, filename='movements_progress.json'):
        """Load progress from file"""
        try:
            with open(filename, 'r') as f:
                self.movements = json.load(f)
            print(f"Loaded {len(self.movements)} movements from {filename}")
        except FileNotFoundError:
            print("No progress file found, starting fresh")

    def save_to_csv(self, filename='movements.csv'):
        """Save movements to CSV file"""
        if not self.movements:
            print("No movements to save!")
            return

        df = pd.DataFrame(self.movements)
        df.to_csv(filename, index=False)
        print(f"\nSaved {len(self.movements)} movements to {filename}")

        # Print statistics
        for column in df.columns:
            non_empty = df[column].notna().sum()
            print(f"{column}: {non_empty}/{len(df)} rows have data ({(non_empty/len(df)*100):.1f}%)")

def main():
    scraper = MovementScraper()
    try:
        print("Starting movement library scrape...")
        # Starting fresh - not loading previous progress
        scraper.scrape_movements()
        scraper.save_to_csv()

    except KeyboardInterrupt:
        print("\nScraping interrupted by user!")
        scraper.save_to_csv('movements_partial.csv')
        print("Partial results saved!")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        scraper.save_to_csv('movements_error.csv')
        print("Partial results saved due to error!")

if __name__ == "__main__":
    main()

Starting movement library scrape...

Processing page 0
Fetching: https://web.archive.org/web/20180202184709/http://gymnasticswod.com/library

Found 10 potential movement rows

Found movement: Back Leg Swing
Fetching: https://web.archive.org/web/20180202215701/http://gymnasticswod.com/content/back-leg-swing
Found 4 iframes
Found small video source: https://web.archive.org/web/20180202184709if_/http://gymnasticswod.com/content/s00019?size=small
Successfully parsed movement: Back Leg Swing
Progress saved to movements_progress.json

Found movement: Back Lever
Fetching: https://web.archive.org/web/20180202215701/http://gymnasticswod.com/content/back-lever
Found 7 iframes
Found small video source: https://web.archive.org/web/20171107053739if_/http://gymnasticswod.com/content/a00010?size=small
Successfully parsed movement: Back Lever
Progress saved to movements_progress.json

Found movement: Back Tuck
Fetching: https://web.archive.org/web/20180202215701/http://gymnasticswod.com/content/back-t

In [4]:
## script for the full set of movements

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
from urllib.parse import urljoin

class MovementScraper:
    def __init__(self):
        self.wayback_base = "https://web.archive.org"
        self.base_url = f"{self.wayback_base}/web/20180202184709/http://gymnasticswod.com/library"
        self.movements = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive'
        }
        self.session = requests.Session()
        self.last_request_time = 0
        self.min_request_delay = 15

    def wait_between_requests(self):
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_request_delay:
            wait_time = self.min_request_delay - elapsed + random.uniform(1, 5)
            time.sleep(wait_time)
        self.last_request_time = time.time()

    def get_page_content(self, url, retry_count=5, initial_delay=30):
        if not url.startswith('http'):
            url = urljoin(self.wayback_base, url)

        for attempt in range(retry_count):
            try:
                self.wait_between_requests()
                print(f"Fetching: {url}")
                response = self.session.get(url, headers=self.headers, timeout=30)

                if response.status_code == 404:
                    print("Page not found (404) - reached the end of available pages")
                    return None

                if response.status_code == 429:  # Too Many Requests
                    delay = initial_delay * (attempt + 1)
                    print(f"Rate limited. Waiting {delay} seconds...")
                    time.sleep(delay)
                    continue

                response.raise_for_status()
                if response.status_code == 200:
                    return response.text

            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt + 1 == retry_count:
                    print(f"Failed to fetch {url}")
                    return None

                delay = initial_delay * (2 ** attempt)
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)

        return None

    def extract_video_source(self, movement_url):
        content = self.get_page_content(movement_url)
        if not content:
            return ''

        soup = BeautifulSoup(content, 'html.parser')

        iframes = soup.find_all('iframe')
        print(f"Found {len(iframes)} iframes")

        for iframe in iframes:
            src = iframe.get('src', '')
            if not src or 'donate' in src:  # Skip donation iframe
                continue

            if not src.startswith('http'):
                src = urljoin(self.wayback_base, src)

            if 'size=small' in src:
                print(f"Found small video source: {src}")
                return src
            elif 'size=medium' in src:
                print(f"Found medium video source: {src}")
                return src

        return ''

    def parse_metadata(self, metadata_text):
        metadata = {
            'category': '',
            'apparatus': '',
            'movement_type': ''
        }

        if not metadata_text:
            return metadata

        try:
            parts = metadata_text.split('|')
            for part in parts:
                part = part.strip()
                if 'Category:' in part:
                    metadata['category'] = part.split('Category:')[1].strip()
                elif 'Apparatus:' in part:
                    metadata['apparatus'] = part.split('Apparatus:')[1].strip()
                elif 'Movement:' in part:
                    metadata['movement_type'] = part.split('Movement:')[1].strip()
        except Exception as e:
            print(f"Error parsing metadata: {str(e)}")

        return metadata

    def parse_movement(self, movement_row):
        try:
            # Look for links that are direct children or within movement title containers
            title = movement_row.find(['h2', 'h3'], class_=lambda x: x and 'title' in str(x))
            if title:
                title = title.find('a')
            if not title:
                title = movement_row.find('a', class_=lambda x: x is None or 'category' not in str(x))

            if not title:
                return None

            # Skip category links and non-movement content
            if any(word in title.text.lower() for word in ['category:', 'apparatus:', 'movement:', 'skill', 'strength']):
                return None

            movement_title = title.text.strip()
            movement_url = title['href']
            print(f"\nFound movement: {movement_title}")

            # Extract instructions
            instructions = []
            instruction_list = movement_row.find('ul')
            if instruction_list:
                instructions = [li.text.strip() for li in instruction_list.find_all('li')]
            else:
                # Look for instructions in field-items
                field_items = movement_row.find_all('div', class_='field-item')
                for item in field_items:
                    text = item.get_text(strip=True)
                    if text and not any(keyword in text.lower() for keyword in ['category:', 'apparatus:', 'movement:']):
                        instructions.append(text)

            # Get metadata from specific div
            metadata_div = movement_row.find(lambda tag: tag.name == 'div' and
                                          tag.text and
                                          ('Category:' in tag.text or
                                           'Apparatus:' in tag.text or
                                           'Movement:' in tag.text))

            metadata = self.parse_metadata(metadata_div.text if metadata_div else '')

            # Get video source
            full_url = urljoin(self.wayback_base, movement_url)
            video_src = self.extract_video_source(full_url)

            movement_data = {
                'title': movement_title,
                'instructions': '\n'.join(instructions),
                'movement_url': movement_url,
                'category': metadata['category'],
                'apparatus': metadata['apparatus'],
                'movement_type': metadata['movement_type'],
                'video_src': video_src
            }

            print(f"Successfully parsed movement: {movement_title}")
            return movement_data

        except Exception as e:
            print(f"Error parsing movement: {str(e)}")
            return None

    def scrape_movements(self, max_pages=None, start_page=0):
        page = start_page
        empty_page_count = 0

        while True:
            if max_pages is not None and page >= start_page + max_pages:
                print(f"Reached maximum number of pages ({max_pages})")
                break

            url = self.base_url
            if page > 0:
                url += f"?&&&sort_by=title&sort_order=ASC&items_per_page=10&page={page}"

            print(f"\nProcessing page {page}")
            content = self.get_page_content(url)

            if not content:
                break

            soup = BeautifulSoup(content, 'html.parser')

            # More specific movement detection
            movement_rows = soup.find_all(['div', 'li'],
                class_=lambda x: x and ('node-exercise' in x or 'views-row' in x))

            print(f"\nFound {len(movement_rows)} potential movement rows")

            if not movement_rows:
                empty_page_count += 1
                print(f"No movements found on page {page} (Empty page count: {empty_page_count})")
                if empty_page_count >= 3:
                    print("Found 3 consecutive empty pages, ending scrape")
                    break
                page += 1
                continue
            else:
                empty_page_count = 0

            movements_added = 0
            for row in movement_rows:
                movement_data = self.parse_movement(row)
                if movement_data:
                    self.movements.append(movement_data)
                    movements_added += 1
                    self.save_progress()

            print(f"Added {movements_added} movements from page {page}")
            print(f"Total movements scraped: {len(self.movements)}")

            page += 1
            delay = random.uniform(15, 20)
            print(f"Waiting {delay:.1f} seconds before next page...")
            time.sleep(delay)

    def save_progress(self, filename='movements_progress.json'):
        with open(filename, 'w') as f:
            json.dump(self.movements, f)
        print(f"Progress saved to {filename}")

    def load_progress(self, filename='movements_progress.json'):
        try:
            with open(filename, 'r') as f:
                self.movements = json.load(f)
            print(f"Loaded {len(self.movements)} movements from {filename}")
        except FileNotFoundError:
            print("No progress file found, starting fresh")

    def save_to_csv(self, filename='movements.csv'):
        if not self.movements:
            print("No movements to save!")
            return

        df = pd.DataFrame(self.movements)
        df.to_csv(filename, index=False)
        print(f"\nSaved {len(self.movements)} movements to {filename}")

        for column in df.columns:
            non_empty = df[column].notna().sum()
            print(f"{column}: {non_empty}/{len(df)} rows have data ({(non_empty/len(df)*100):.1f}%)")

def main():
    scraper = MovementScraper()
    try:
        print("Starting movement library scrape...")
        scraper.scrape_movements()
        scraper.save_to_csv()

    except KeyboardInterrupt:
        print("\nScraping interrupted by user!")
        scraper.save_to_csv('movements_partial.csv')
        print("Partial results saved!")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        scraper.save_to_csv('movements_error.csv')
        print("Partial results saved due to error!")

if __name__ == "__main__":
    main()

Starting movement library scrape...

Processing page 0
Fetching: https://web.archive.org/web/20180202184709/http://gymnasticswod.com/library

Found 20 potential movement rows

Found movement: Back Leg Swing
Fetching: https://web.archive.org/web/20180202215701/http://gymnasticswod.com/content/back-leg-swing
Found 4 iframes
Found small video source: https://web.archive.org/web/20180202184709if_/http://gymnasticswod.com/content/s00019?size=small
Successfully parsed movement: Back Leg Swing
Progress saved to movements_progress.json

Found movement: Back Leg Swing
Fetching: https://web.archive.org/web/20180202215701/http://gymnasticswod.com/content/back-leg-swing
Found 4 iframes
Found small video source: https://web.archive.org/web/20180202184709if_/http://gymnasticswod.com/content/s00019?size=small
Successfully parsed movement: Back Leg Swing
Progress saved to movements_progress.json

Found movement: Back Lever
Fetching: https://web.archive.org/web/20180202215701/http://gymnasticswod.com/c

In [5]:
# some data analysis
import pandas as pd
import re

def analyze_csv():
    # Read the CSV file
    df = pd.read_csv('movements.csv')

    print("Data Quality Analysis:")
    print("-" * 50)

    # Basic stats
    print(f"\nTotal movements: {len(df)}")

    # Check categories
    print("\nCategories:")
    print(df['category'].value_counts())

    # Check apparatus
    print("\nApparatus:")
    print(df['apparatus'].value_counts())

    # Check video sources
    print("\nVideo source analysis:")
    video_urls = df['video_src'].dropna()
    print(f"Number of video sources: {len(video_urls)}")

    # Check video URL format
    valid_format = video_urls.str.contains(r'gymnasticswod\.com/content/s\d+\?size=', case=False)
    print(f"Valid URL format: {valid_format.sum()}/{len(video_urls)}")

    # Sample of data
    print("\nSample entries (first 3):")
    for idx, row in df.head(3).iterrows():
        print(f"\nMovement {idx + 1}:")
        print(f"Title: {row['title']}")
        print(f"Category: {row['category']}")
        print(f"Apparatus: {row['apparatus']}")
        print(f"Instructions: {row['instructions'][:100]}...")
        print(f"Video: {row['video_src']}")

if __name__ == "__main__":
    analyze_csv()

Data Quality Analysis:
--------------------------------------------------

Total movements: 344

Categories:
category
Functional         132
Strength           112
Skill               78
SkillStrength       12
FunctionalSkill     10
Name: count, dtype: int64

Apparatus:
apparatus
Floor                   148
Rings                    64
High Bar                 44
P-Bars                   32
Dumbbells                20
Rope                     12
Pommel HorseMushroom      8
Jump Rope                 6
Box                       4
U-BarsHigh Bar            2
Male                      2
Name: count, dtype: int64

Video source analysis:
Number of video sources: 320
Valid URL format: 38/320

Sample entries (first 3):

Movement 1:
Title: Back Leg Swing
Category: Functional
Apparatus: Floor
Instructions: Stand with one leg straight under your hip and hold onto the wall or a low bar for balance
Keep your...
Video: https://web.archive.org/web/20180202184709if_/http://gymnasticswod.com/content/s00

In [None]:
import pandas as pd
import json

# Read the CSV file
df = pd.read_csv('movements.csv')

# Display the columns
print("Columns in the dataset:")
print(df.columns)
print("\nSample movement entry (first row):")
print(df.iloc[0].to_dict())

# Check which columns have data and which are empty
print("\nColumn completeness:")
for column in df.columns:
    non_empty = df[column].notna().sum()
    print(f"{column}: {non_empty}/{len(df)} rows have data ({(non_empty/len(df)*100):.1f}%)")

# Show a few example instructions
print("\nSample movement instructions:")
print(df[['title', 'instructions']].head(3))

Columns in the dataset:
Index(['title', 'instructions', 'movement_url', 'category', 'apparatus',
       'movement_type', 'video_src_small', 'video_src_medium',
       'video_src_large'],
      dtype='object')

Sample movement entry (first row):
{'title': 'Back Leg Swing', 'instructions': 'Stand with one leg straight under your hip and hold onto the wall or a low bar for balance\nKeep your legs straight, belly tight, and butt squeezed as you kick your other leg back and up behind you\nDrive your heel as high as you can while maintaining a neutral spine and straight leg\nYou should be hinging only at the hip, do not bend the legs while swinging and try to point your toe', 'movement_url': '/web/20180204210343/http://gymnasticswod.com/content/back-leg-swing', 'category': 'Functional', 'apparatus': 'Floor', 'movement_type': 'Leg Swing', 'video_src_small': nan, 'video_src_medium': nan, 'video_src_large': nan}

Column completeness:
title: 210/210 rows have data (100.0%)
instructions: 135/210 

In [None]:
pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib



In [6]:
### code used to import to Google sheets
from google.colab import auth
import pandas as pd
from googleapiclient.discovery import build
from google.auth import default
import numpy as np

def clean_data(df):
    """Clean data for Google Sheets export"""
    # Convert NaN to empty strings
    df = df.replace({np.nan: ''})

    # Clean strings - remove problematic characters
    for column in df.columns:
        if df[column].dtype == 'object':  # If column contains strings
            df[column] = df[column].str.replace('\n', ' ')  # Replace newlines with spaces
            df[column] = df[column].str.replace('\r', '')   # Remove carriage returns

    return df

def upload_to_sheets():
    # Authenticate
    auth.authenticate_user()

    # Get credentials
    creds, _ = default()

    # Read and clean the CSV
    df = pd.read_csv('movements.csv')
    df = clean_data(df)

    # Create the Sheets API service
    service = build('sheets', 'v4', credentials=creds)

    # Create new spreadsheet
    spreadsheet = {
        'properties': {
            'title': 'Gymnastics Movements Library'
        }
    }
    spreadsheet = service.spreadsheets().create(body=spreadsheet).execute()
    spreadsheet_id = spreadsheet['spreadsheetId']

    # Prepare the data
    values = [df.columns.tolist()]  # Header row

    # Convert DataFrame to list of lists, ensuring all values are strings
    data_values = df.astype(str).values.tolist()
    values.extend(data_values)

    # Update the spreadsheet
    body = {
        'values': values
    }

    # Calculate the range based on data size
    num_rows = len(values)
    num_cols = len(values[0])
    last_col = chr(ord('A') + num_cols - 1)  # Convert column number to letter
    range_name = f'Sheet1!A1:{last_col}{num_rows}'

    service.spreadsheets().values().update(
        spreadsheetId=spreadsheet_id,
        range=range_name,
        valueInputOption='RAW',
        body=body
    ).execute()

    # Format the spreadsheet
    requests = [
        # Format header row
        {
            "repeatCell": {
                "range": {
                    "sheetId": 0,
                    "startRowIndex": 0,
                    "endRowIndex": 1
                },
                "cell": {
                    "userEnteredFormat": {
                        "backgroundColor": {"red": 0.8, "green": 0.8, "blue": 0.8},
                        "textFormat": {"bold": True}
                    }
                },
                "fields": "userEnteredFormat(backgroundColor,textFormat)"
            }
        },
        # Freeze header row
        {
            "updateSheetProperties": {
                "properties": {
                    "sheetId": 0,
                    "gridProperties": {
                        "frozenRowCount": 1
                    }
                },
                "fields": "gridProperties.frozenRowCount"
            }
        }
    ]

    service.spreadsheets().batchUpdate(
        spreadsheetId=spreadsheet_id,
        body={"requests": requests}
    ).execute()

    print(f"\nSpreadsheet created successfully!")
    print(f"URL: https://docs.google.com/spreadsheets/d/{spreadsheet_id}")
    return spreadsheet_id

# Run the function
try:
    spreadsheet_id = upload_to_sheets()
except Exception as e:
    print(f"Error: {str(e)}")
    # Load and display the first few rows of data for debugging
    df = pd.read_csv('movements.csv')
    print("\nFirst few rows of data:")
    print(df.head())


Spreadsheet created successfully!
URL: https://docs.google.com/spreadsheets/d/1--n2cwFA77sVk45rZ8l6lPHjvcfaewV7FkQp_ym44jk


In [None]:
import pandas as pd
import numpy as np

def analyze_movement_data():
    # Read the CSV file
    df = pd.read_csv('movements.csv')

    # Basic cleanup
    # Replace NaN with empty strings for text fields
    text_columns = ['title', 'instructions', 'movement_url', 'category', 'apparatus', 'movement_type']
    df[text_columns] = df[text_columns].fillna('')

    # Create separate dataframes for different views

    # 1. Main movement data
    movement_info = df[['title', 'category', 'apparatus', 'movement_type', 'instructions']]
    movement_info.to_csv('movement_info.csv', index=False)

    # 2. Video sources
    video_sources = df[['title', 'video_src_small', 'video_src_medium', 'video_src_large']]
    # Add a column indicating if any video source is available
    video_sources['has_video'] = video_sources[['video_src_small', 'video_src_medium', 'video_src_large']].notna().any(axis=1)
    video_sources.to_csv('video_sources.csv', index=False)

    # Print analysis
    print("\nData Analysis:")
    print("-" * 50)
    print("\nCategories:")
    print(df['category'].value_counts())

    print("\nApparatus:")
    print(df['apparatus'].value_counts())

    print("\nMovements with videos:")
    print(f"Total movements: {len(df)}")
    print(f"Movements with any video: {video_sources['has_video'].sum()}")
    print(f"Coverage: {(video_sources['has_video'].sum() / len(df) * 100):.1f}%")

    # Look for patterns in missing videos
    missing_videos = df[~video_sources['has_video']]
    print("\nCategories of movements missing videos:")
    print(missing_videos['category'].value_counts())

    # Export Google Sheets ready version
    # Combine most relevant fields
    sheets_df = df[[
        'title',
        'category',
        'apparatus',
        'movement_type',
        'instructions',
        'video_src_small',  # Include all video sources so you can choose which to use
        'video_src_medium',
        'video_src_large'
    ]].copy()

    # Sort by category and title for better organization
    sheets_df = sheets_df.sort_values(['category', 'title'])

    # Export to CSV
    sheets_df.to_csv('movements_for_sheets.csv', index=False)

    print("\nFiles created:")
    print("1. movement_info.csv - Basic movement information")
    print("2. video_sources.csv - Video source URLs")
    print("3. movements_for_sheets.csv - Formatted for Google Sheets import")

if __name__ == "__main__":
    analyze_movement_data()


Data Analysis:
--------------------------------------------------

Categories:
category
Functional            112
Skill                  70
Strength               37
FunctionalSkill        10
FunctionalStrength      1
Name: count, dtype: int64

Apparatus:
apparatus
Floor                   109
High Bar                 37
Rings                    33
P-Bars                   23
Box                      16
Dumbbells                 6
Pommel HorseMushroom      3
Jump Rope                 2
U-BarsHigh Bar            1
Name: count, dtype: int64

Movements with videos:
Total movements: 230
Movements with any video: 20
Coverage: 8.7%

Categories of movements missing videos:
category
Functional            101
Skill                  64
Strength               35
FunctionalSkill         9
FunctionalStrength      1
Name: count, dtype: int64

Files created:
1. movement_info.csv - Basic movement information
2. video_sources.csv - Video source URLs
3. movements_for_sheets.csv - Formatted for Google Sh

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  video_sources['has_video'] = video_sources[['video_src_small', 'video_src_medium', 'video_src_large']].notna().any(axis=1)


# Progressions

In [3]:
# script for progressions

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
import re
from urllib.parse import urljoin

class ProgressionScraper:
    def __init__(self):
        self.wayback_base = "https://web.archive.org"
        self.base_url = f"{self.wayback_base}/web/20130817171412/http://gymnasticswod.com/progressions"
        self.progressions = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive'
        }
        self.session = requests.Session()
        self.last_request_time = 0
        self.min_request_delay = 10  # Increased for full scrape
        self.processed_urls = set()

    def clean_text(self, text):
        """Clean text by removing navigation and pagination content"""
        # Remove common navigation patterns
        text = text.replace('Pages', '')
        text = text.replace('next ›', '')
        text = text.replace('last »', '')
        text = text.replace('« first', '')
        text = text.replace('‹ previous', '')
        text = text.replace('> view more progressions', '')

        # Remove page numbers
        text = re.sub(r'\d+\.\.\.', '', text)
        text = re.sub(r'\d+', '', text)

        # Clean up multiple spaces and line breaks
        text = ' '.join(text.split())
        return text

    def wait_between_requests(self):
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_request_delay:
            wait_time = self.min_request_delay - elapsed + random.uniform(2, 5)
            time.sleep(wait_time)
        self.last_request_time = time.time()

    def get_page_content(self, url, retry_count=5, initial_delay=30):  # Increased retries and delay
        if not url.startswith('http'):
            url = urljoin(self.wayback_base, url)
        for attempt in range(retry_count):
            try:
                self.wait_between_requests()
                print(f"Fetching: {url}")
                response = self.session.get(url, headers=self.headers, timeout=30)
                if response.status_code == 404:
                    print("Page not found (404)")
                    return None
                if response.status_code == 429:
                    delay = initial_delay * (attempt + 1)
                    print(f"Rate limited. Waiting {delay} seconds...")
                    time.sleep(delay)
                    continue
                response.raise_for_status()
                if response.status_code == 200:
                    return response.text
            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt + 1 == retry_count:
                    print(f"Failed to fetch {url}")
                    return None
                delay = initial_delay * (2 ** attempt)
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)
        return None

    def parse_progression_rows(self, soup):
        progression_rows = soup.find_all('article', class_='node-progression')
        if not progression_rows:
            progression_rows = soup.find_all('div', class_=lambda x: x and 'post' in str(x).lower())
            if not progression_rows:
                progression_rows = soup.find_all('div', class_='art-post')
        return progression_rows

    def extract_video_source(self, progression_url):
        content = self.get_page_content(progression_url)
        if not content:
            return ''
        soup = BeautifulSoup(content, 'html.parser')
        iframes = soup.find_all('iframe')
        print(f"Found {len(iframes)} iframes")
        for iframe in iframes:
            src = iframe.get('src', '')
            if not src or 'donate' in src:
                continue
            if not src.startswith('http'):
                src = urljoin(self.wayback_base, src)
            if 'size=small' in src:
                print(f"Found small video source: {src}")
                return src
            elif 'size=medium' in src:
                print(f"Found medium video source: {src}")
                return src
        return ''

    def parse_metadata(self, text):
        """Enhanced metadata parser with better text cleaning"""
        metadata = {
            'category': 'skill',
            'type': '',
            'apparatus': '',
            'movement': ''
        }

        try:
            text = self.clean_text(text)

            if '|' in text:
                parts = [p.strip() for p in text.split('|')]
                for part in parts:
                    for key in ['Category:', 'Type:', 'Apparatus:', 'Movement:']:
                        if key.lower() in part.lower():
                            value = part.split(key)[1].strip()
                            value = self.clean_text(value)
                            value = re.sub(r'(?<!^)(?=[A-Z])', ' ', value).strip()
                            metadata_key = key.lower().rstrip(':')
                            metadata[metadata_key] = value

            if metadata['category']:
                category = metadata['category'].lower().replace('progressions', '').strip()
                if category in {'skill', 'strength', 'functional'}:
                    metadata['category'] = category
                else:
                    text_lower = text.lower()
                    if any(word in text_lower for word in ['strength', 'power', 'conditioning']):
                        metadata['category'] = 'strength'
                    elif any(word in text_lower for word in ['skill', 'technique']):
                        metadata['category'] = 'skill'
                    elif any(word in text_lower for word in ['functional', 'movement', 'mobility']):
                        metadata['category'] = 'functional'
                    else:
                        metadata['category'] = 'skill'

        except Exception as e:
            print(f"Error parsing metadata text: {text}")
            print(f"Error: {str(e)}")

        return metadata

    def parse_progression(self, progression_row):
        try:
            title_h2 = progression_row.find(['h1', 'h2', 'h3'],
                class_=lambda x: x and ('postheader' in str(x).lower() or 'title' in str(x).lower()))

            if title_h2:
                title_link = title_h2.find('a')
            else:
                title_link = progression_row.find('a', class_=lambda x: x and 'title' in str(x).lower())

            if not title_link:
                return None

            progression_title = title_link.text.strip()
            progression_url = title_link['href']

            if progression_url in self.processed_urls:
                print(f"Skipping duplicate progression: {progression_title}")
                return None

            self.processed_urls.add(progression_url)
            print(f"\nFound progression: {progression_title}")

            content_div = progression_row.find('div', class_=lambda x: x and
                ('postcontent' in str(x).lower() or 'content' in str(x).lower()))

            metadata = {'category': 'skill', 'type': '', 'apparatus': '', 'movement': ''}
            instructions = []

            if content_div:
                metadata_p = content_div.find('p', text=lambda x: x and ('Category:' in x or 'Type:' in x))
                if metadata_p:
                    metadata = self.parse_metadata(metadata_p.get_text())
                    metadata_p.decompose()

                for p in content_div.find_all(['p']):
                    text = self.clean_text(p.get_text(strip=True))
                    if text and not any(key in text for key in ['Category:', 'Type:', 'Apparatus:', 'Movement:']):
                        instructions.append(text)

            full_url = urljoin(self.wayback_base, progression_url)
            video_src = self.extract_video_source(full_url)

            print(f"Extracted metadata: {metadata}")
            print(f"Instructions length: {len(instructions)}")

            progression_data = {
                'title': progression_title,
                'instructions': '\n'.join(instructions),
                'movement_url': progression_url,
                'category': metadata['category'],
                'type': metadata['type'],
                'apparatus': metadata['apparatus'],
                'movement': metadata['movement'],
                'video_src': video_src
            }

            print(f"Progression category: {progression_data['category']}")
            return progression_data

        except Exception as e:
            print(f"Error parsing progression: {str(e)}")
            return None

    def scrape_progressions(self, max_pages=None, start_page=0):  # Changed max_pages to None for full scrape
        page = start_page
        wanted_categories = {'strength', 'skill', 'functional'}

        while True:
            if max_pages is not None and page >= start_page + max_pages:
                print(f"Reached maximum number of pages ({max_pages})")
                break

            url = self.base_url
            if page > 0:
                url += f"?&&&sort_by=title&sort_order=ASC&items_per_page=10&page={page}"

            print(f"\nProcessing progressions page {page}")
            content = self.get_page_content(url)

            if not content:
                break

            soup = BeautifulSoup(content, 'html.parser')
            progression_rows = self.parse_progression_rows(soup)

            print(f"\nFound {len(progression_rows)} potential progression rows")

            if not progression_rows:
                print(f"No progressions found on page {page}")
                break

            progressions_added = 0
            for row in progression_rows:
                progression_data = self.parse_progression(row)
                if progression_data:
                    categories = [cat.strip().lower() for cat in progression_data['category'].split(',')]
                    if any(wanted in categories for wanted in wanted_categories):
                        self.progressions.append(progression_data)
                        progressions_added += 1
                        print(f"Added progression: {progression_data['title']}")
                    else:
                        print(f"Skipping progression with categories: {categories}")

            self.save_progress()
            print(f"Added {progressions_added} progressions from page {page}")
            print(f"Total items scraped: {len(self.progressions)}")

            page += 1
            delay = random.uniform(10, 15)  # Increased delay
            print(f"Waiting {delay:.1f} seconds before next page...")
            time.sleep(delay)

    def save_progress(self, filename='progressions_progress.json'):
        with open(filename, 'w') as f:
            json.dump(self.progressions, f)
        print(f"Progress saved to {filename}")

    def load_progress(self, filename='progressions_progress.json'):
        try:
            with open(filename, 'r') as f:
                self.progressions = json.load(f)
            print(f"Loaded {len(self.progressions)} progressions from {filename}")
        except FileNotFoundError:
            print("No progress file found, starting fresh")

    def save_to_csv(self, filename='progressions.csv'):
        if not self.progressions:
            print("No progressions to save!")
            return
        df = pd.DataFrame(self.progressions)
        df.to_csv(filename, index=False)
        print(f"\nSaved {len(self.progressions)} progressions to {filename}")
        for column in df.columns:
            non_empty = df[column].notna().sum()
            print(f"{column}: {non_empty}/{len(df)} rows have data ({(non_empty/len(df)*100):.1f}%)")

def main():
    scraper = ProgressionScraper()
    try:
        print("Starting progressions scrape...")
        # Load any existing progress
        scraper.load_progress()
        # Run full scrape with no page limit
        scraper.scrape_progressions(max_pages=None)
        scraper.save_to_csv()
    except KeyboardInterrupt:
        print("\nScraping interrupted by user!")
        scraper.save_to_csv('progressions_partial.csv')
        print("Partial results saved!")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        scraper.save_to_csv('progressions_error.csv')
        print("Partial results saved due to error!")

if __name__ == "__main__":
    main()

Starting progressions scrape...
No progress file found, starting fresh

Processing progressions page 0
Fetching: https://web.archive.org/web/20130817171412/http://gymnasticswod.com/progressions

Found 153 potential progression rows

Found progression: Air Squat Progression Pt.1


  metadata_p = content_div.find('p', text=lambda x: x and ('Category:' in x or 'Type:' in x))


Fetching: https://web.archive.org/web/20130825070155/http://gymnasticswod.com/content/air-squat-progression-pt1
Found 4 iframes
Found small video source: https://web.archive.org/web/20130824210306if_/http://gymnasticswod.com/content/SQ00022?size=small
Extracted metadata: {'category': 'skill', 'type': '', 'apparatus': '', 'movement': ''}
Instructions length: 14
Progression category: skill
Added progression: Air Squat Progression Pt.1
Skipping duplicate progression: Air Squat Progression Pt.1
Skipping duplicate progression: Air Squat Progression Pt.1
Skipping duplicate progression: Air Squat Progression Pt.1
Skipping duplicate progression: Air Squat Progression Pt.1
Skipping duplicate progression: Air Squat Progression Pt.1
Skipping duplicate progression: Air Squat Progression Pt.1

Found progression: Air Squat Progression Pt.2
Fetching: https://web.archive.org/web/20130825070155/http://gymnasticswod.com/content/air-squat-progression-pt2
Found 4 iframes
Found small video source: https://

In [4]:
import pandas as pd

# Load the CSV
df = pd.read_csv('progressions.csv')

# Basic info about the dataset
print(f"Total number of progressions: {len(df)}")
print("\nColumns in the dataset:")
for column in df.columns:
    print(f"{column}: {df[column].notna().sum()} entries")

# Check categories distribution
print("\nCategory distribution:")
print(df['category'].value_counts())

# Check for duplicates
duplicates = df[df.duplicated(subset=['title'])]
print(f"\nNumber of duplicate titles: {len(duplicates)}")

Total number of progressions: 170

Columns in the dataset:
title: 170 entries
instructions: 170 entries
movement_url: 170 entries
category: 170 entries
type: 0 entries
apparatus: 0 entries
movement: 0 entries
video_src: 170 entries

Category distribution:
category
skill    170
Name: count, dtype: int64

Number of duplicate titles: 10


In [6]:
# Look at first few entries
print("\nFirst few entries:")
print(df[['title', 'category', 'video_src']].head())


First few entries:
                                  title category  \
0            Air Squat Progression Pt.1    skill   
1            Air Squat Progression Pt.2    skill   
2            Air Squat Progression Pt.3    skill   
3  Back Extension Roll Progression Pt.1    skill   
4  Back Extension Roll Progression Pt.2    skill   

                                           video_src  
0  https://web.archive.org/web/20130824210306if_/...  
1  https://web.archive.org/web/20130909034659if_/...  
2  https://web.archive.org/web/20130812145357if_/...  
3  https://web.archive.org/web/20130813070540if_/...  
4  https://web.archive.org/web/20130817205839if_/...  


In [19]:
# exporting progressions to GSheets
from google.colab import drive, auth
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from google.oauth2 import service_account
from google.colab import auth
from oauth2client.client import GoogleCredentials
import gspread
import pandas as pd
from google.auth import default

def upload_to_sheets():
    print("Setting up Google Sheets access...")
    drive.mount('/content/drive', force_remount=True)

    auth.authenticate_user()
    creds, _ = default()
    gc = gspread.authorize(creds)

    print("Reading and cleaning progressions data...")
    df = pd.read_csv('progressions.csv')

    # Create series and part information
    df['series'] = df['title'].apply(lambda x: x.split('Pt.')[0].strip() if 'Pt.' in x else x)
    df['part'] = df['title'].apply(lambda x: f"Part {x.split('Pt.')[1]}" if 'Pt.' in x else '')

    # Clean NaN values
    df['category'] = df['category'].fillna('Unspecified')
    df['type'] = df['type'].fillna('')
    df['apparatus'] = df['apparatus'].fillna('')
    df['movement'] = df['movement'].fillna('')
    df['instructions'] = df['instructions'].fillna('')

    print("\nCreating main spreadsheet...")
    workbook = gc.create('Gymnastics Progressions Database')

    print("Creating Overview sheet...")
    overview = workbook.sheet1
    overview.update_title('Overview')

    overview_data = [
        ['Gymnastics Progressions Database'],
        [''],
        ['Total Progressions:', str(len(df))],
        ['Total Unique Series:', str(len(df['series'].unique()))],
        [''],
        ['Progressions by Category'],
        ['Category', 'Count']
    ]

    category_counts = df['category'].value_counts()
    for category, count in category_counts.items():
        overview_data.append([category, str(count)])

    overview_data.extend([
        [''],
        ['Series with Multiple Parts'],
        ['Series', 'Number of Parts']
    ])

    series_counts = df['series'].value_counts()
    multi_part_series = series_counts[series_counts > 1]
    for series, count in multi_part_series.items():
        overview_data.append([series, str(count)])

    print("Updating Overview sheet...")
    overview.update('A1', overview_data)

    print("\nCreating All Progressions sheet...")
    all_progressions = workbook.add_worksheet(title='All Progressions', rows=len(df)+1, cols=9)

    # Prepare headers and data
    headers = ['Series', 'Part', 'Category', 'Type', 'Apparatus', 'Movement', 'Instructions', 'Video Link', 'Movement Link']

    # Create hyperlinks for videos and movement URLs
    df['video_link'] = df['video_src'].apply(lambda x: f'{x}' if pd.notna(x) and x != '' else '')
    df['movement_link'] = df['movement_url'].apply(lambda x: f'{x}, "View Details")' if pd.notna(x) and x != '' else '')

    # Convert DataFrame to list of lists
    all_data = [headers]
    all_data.extend(df[['series', 'part', 'category', 'type', 'apparatus', 'movement', 'instructions', 'video_link', 'movement_link']].values.tolist())

    print(f"Uploading data to All Progressions sheet...")
    all_progressions.update('A1', all_data)

    # Create category sheets
    for category in df['category'].unique():
        print(f"\nCreating sheet for {category}...")
        category_df = df[df['category'] == category]

        sheet_name = category.replace(' ', '_')[:100]  # Sheet names have character limits
        category_sheet = workbook.add_worksheet(title=sheet_name, rows=len(category_df)+1, cols=8)

        # Prepare category data
        category_data = [['Series', 'Part', 'Type', 'Apparatus', 'Movement', 'Instructions', 'Video', 'Details']]
        category_data.extend(category_df[['series', 'part', 'type', 'apparatus', 'movement', 'instructions', 'video_link', 'movement_link']].values.tolist())

        print(f"Uploading data to {category} sheet...")
        category_sheet.update('A1', category_data)

    # Format the sheets
    print("\nApplying formatting...")
    for worksheet in workbook.worksheets():
        worksheet.freeze(1, 0)  # Freeze first row
        worksheet.format('A1:Z1', {
            "backgroundColor": {"red": 0.9, "green": 0.9, "blue": 0.9},
            "textFormat": {"bold": True}
        })

    print("\nSpreadsheet creation complete!")
    print(f"Spreadsheet URL: {workbook.url}")

    # Share the spreadsheet
    workbook.share(None, perm_type='anyone', role='reader')

    return workbook.url

try:
    print("Starting Google Sheets upload...")
    spreadsheet_url = upload_to_sheets()
    print(f"\nYour spreadsheet is available at: {spreadsheet_url}")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    traceback.print_exc()

Starting Google Sheets upload...
Setting up Google Sheets access...
Mounted at /content/drive
Reading and cleaning progressions data...

Creating main spreadsheet...
Creating Overview sheet...
Updating Overview sheet...

Creating All Progressions sheet...


  overview.update('A1', overview_data)


Uploading data to All Progressions sheet...


  all_progressions.update('A1', all_data)



Creating sheet for skill...
Uploading data to skill sheet...


  category_sheet.update('A1', category_data)



Applying formatting...

Spreadsheet creation complete!
Spreadsheet URL: https://docs.google.com/spreadsheets/d/1x6mLJEgQmwALqUk9hDmbvAfddK7DF2DjKf_TgcHWVy0

Your spreadsheet is available at: https://docs.google.com/spreadsheets/d/1x6mLJEgQmwALqUk9hDmbvAfddK7DF2DjKf_TgcHWVy0
