In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from deltalake import write_deltalake, DeltaTable
import warnings
import os
import subprocess
import shutil
warnings.filterwarnings('ignore')

In [2]:
# Make sure PyTorch and Opacus are importable, installing them on demand.
import importlib
import sys

try:
    # Probe only: the names used by the notebook are bound once, below.
    import torch
    import opacus
    import opacus.utils.batch_memory_manager
except ImportError:
    print("Installing PyTorch and Opacus.")
    # Use the kernel's own interpreter so the packages land in the environment
    # this notebook runs in, not in whichever `pip` happens to be first on PATH.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', 'torch', 'opacus', '-q'],
        check=True,
    )
    # Packages installed while the process is running may be missed by the
    # import system's cached directory listings until they are invalidated.
    importlib.invalidate_caches()

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from opacus import PrivacyEngine
from opacus.utils.batch_memory_manager import BatchMemoryManager

In [3]:
def run_command(command, description=""):
    """Run a shell command, echo what happened, and return the result.

    Args:
        command: Command line handed to the shell as a single string
            (``shell=True``), so only pass trusted, hard-coded commands.
        description: Optional heading printed between dashed rules before
            the command itself is echoed.

    Returns:
        The ``subprocess.CompletedProcess`` with ``stdout`` and ``stderr``
        captured as text. A failing command does not raise; callers that
        depend on success should check ``returncode``.
    """
    if description:
        print(f"\n{'-'*60}")
        print(f"{description}")
        print(f"{'-'*60}")
    print(f"{command}")
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.stdout:
        print(result.stdout)
    if result.returncode != 0:
        # Always surface a failure: some tools (e.g. `git commit` with nothing
        # staged) exit non-zero while writing only to stdout, which previously
        # left the failure unflagged.
        print(f"⚠️  Command failed with exit code {result.returncode}")
        if result.stderr:
            print(f"⚠️  {result.stderr}")
    return result

In [4]:
# Build the total_lift target and split the data into training and test sets.
def prepare_dataset(df):
    """Build the ``total_lift`` target and return a train/test split.

    ``total_lift`` is the row-wise sum of whichever of ``deadlift``,
    ``candj``, ``snatch`` and ``backsq`` exist in ``df``. If none exist, a
    seeded random integer target is generated instead so the rest of the
    pipeline can still run.

    Args:
        df: Input DataFrame. It is copied, so the caller's frame is not
            modified.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, numeric_cols)`` where the
        ``X`` frames hold every numeric column except ``total_lift`` and
        ``numeric_cols`` lists those column names.
    """
    df = df.copy()

    lift_columns = ['deadlift', 'candj', 'snatch', 'backsq']
    existing_lifts = [col for col in lift_columns if col in df.columns]

    if existing_lifts:
        # min_count=1 yields NaN for a row with no recorded lift at all. The
        # default NaN-skipping sum returns 0 there, which made the dropna
        # below a no-op and kept those rows with a bogus target of 0.
        df['total_lift'] = df[existing_lifts].sum(axis=1, min_count=1)
    else:
        print("No lift columns found.")
        np.random.seed(42)
        df['total_lift'] = np.random.randint(500, 2000, size=len(df))

    df = df.dropna(subset=['total_lift'])

    # Features: every numeric column other than the target.
    # NOTE(review): this still includes the individual lift columns that are
    # summed into total_lift, so the target is derivable from the features.
    # Confirm whether they should be excluded.
    numeric_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col != 'total_lift'
    ]

    X = df[numeric_cols]
    y = df['total_lift']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Impute with statistics from the training rows only, so nothing about the
    # held-out rows leaks into the training features.
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    return X_train, X_test, y_train, y_test, numeric_cols

In [5]:
# Preconditions for the DVC workflow: a Git repository, an initialized DVC
# project, and the raw dataset must all be present in the working directory.
if not os.path.exists('.git'):
    # Raise so the cell fails with a clear error; exit() inside a notebook is
    # aimed at the kernel session, not at stopping this cell cleanly.
    raise RuntimeError("Git not initialized.")


if not os.path.exists('.dvc'):
    init_result = run_command("dvc init", "Initializing DVC")
    if init_result.returncode != 0:
        # Every later `dvc add` depends on this step, so stop here.
        raise RuntimeError("dvc init failed; see the command output above.")
    run_command("git add .dvc .dvcignore", "Adding DVC files to Git")
    run_command('git commit -m "Initialize DVC"', "Committing DVC setup")
else:
    print("DVC initialized.")


if not os.path.exists('athletes.csv'):
    raise FileNotFoundError("File not found: athletes.csv")

DVC initialized.


In [6]:
# DVC workflow: print a banner, then load the raw athletes dataset.
print("-"*80, "DVC workflow.", "-"*80, sep="\n")


# Untouched source data; later dataset versions are derived from this frame.
df_original = pd.read_csv('athletes.csv')
print(f"Original Dataset: {df_original.shape}")
print(f"Columns: {list(df_original.columns)[:5]}...")

--------------------------------------------------------------------------------
DVC workflow.
--------------------------------------------------------------------------------
Original Dataset: (423006, 27)
Columns: ['athlete_id', 'name', 'region', 'team', 'affiliate']...


In [7]:
print("-"*80)
print("1. Work with given machine learning dataset, call this dataset version 1 (v1).")
print("-"*80)

# v1 is the original data, unchanged, written under its own file name so that
# DVC can track it as a distinct artifact.
df_v1 = df_original.copy()
df_v1.to_csv('athletes_v1.csv', index=False)

# All three steps run regardless of earlier failures, as before; the results
# are kept so the final message reflects what actually happened.
v1_results = [
    run_command("dvc add athletes_v1.csv", "Adding v1 to DVC"),
    run_command("git add athletes_v1.csv.dvc .gitignore", "Adding DVC tracking files to Git"),
    run_command('git commit -m "Add dataset v1 (original)"', "Committing v1"),
]

if all(step.returncode == 0 for step in v1_results):
    print("Dataset v1 created and versioned with DVC and Git.")
else:
    # A non-zero `git commit` can simply mean there was nothing new to commit
    # (e.g. on a re-run), so report it without aborting the notebook.
    print("Dataset v1 written, but at least one DVC/Git step exited non-zero; check the output above.")

--------------------------------------------------------------------------------
1. Work with given machine learning dataset, call this dataset version 1 (v1).
--------------------------------------------------------------------------------

------------------------------------------------------------
Adding v1 to DVC
------------------------------------------------------------
dvc add athletes_v1.csv

To track the changes with git, run:

	git add athletes_v1.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


------------------------------------------------------------
Adding DVC tracking files to Git
------------------------------------------------------------
git add athletes_v1.csv.dvc .gitignore

------------------------------------------------------------
Committing v1
------------------------------------------------------------
git commit -m "Add dataset v1 (original)"
On branch main
Untracked files:
  (use "git add <file>..." to include in what will be comm