# 1.0 - Data Versioning - Team 34
German Credit Risk Dataset - Data Versioning

This notebook performs data versioning of the cleaned dataset. 

In [11]:
import subprocess

## Initialize DVC

In [23]:
remote_repo = 'gs://fase2-datalake'

try:
    subprocess.run(['dvc', 'version'], capture_output=True, check=True)
    print("✓ DVC is installed")
    try:
        subprocess.run(['dvc', 'init'], capture_output=True, check=True)
        subprocess.run(['dvc', 'remote', 'add', '-d', 'gcs', remote_repo], capture_output=True, check=True)
        print("✓ DVC is initialized")
    except subprocess.CalledProcessError:
            print("⚠ DVC add failed")
except FileNotFoundError:
    print("⚠ DVC not installed. Install with: pip install dvc")

✓ DVC is installed
✓ DVC is initialized


## Add dataset with DVC and push to remote repo in GCP

In [24]:
# version cleaned dataset
cleaned_path = 'data/interim/german_credit_cleaned.csv'

# Initialize DVC (if not already initialized)
try:
    subprocess.run(['dvc', 'version'], capture_output=True, check=True)
    print("✓ DVC is installed")

    # Add file to DVC
    try:
        # Add dataset
        subprocess.run(['dvc', 'add', cleaned_path], capture_output=True, check=True)
        print(f"✓ DVC tracking enabled for {cleaned_path}")
        # Commit metadata
        print("  → Run: git add data/german_credit_cleaned_v1.csv.dvc data/.gitignore")
        print("  → Run: git commit -m 'v1: cleaned dataset'")
        # Push to GCP
        subprocess.run(['dvc', 'push'], capture_output=True, check=True)
    except subprocess.CalledProcessError:
            print("⚠ DVC add failed (may already be tracked)")
except FileNotFoundError:
    print("⚠ DVC not installed. Install with: pip install dvc")

✓ DVC is installed
✓ DVC tracking enabled for data/interim/german_credit_cleaned.csv
  → Run: git add data/german_credit_cleaned_v1.csv.dvc data/.gitignore
  → Run: git commit -m 'v1: cleaned dataset'


In [30]:
!gcloud storage ls -l gs://fase2-datalake/data/interim/

     76305  2025-10-31T03:00:34Z  gs://fase2-datalake/data/interim/german_credit_cleaned.csv
TOTAL: 1 objects, 76305 bytes (74.52kiB)
