# Assignment 2 — Data Version Control with DVC

**Goals (this notebook):**
1. Create `raw_data.csv` from the original dataset and track it with DVC.
2. Create an initial split (v1) and version it with DVC+Git.
3. Create an updated split (v2) using a different random seed and version it.
4. Checkout v1 and print target distributions (train/validation/test).
5. Checkout v2 and print target distributions (train/validation/test).

**Safety note:** This notebook runs `git` and `dvc` commands. If your working tree has uncommitted changes, stash or commit them first. The notebook will try to stash automatically but always keep backups.

In [1]:
# install dvc and helpers if not present.

# !pip install --upgrade pip
# !pip install dvc

# !pip install "dvc[gdrive]"

import sys
print("Python:", sys.version.splitlines()[0])

Python: 3.14.2 (tags/v3.14.2:df79316, Dec  5 2025, 17:18:21) [MSC v.1944 64 bit (AMD64)]


In [2]:
import os, subprocess, shlex, json, time
from pathlib import Path

# ------- Configure project directory -------
# Set PROJECT_DIR to the root of your repository where .git/.dvc will live.
PROJECT_DIR = '.'   
os.makedirs(PROJECT_DIR, exist_ok=True)

RAW_SRC = os.path.join(PROJECT_DIR, r'sms+spam+collection\SMSSpamCollection')  
RAW_CSV = os.path.join(PROJECT_DIR, 'raw_data.csv')
TRAIN_CSV = os.path.join(PROJECT_DIR, 'train.csv')
VAL_CSV = os.path.join(PROJECT_DIR, 'validation.csv')
TEST_CSV = os.path.join(PROJECT_DIR, 'test.csv')

print("PROJECT_DIR:", PROJECT_DIR)
print("RAW_SRC:", RAW_SRC)
print("RAW_CSV:", RAW_CSV)

# Helper to run shell commands and print output
def sh(cmd, cwd=PROJECT_DIR, check=False):
    print("$", cmd)
    res = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, cwd=cwd)
    print(res.stdout)
    if check and res.returncode != 0:
        raise RuntimeError(f"Command failed: {cmd}")
    return res

PROJECT_DIR: .
RAW_SRC: .\sms+spam+collection\SMSSpamCollection
RAW_CSV: .\raw_data.csv


In [3]:
# Create raw_data.csv from provided SMSSpamCollection (tab-separated)
import pandas as pd, os

if not os.path.exists(RAW_SRC):
    raise FileNotFoundError(f"Original dataset not found at {RAW_SRC}. Put the dataset file here or change RAW_SRC.")

df_raw = pd.read_csv(RAW_SRC, sep='\t', header=None, names=['label','message'], quoting=3, engine='python')
print("Rows loaded:", len(df_raw))
df_raw.to_csv(RAW_CSV, index=False)
print("Saved raw_data.csv ->", RAW_CSV)
display(df_raw.head())

Rows loaded: 5574
Saved raw_data.csv -> .\raw_data.csv


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Create raw_data.csv from provided SMSSpamCollection (tab-separated)
import pandas as pd, os

if not os.path.exists(RAW_SRC):
    raise FileNotFoundError(f"Original dataset not found at {RAW_SRC}. Put the dataset file here or change RAW_SRC.")

df_raw = pd.read_csv(RAW_SRC, sep='\t', header=None, names=['label','message'], quoting=3, engine='python')
print("Rows loaded:", len(df_raw))
df_raw.to_csv(RAW_CSV, index=False)
print("Saved raw_data.csv ->", RAW_CSV)
display(df_raw.head())

Rows loaded: 5574
Saved raw_data.csv -> .\raw_data.csv


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Create initial stratified splits (70/15/15) using seed V1 and dvc-add them
from sklearn.model_selection import train_test_split
import pandas as pd

RANDOM_STATE_V1 = 42
df = pd.read_csv(RAW_CSV)

train_val, test = train_test_split(df, test_size=0.15, random_state=RANDOM_STATE_V1, stratify=df['label'])
val_size_of_trainval = 0.15 / (0.7 + 0.15)
train, val = train_test_split(train_val, test_size=val_size_of_trainval, random_state=RANDOM_STATE_V1, stratify=train_val['label'])

# Save CSVs (overwrite if exist)
train.to_csv(TRAIN_CSV, index=False)
val.to_csv(VAL_CSV, index=False)
test.to_csv(TEST_CSV, index=False)
print("Saved v1 splits to:", TRAIN_CSV, VAL_CSV, TEST_CSV)
print("Train shape:", train.shape, "Val shape:", val.shape, "Test shape:", test.shape)

# Add to DVC and commit metadata to git
sh(f'dvc add "{TRAIN_CSV}"', check=True)
sh(f'dvc add "{VAL_CSV}"', check=True)
sh(f'dvc add "{TEST_CSV}"', check=True)

sh('git add train.csv.dvc validation.csv.dvc test.csv.dvc || true')
sh(f'git commit -m "DVC: add splits v1 (seed={RANDOM_STATE_V1})" || true')

v1_splits_commit = sh('git rev-parse HEAD').stdout.strip().splitlines()[-1]
with open(os.path.join(PROJECT_DIR,'dvc_v1_splits_commit.txt'),'w') as f:
    f.write(v1_splits_commit)
print("Saved v1 splits commit:", v1_splits_commit)

Saved v1 splits to: .\train.csv .\validation.csv .\test.csv
Train shape: (3901, 2) Val shape: (836, 2) Test shape: (837, 2)
$ dvc add ".\train.csv"
â ‹ Checking graph


To track the changes with git, run:

	git add .gitignore train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

$ dvc add ".\validation.csv"
â ‹ Checking graph


To track the changes with git, run:

	git add validation.csv.dvc .gitignore

To enable auto staging, run:

	dvc config core.autostage true

$ dvc add ".\test.csv"
â ‹ Checking graph


To track the changes with git, run:

	git add .gitignore test.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

$ git add train.csv.dvc validation.csv.dvc test.csv.dvc || true

$ git commit -m "DVC: add splits v1 (seed=42)" || true
[detached HEAD 21839cf] DVC: add splits v1 (seed=42)
 9 files changed, 18 insertions(+), 6 deletions(-)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 crea

In [6]:
# Create v2 split with a different seed and commit
RANDOM_STATE_V2 = 12345
df = pd.read_csv(RAW_CSV)

train_val2, test2 = train_test_split(df, test_size=0.15, random_state=RANDOM_STATE_V2, stratify=df['label'])
val_size_of_trainval = 0.15 / (0.7 + 0.15)
train2, val2 = train_test_split(train_val2, test_size=val_size_of_trainval, random_state=RANDOM_STATE_V2, stratify=train_val2['label'])

# Overwrite the same filenames (DVC will track changes)
train2.to_csv(TRAIN_CSV, index=False)
val2.to_csv(VAL_CSV, index=False)
test2.to_csv(TEST_CSV, index=False)
print("Saved v2 splits (overwriting previous CSVs) using seed", RANDOM_STATE_V2)

# dvc add + git commit the changes
sh(f'dvc add "{TRAIN_CSV}"', check=True)
sh(f'dvc add "{VAL_CSV}"', check=True)
sh(f'dvc add "{TEST_CSV}"', check=True)

sh('git add train.csv.dvc validation.csv.dvc test.csv.dvc || true')
sh(f'git commit -m "DVC: update splits v2 (seed={RANDOM_STATE_V2})" || true')

v2_splits_commit = sh('git rev-parse HEAD').stdout.strip().splitlines()[-1]
with open(os.path.join(PROJECT_DIR,'dvc_v2_splits_commit.txt'),'w') as f:
    f.write(v2_splits_commit)
print("Saved v2 splits commit:", v2_splits_commit)

Saved v2 splits (overwriting previous CSVs) using seed 12345
$ dvc add ".\train.csv"
â ‹ Checking graph


To track the changes with git, run:

	git add train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

$ dvc add ".\validation.csv"
â ‹ Checking graph


To track the changes with git, run:

	git add validation.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

$ dvc add ".\test.csv"
â ‹ Checking graph


To track the changes with git, run:

	git add test.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

$ git add train.csv.dvc validation.csv.dvc test.csv.dvc || true

$ git commit -m "DVC: update splits v2 (seed=12345)" || true
[detached HEAD e53403e] DVC: update splits v2 (seed=12345)
 3 files changed, 6 insertions(+), 6 deletions(-)

$ git rev-parse HEAD
e53403e1985f16c851155fe4a2522684bd2fa57a

Saved v2 splits commit: e53403e1985f16c851155fe4a2522684bd2fa57a


In [None]:
# Helper that safely checks out a commit, runs `dvc checkout`, and returns True/False.
# It will stash changes first if any, and pop stash afterwards.
def safe_git_checkout(commit):
    # check for working tree changes
    status = sh('git status --porcelain')
    created_stash = False
    if status.stdout.strip():
        print("Uncommitted changes detected; stashing them before checkout.")
        stash_res = sh('git stash push -u -m "auto-stash-before-notebook-checkout"')
        created_stash = True
    # checkout commit
    co = sh(f'git checkout {commit} --quiet || true')
    # dvc checkout to sync files
    dvc_co = sh('dvc checkout -q || true')
    if created_stash:
        print("Restoring stash...")
        pop = sh('git stash pop || true')
    return True



In [None]:
# Checkout v1 splits and print target distributions
with open(os.path.join(PROJECT_DIR,'dvc_v1_splits_commit.txt')) as f:
    v1_commit = f.read().strip()
print("Checking out v1 splits commit:", v1_commit)

safe_git_checkout(v1_commit)

# Load and print distributions
import pandas as pd
train_v1 = pd.read_csv(TRAIN_CSV)
val_v1 = pd.read_csv(VAL_CSV)
test_v1 = pd.read_csv(TEST_CSV)

print("Distributions (v1):")
print("train:", train_v1['label'].value_counts().to_dict())
print("validation:", val_v1['label'].value_counts().to_dict())
print("test:", test_v1['label'].value_counts().to_dict())

Checking out v1 splits commit: 21839cf562a4545900ab72574532fd6805867ac9
$ git status --porcelain
?? Assignment02/.gitignore
?? Assignment02/dvc_v1_splits_commit.txt
?? Assignment02/dvc_v2_splits_commit.txt
?? Assignment02/prepare.ipynb
?? Assignment02/raw_data.csv
?? Assignment02/sms+spam+collection/
?? Assignment02/train.ipynb

Uncommitted changes detected; stashing them before checkout.
$ git stash push -u -m "auto-stash-before-notebook-checkout"
Saved working directory and index state On (no branch): auto-stash-before-notebook-checkout

$ git checkout 21839cf562a4545900ab72574532fd6805867ac9 --quiet || true

$ dvc checkout -q || true

Restoring stash...
$ git stash pop || true
Already up to date.
HEAD detached at 21839cf
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.gitignore
	dvc_v1_splits_commit.txt
	dvc_v2_splits_commit.txt
	prepare.ipynb
	raw_data.csv
	sms+spam+collection/
	train.ipynb

nothing added to commit but untracked files present (us

In [9]:
# Checkout v2 splits and print target distributions
with open(os.path.join(PROJECT_DIR,'dvc_v2_splits_commit.txt')) as f:
    v2_commit = f.read().strip()
print("Checking out v2 splits commit:", v2_commit)
safe_git_checkout(v2_commit)

train_v2 = pd.read_csv(TRAIN_CSV)
val_v2 = pd.read_csv(VAL_CSV)
test_v2 = pd.read_csv(TEST_CSV)

print("Distributions (v2):")
print("train:", train_v2['label'].value_counts().to_dict())
print("validation:", val_v2['label'].value_counts().to_dict())
print("test:", test_v2['label'].value_counts().to_dict())

Checking out v2 splits commit: e53403e1985f16c851155fe4a2522684bd2fa57a
$ git status --porcelain
?? Assignment02/.gitignore
?? Assignment02/dvc_v1_splits_commit.txt
?? Assignment02/dvc_v2_splits_commit.txt
?? Assignment02/prepare.ipynb
?? Assignment02/raw_data.csv
?? Assignment02/sms+spam+collection/
?? Assignment02/train.ipynb

Uncommitted changes detected; stashing them before checkout.
$ git stash push -u -m "auto-stash-before-notebook-checkout"
Saved working directory and index state On (no branch): auto-stash-before-notebook-checkout

$ git checkout e53403e1985f16c851155fe4a2522684bd2fa57a --quiet || true

$ dvc checkout -q || true

Restoring stash...
$ git stash pop || true
Already up to date.
HEAD detached at e53403e
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.gitignore
	dvc_v1_splits_commit.txt
	dvc_v2_splits_commit.txt
	prepare.ipynb
	raw_data.csv
	sms+spam+collection/
	train.ipynb

nothing added to commit but untracked files present (us