In [1]:
import sys
import os

# Put the sibling '../dataversioner' directory on the import path. The relative
# path is resolved against the current working directory. The membership check
# keeps re-runs of this cell from appending duplicate entries to sys.path.
_dataversioner_dir = os.path.abspath('../dataversioner')
if _dataversioner_dir not in sys.path:
    sys.path.append(_dataversioner_dir)

# Data Versioner 
## Quick start

In [2]:
from dataversioner import DataVersioner
import pandas as pd

In [3]:
# Small 3x3 integer frame used as the running example in this walkthrough.
df = pd.DataFrame(
    {
        "a": [1, 4, 7],
        "b": [2, 5, 8],
        "c": [3, 6, 9],
    }
)
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


Create your data versioner by passing your pandas dataframe into **DataVersioner()**

In [4]:
# Wrap the frame in a versioner. The status() and commits() outputs that follow
# show it already holds one commit, 'Initial df', at this point.
dv = DataVersioner(df)

Check current commit with **status()**

In [5]:
# Shows the current commit's name, message, commit time and data.
dv.status()

'Initial df' - First commit of data
Committed at 11:03 PM on Mar 15, 2022

   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9


Review all commits with **commits()**

In [6]:
# Commit names are shown as a list; only the initial commit exists so far.
dv.commits()

['Initial df']

Perform your analysis using the **data** attribute

In [7]:
# Sum only the original value columns. A bare `dv.data.sum(axis=1)` would also
# pick up 'abc sum' once it exists, so re-running this cell would fold the old
# total into the new one; naming the columns keeps the cell idempotent.
dv.data['abc sum'] = dv.data[['a', 'b', 'c']].sum(axis=1)
dv.data

Unnamed: 0,a,b,c,abc sum
0,1,2,3,6
1,4,5,6,15
2,7,8,9,24


Checkpoint your progress by adding a commit with **commit()**

In [8]:
# Checkpoint the new column as a commit, then list the commit names to
# confirm it was recorded.
dv.commit(
    name="Row sum",
    message="Added 'sum' of a, b, c",
)
dv.commits()

['Initial df', 'Row sum']

In [9]:
# The current commit is now 'Row sum', and its data includes the new column.
dv.status()

'Row sum' - Added 'sum' of a, b, c
Committed at 11:03 PM on Mar 15, 2022

   a  b  c  abc sum
0  1  2  3        6
1  4  5  6       15
2  7  8  9       24


View any commit with **show_commit()**

In [10]:
# Look up a commit by name and show its message, commit time and data.
dv.show_commit('Initial df')

'Initial df' - First commit of data
Committed at 11:03 PM on Mar 15, 2022

   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9


View the commit tree structure with **show_commits()**

In [11]:
# Each commit is listed indented beneath the commit it was created from.
dv.show_commits()

Initial df
   - Row sum



Check out a previous commit to resume your analysis where you left off with **checkout()** 

In [12]:
# Switch back to the first commit; dv.data then shows only the original
# three columns again.
dv.checkout('Initial df')
dv.data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


To **checkout()** a commit while **data** has uncommitted changes, the parameter **allow_discard_changes** must be set to True

In [13]:
# Add a column without committing it, so dv.data now holds uncommitted changes.
dv.data['c * a'] = dv.data['c'] * dv.data['a']
# allow_discard_changes = True lets the checkout go ahead despite those changes.
dv.checkout('Initial df', allow_discard_changes = True)

Alternatively, **commit()** your changes first, then **checkout()** as usual

In [14]:
# Same derived column as in the previous cell, but committed this time so the
# next checkout needs no allow_discard_changes override.
dv.data['c * a'] = dv.data['c'] * dv.data['a']
dv.commit(
    name="c times a",
    message="Added product of c and a",
)
dv.show_commits()

Initial df
   - Row sum
   - c times a



In [15]:
# Return to the 'Row sum' commit; dv.data has the 'abc sum' column again and
# no 'c * a' column.
dv.checkout('Row sum')
dv.data

Unnamed: 0,a,b,c,abc sum
0,1,2,3,6
1,4,5,6,15
2,7,8,9,24


In [16]:
# Stack a 10x-scaled copy of the rows beneath the originals (row index
# renumbered), then checkpoint the result as a new commit.
dv.data = pd.concat(
    [dv.data, dv.data * 10],
    ignore_index=True,
)
dv.commit(
    name='Concat rows * 10',
    message='Concatenated data multiplied by 10 to data',
)
dv.data

Unnamed: 0,a,b,c,abc sum
0,1,2,3,6
1,4,5,6,15
2,7,8,9,24
3,10,20,30,60
4,40,50,60,150
5,70,80,90,240


View the commit tree together with each commit's message by setting the parameter **verbose** to True

In [17]:
# verbose = True shows each commit's message alongside its name in the tree.
dv.show_commits(verbose = True)

Initial df                        First commit of data
   - Row sum                      Added 'sum' of a, b, c
        - Concat rows * 10        Concatenated data multiplied by 10 to data
   - c times a                    Added product of c and a

