In [1]:
from tsv_data_analytics import tsvutils
from tsv_data_analytics import funclib

In [2]:
# Read the data: this is the famous iris dataset.
# In general, prefer the read() method, which accepts a tsv or a gzip-compressed tsv:
# x = tsvutils.read("/path/to/tsv-file.tsv.gz")
# or
# x = tsvutils.read("s3://bucket/path/to/tsv-file.tsv.gz")
# Here the file is fetched over http(s), so read_url() is used instead.
x = tsvutils.read_url(
    "https://github.com/CrowdStrike/tsv-data-analytics/raw/main/data/iris.tsv"
)

In [3]:
# sanity check: report how many rows were loaded
print(x.num_rows())

150


In [4]:
# preview the raw data as a pandas DataFrame, limited to 5 rows
x.export_to_df(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# rename the "class" column to "label"; the result is bound to y and used by the cells below
y = x.rename("class", "label")

In [6]:
# easy to export to a pandas DataFrame; the argument caps the number of rows exported (5 here)
y.export_to_df(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [7]:
# can also use the built-in show() method for console output.
# show() prints the table and hands back the TSV object; the trailing ';' stops
# the notebook from also echoing that object's repr (<...TSV at 0x...>) as the cell result.
y.show(5);

sepal_length	sepal_width	petal_length	petal_width	label          
5.1         	        3.5	         1.4	        0.2	Iris-setosa    
4.9         	        3.0	         1.4	        0.2	Iris-setosa    
4.7         	        3.2	         1.3	        0.2	Iris-setosa    
4.6         	        3.1	         1.5	        0.2	Iris-setosa    
5.0         	        3.6	         1.4	        0.2	Iris-setosa    


<tsv_data_analytics.tsv.TSV at 0x1235ed940>

In [8]:
# keep only the listed columns, in the order given
(
    y
    .select(["label", "sepal_width"])
    .export_to_df(5)
)

Unnamed: 0,label,sepal_width
0,Iris-setosa,3.5
1,Iris-setosa,3.0
2,Iris-setosa,3.2
3,Iris-setosa,3.1
4,Iris-setosa,3.6


In [9]:
# Apply a simple transformation: derive a new column "petal_width_inches" from "petal_width".
# funclib provides a library of helper methods; to4digit is used here on the converted value.
# 0.393701 is inches per centimeter (assumes the iris measurements are in cm).
# The lambda receives the cell value as a string, hence the float() / str() round trip.
(
    y
    .transform(
        ["petal_width"],
        lambda width: str(funclib.to4digit(float(width) * 0.393701)),
        "petal_width_inches",
    )
    .export_to_df(5)
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label,petal_width_inches
0,5.1,3.5,1.4,0.2,Iris-setosa,0.0787
1,4.9,3.0,1.4,0.2,Iris-setosa,0.0787
2,4.7,3.2,1.3,0.2,Iris-setosa,0.0787
3,4.6,3.1,1.5,0.2,Iris-setosa,0.0787
4,5.0,3.6,1.4,0.2,Iris-setosa,0.0787


In [10]:
# apis for simple numeric comparison filters: keep rows where sepal_length > 5
(
    y
    .gt("sepal_length", 5)
    .export_to_df(5)
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,5.4,3.9,1.7,0.4,Iris-setosa
2,5.4,3.7,1.5,0.2,Iris-setosa
3,5.8,4.0,1.2,0.2,Iris-setosa
4,5.7,4.4,1.5,0.4,Iris-setosa


In [11]:
# sort the rows on a specific column (or set of columns)
(
    y
    .sort("sepal_length")
    .export_to_df(5)
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,4.3,3.0,1.1,0.1,Iris-setosa
1,4.4,2.9,1.4,0.2,Iris-setosa
2,4.4,3.0,1.3,0.2,Iris-setosa
3,4.4,3.2,1.3,0.2,Iris-setosa
4,4.5,2.3,1.3,0.3,Iris-setosa


In [12]:
# lots of helper apis for sampling; sample_rows takes 'n' rows randomly from the data
# NOTE(review): the rows shown may differ between runs unless sample_rows is seeded - confirm
(
    y
    .sample_rows(5)
    .export_to_df()
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,2.5,3.0,1.1,Iris-versicolor
1,7.3,2.9,6.3,1.8,Iris-virginica
2,5.4,3.7,1.5,0.2,Iris-setosa
3,5.6,3.0,4.5,1.5,Iris-versicolor
4,7.4,2.8,6.1,1.9,Iris-virginica


In [13]:
# transpose is helpful for viewing wide datasets with lots of columns:
# each original column becomes a row, and the first 10 data rows become columns row:0..row:9
(
    y
    .transpose(10)
    .export_to_df()
)

Unnamed: 0,col_name,row:0,row:1,row:2,row:3,row:4,row:5,row:6,row:7,row:8,row:9
0,sepal_length,5.1,4.9,4.7,4.6,5.0,5.4,4.6,5.0,4.4,4.9
1,sepal_width,3.5,3.0,3.2,3.1,3.6,3.9,3.4,3.4,2.9,3.1
2,petal_length,1.4,1.4,1.3,1.5,1.4,1.7,1.4,1.5,1.4,1.5
3,petal_width,0.2,0.2,0.2,0.2,0.2,0.4,0.3,0.2,0.2,0.1
4,label,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa


In [14]:
# reorder columns for ease of display: the matching columns are moved to the front.
# regexes are supported for column names where applicable; when several columns match,
# their original relative order is maintained
(
    y
    .reorder(["petal.*"])
    .export_to_df(5)
)

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width,label
0,1.4,0.2,5.1,3.5,Iris-setosa
1,1.4,0.2,4.9,3.0,Iris-setosa
2,1.3,0.2,4.7,3.2,Iris-setosa
3,1.5,0.2,4.6,3.1,Iris-setosa
4,1.4,0.2,5.0,3.6,Iris-setosa


In [17]:
# chain several operations together, functional-programming style, to express complex logic:
# keep Iris-setosa rows, apply a numeric filter, keep rows where
# sepal_length - sepal_width >= 1, then project the sepal columns and sort by width
(
    y
    .eq_str("label", "Iris-setosa")
    .gt("sepal_length", 0.1)
    .filter(
        ["sepal_length", "sepal_width"],
        lambda length, width: float(length) - float(width) >= 1,
    )
    .select(["sepal.*"])
    .sort("sepal_width")
    .export_to_df(5)
)

Unnamed: 0,sepal_length,sepal_width
0,4.5,2.3
1,4.4,2.9
2,4.9,3.0
3,4.8,3.0
4,4.3,3.0


In [16]:
# save the data to the local file system (a relative path here) or to an s3:// location
tsvutils.save_to_file(y, "output.tsv.gz")