# datatable
Datatable is a Python library for manipulating tabular data. It supports out-of-memory datasets and multi-threaded data processing, and has a flexible API.
https://towardsdatascience.com/introducing-datatableton-python-datatable-tutorials-exercises-a0887f4323b0


In [2]:
#!pip install datatable

Collecting datatable
  Downloading datatable-1.0.0-cp39-cp39-win_amd64.whl (4.0 MB)
     ---------------------------------------- 4.0/4.0 MB 14.0 MB/s eta 0:00:00
Installing collected packages: datatable
Successfully installed datatable-1.0.0




In [6]:
import datatable as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Sample frame: an integer column and a column of single-character strings
data = dt.Frame({
    'v1': range(10),
    'v2': ['Y', 'O', 'U', 'C', 'A', 'N', 'D', 'O', 'I', 'T'],
})

In [5]:
# Display the sample frame (rendered as the cell's output)
data

Unnamed: 0_level_0,v1,v2
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪
0,0,Y
1,1,O
2,2,U
3,3,C
4,4,A
5,5,N
6,6,D
7,7,O
8,8,I
9,9,T


In [8]:
# pandas <-> datatable round trip: start from a plain pandas DataFrame
dframe = pd.DataFrame(
    dict(
        v1=range(11),
        v2=['N', 'E', 'V', 'E', 'R', 'G', 'I', 'V', 'E', 'U', 'P'],
    )
)
dframe


Unnamed: 0,v1,v2
0,0,N
1,1,E
2,2,V
3,3,E
4,4,R
5,5,G
6,6,I
7,7,V
8,8,E
9,9,U


In [9]:
# Convert the pandas DataFrame into a datatable Frame
data_pd = dt.Frame(dframe)
data_pd

Unnamed: 0_level_0,v1,v2
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪
0,0,N
1,1,E
2,2,V
3,3,E
4,4,R
5,5,G
6,6,I
7,7,V
8,8,E
9,9,U


In [10]:
# Convert the datatable Frame back into a pandas DataFrame
pd_data = data_pd.to_pandas()
pd_data

Unnamed: 0,v1,v2
0,0,N
1,1,E
2,2,V
3,3,E
4,4,R
5,5,G
6,6,I
7,7,V
8,8,E
9,9,U


## Select rows, filter, missing values

In [None]:
# Read the sample CSV into a datatable Frame
data = dt.fread('datatableton_sample.csv')

# Select every row, keeping only the user, product and quantity columns
data_upq = data[:, ['user', 'product', 'quantity']]

## Aggregate

In [12]:
from sklearn.datasets import load_wine
from seaborn import load_dataset

In [13]:
# Wrap the scikit-learn wine dataset (loaded as a pandas DataFrame) in a Frame
data = dt.Frame(load_wine(as_frame=True).frame)
# Per-column mean, returned as a single-row frame
data.mean()

Unnamed: 0_level_0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,13.0006,2.33635,2.36652,19.4949,99.7416,2.29511,2.02927,0.361854,1.5909,5.05809,0.957449,2.61169,746.893,0.938202


## Grouping
Aggregating metrics grouped by features
Comparing column statistics grouped by features
Combining groupings with filtering and sorting

In [14]:
# Load the seaborn penguins dataset into a datatable Frame.
data = dt.Frame(load_dataset('penguins'))
# Frame.replace works in place, so the result is not reassigned.
data.replace('NA', None)
# Median body mass for each (species, sex) combination.
data[
    :,
    dt.median(dt.f.body_mass_g),
    dt.by(dt.f.species, dt.f.sex),
]

Unnamed: 0_level_0,species,sex,body_mass_g
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪
0,Adelie,,3475.0
1,Adelie,Female,3400.0
2,Adelie,Male,4000.0
3,Chinstrap,Female,3550.0
4,Chinstrap,Male,3950.0
5,Gentoo,,4687.5
6,Gentoo,Female,4700.0
7,Gentoo,Male,5500.0


## Multiple Frames
Read, rbind, cbind multiple frames
Join frames using single or multiple keys
Union, intersection, difference of frames

In [None]:
# Read every file in the archive into a list of frames.
data = list(dt.iread('datatableton_sample.zip'))

# NOTE(review): these positions assume the archive yields its files in the
# order (feb, jan, mar, returns) — confirm against the zip contents.
orders_jan, orders_feb, orders_mar = data[1], data[0], data[2]

# Stack the monthly order frames row-wise, January first.
orders_all = dt.rbind(orders_jan, orders_feb, orders_mar)
returns = data[3]

# Key the orders frame on 'Order ID', then join it onto the returns frame.
orders_all.key = 'Order ID'
sales = returns[:, :, dt.join(orders_all)]

## Time Series
Extracting and creating date/time features
Creating lag and lead variables within/without groups
Calculating difference of dates/timestamps

In [None]:
# Read the sample CSV into a datatable Frame
data = dt.fread('datatableton_sample.csv')
# Lag variable: the timestamp column shifted by one row (n=1), ungrouped
data['previous_timestamp'] = dt.shift(dt.f.timestamp, n=1)

## FTRL
Initialization and hyperparameters of the FTRL model
Training and scoring an FTRL model
Performing k-fold cross-validation

In [None]:
from datatable.models import Ftrl

# Load the click data; fill=True lets rows with too few fields be padded with NAs.
# NOTE(review): [1:,:] discards the first data row — confirm this is intended.
data = dt.fread('kdd_ctr.csv', fill=True)[1:,:]
# Split off the 'click' column as the target; it must be captured before the
# del below removes it from the feature frame.
target = data['click']
del data['click']

# Fit an FTRL model with default hyperparameters on the remaining columns
model_ftrl = Ftrl()
model_ftrl.fit(data, target)