# Good-bye Pandas! Meet Terality - Its Evil Twin With Identical Syntax
## ... but 30+ times faster
![](images/unsplash.jpg)

## Setup

In [1]:
import logging
import time
import warnings

import catboost as cb
import datatable as dt
import joblib
import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import shap
import umap
import umap.plot
import xgboost as xgb
from optuna.samplers import TPESampler
from sklearn.compose import *
from sklearn.impute import *
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import *

# Notebook-wide diagnostics: timestamped INFO-level logging on the root
# logger, Optuna limited to WARNING and above, and all warnings filtered out.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%d-%b-%y %H:%M:%S",
    format="%(asctime)s - %(message)s",
)
optuna.logging.set_verbosity(optuna.logging.WARNING)  # only WARNING+ from Optuna
warnings.filterwarnings(action="ignore")  # blanket filter: every warning is ignored
# Render floats with five decimal places in pandas output.
# The fully qualified option name is used on purpose: pandas resolves
# abbreviated names such as "float_format" by pattern matching, which can
# break if a similarly named option is added in a later release.
pd.set_option("display.float_format", "{:.5f}".format)

# Introduction

# Preparing a dataset

```python
import pandas as pd

# Load the source CSV that the benchmark dataset is built from.
df = pd.read_csv("data/train.csv")

# Up-sample with replacement to 60 million rows. The fixed seed makes the
# generated benchmark file identical from one run to the next.
large_df = df.sample(6 * 10 ** 7, replace=True, random_state=42)

# Save to parquet format.
# NOTE(review): row groups are sized from the original frame (`df`), not
# from `large_df` - confirm this is the intended granularity.
large_df.to_parquet(
    "data/tps_may_large.parquet", row_group_size=len(df) // 15, engine="pyarrow"
)
```

In [1]:
from pathlib import Path

# On-disk footprint of the enlarged parquet file.
size = (
    Path("data/tps_may_large.parquet")
    .stat()
    .st_size
)  # raw byte count
size_in_gb = size / 2 ** 30  # 2 ** 30 == 1024 ** 3 bytes per (binary) gigabyte

round(size_in_gb, ndigits=2)  # displayed as the cell's result

7.1

## Pandas vs. Terality - data loading benchmark

In [3]:
import pandas as pd

In [4]:
%%time

# Pandas baseline for the loading benchmark: read the whole parquet file
# into a DataFrame; the enclosing cell's timing magic reports the wall time.
df = pd.read_parquet("data/tps_may_large.parquet")
df.head()  # preview the first rows of the loaded frame

Wall time: 56.3 s


Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
182422,304374,B,I,C,A,E,BI,A,E,AE,...,0.314827,0.337578,0.588459,0.812049,0.399413,0.347415,0.362455,0.479648,0.388591,0
293498,489146,A,I,F,A,H,BI,A,AD,AT,...,0.785501,0.262564,0.797093,0.618762,0.685029,0.516098,0.698928,0.647654,0.393393,0
220335,367472,B,I,A,C,F,AB,A,AH,AY,...,0.472732,0.274317,0.239821,0.734716,0.477748,0.458747,0.339953,0.565235,0.41739,0
215241,358783,A,I,A,A,H,BI,A,AN,AD,...,0.866463,0.745388,0.668823,0.61945,0.547709,0.828951,0.815386,0.363989,0.638822,0
265936,443252,A,J,C,A,E,T,A,A,AX,...,0.308595,0.309102,0.803394,0.299911,0.55434,0.332101,0.243223,0.257364,0.314352,0


In [5]:
# Print the column dtypes together with the frame's memory usage.
# NOTE(review): the recorded output shows a trailing "+" on the total, which
# per the pandas docs marks a lower-bound estimate (object columns are not
# deeply inspected); memory_usage="deep" would give the exact figure at extra
# cost - confirm whether that is wanted here.
df.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60000000 entries, 182422 to 262725
Data columns (total 32 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   cat0    object 
 2   cat1    object 
 3   cat2    object 
 4   cat3    object 
 5   cat4    object 
 6   cat5    object 
 7   cat6    object 
 8   cat7    object 
 9   cat8    object 
 10  cat9    object 
 11  cat10   object 
 12  cat11   object 
 13  cat12   object 
 14  cat13   object 
 15  cat14   object 
 16  cat15   object 
 17  cat16   object 
 18  cat17   object 
 19  cat18   object 
 20  cont0   float64
 21  cont1   float64
 22  cont2   float64
 23  cont3   float64
 24  cont4   float64
 25  cont5   float64
 26  cont6   float64
 27  cont7   float64
 28  cont8   float64
 29  cont9   float64
 30  cont10  float64
 31  target  int64  
dtypes: float64(11), int64(2), object(19)
memory usage: 14.8+ GB
