# Large Datasets Scaling (LDS) - Load less data: fewer attributes, fewer columns

In [7]:
import pandas as pd
import numpy as np
import pyarrow as pa
from pyarrow.parquet import ParquetFile

## Open a file to scale - only a few rows

In [14]:
# Get a handle on the Parquet file so it can be consumed batch by batch.
pf = ParquetFile('timeseries_wide.parquet')

# Request batches of 10 rows and take only the first one off the iterator.
first_ten_rows = next(
    pf.iter_batches(batch_size=10)
)

# Wrap that single batch in a Table and convert it to a small pandas DataFrame.
df = pa.Table.from_batches(
    [first_ten_rows]
).to_pandas()

## Observe the file

In [12]:
# Show the small sample frame; as the last expression of the cell it is rendered as a table.
df

Unnamed: 0_level_0,id_0,name_0,x_0,y_0,id_1,name_1,x_1,y_1,id_2,name_2,...,x_7,y_7,id_8,name_8,x_8,y_8,id_9,name_9,x_9,y_9
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-01 00:00:00,977,Alice,-0.821225,0.906222,975,Bob,-0.288451,-0.215082,1047,Alice,...,-0.371775,0.697468,1048,Alice,0.403201,-0.756503,1025,Charlie,-0.957208,-0.757508
2024-01-01 00:01:00,1018,Bob,-0.219182,0.350855,1032,Alice,0.919521,-0.338915,1043,Bob,...,-0.570205,-0.473155,1037,Bob,-0.690994,-0.623366,981,Alice,-0.414445,-0.100298
2024-01-01 00:02:00,927,Alice,0.660908,-0.798511,967,Alice,0.628664,0.763875,963,Alice,...,-0.690044,-0.912261,987,Bob,0.656727,0.579849,923,Charlie,-0.325838,0.581859
2024-01-01 00:03:00,997,Bob,-0.852458,0.73526,1021,Bob,0.995494,0.514133,952,Charlie,...,-0.397596,0.248303,1013,Bob,-0.132701,-0.173416,1042,Bob,0.992033,-0.686692
2024-01-01 00:04:00,965,Bob,0.717283,0.393391,1011,Bob,-0.143403,-0.282985,973,Charlie,...,0.574683,-0.764567,1010,Charlie,-0.741446,-0.886785,964,Charlie,-0.924556,-0.184161
2024-01-01 00:05:00,980,Charlie,-0.39255,-0.016465,1063,Alice,0.692896,-0.714686,1008,Alice,...,-0.188629,-0.123436,969,Alice,0.22552,-0.672073,1023,Alice,-0.507769,-0.238156
2024-01-01 00:06:00,1039,Alice,-0.29095,0.84527,975,Alice,0.884074,0.369863,956,Charlie,...,-0.769733,-0.219983,1040,Alice,-0.932465,-0.042156,1017,Charlie,-0.383519,0.02962
2024-01-01 00:07:00,1004,Charlie,-0.456968,-0.058761,1046,Bob,-0.947066,-0.864121,1050,Bob,...,-0.55811,-0.634002,1075,Alice,-0.647389,0.598738,975,Bob,0.861001,0.211859
2024-01-01 00:08:00,974,Alice,-0.430249,-0.705546,986,Alice,-0.120741,0.40255,965,Bob,...,-0.690822,0.252864,984,Bob,0.895757,-0.135687,1032,Bob,-0.723231,-0.326186
2024-01-01 00:09:00,1009,Alice,-0.36304,-0.436554,1048,Bob,0.671477,-0.26482,1002,Charlie,...,0.653947,0.455432,992,Bob,-0.698999,0.792666,951,Charlie,-0.816291,0.578756


#### Note: The file has 40 columns. Let's say we need only a few of them.

## Choose only necessary data columns
Let's say that we only need group 0 of the id, name, x, y column groups.

### Set needed columns

In [16]:
# Group 0 only: one column per field (id, name, x, y), each carrying the "_0" suffix.
columns = [f"{field}_0" for field in ("id", "name", "x", "y")]

## Read only necessary columns

In [17]:
# Load the same Parquet file again, this time asking pandas for just the
# columns listed in `columns` instead of every column in the file.
df_needed = pd.read_parquet(
    path="timeseries_wide.parquet",
    columns=columns,
)

In [18]:
# Show the column-restricted frame; as the last expression of the cell it is rendered as a table.
df_needed

Unnamed: 0_level_0,id_0,name_0,x_0,y_0
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-01-01 00:00:00,977,Alice,-0.821225,0.906222
2024-01-01 00:01:00,1018,Bob,-0.219182,0.350855
2024-01-01 00:02:00,927,Alice,0.660908,-0.798511
2024-01-01 00:03:00,997,Bob,-0.852458,0.735260
2024-01-01 00:04:00,965,Bob,0.717283,0.393391
...,...,...,...,...
2024-12-30 23:56:00,1037,Bob,-0.814321,0.612836
2024-12-30 23:57:00,980,Bob,0.232195,-0.618828
2024-12-30 23:58:00,965,Alice,-0.231131,0.026310
2024-12-30 23:59:00,984,Alice,0.942819,0.853128


### Observation
The original file has 40 data columns, of which we only need the first 4.

## Conclusion
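Since we read only 4 of the 40 data columns from the file, we loaded only about 10% of the column data, saving roughly 90% of the memory that loading every column would require. The estimate is approximate: every group has the same id/name/x/y layout, but the timestamp index is loaded in both cases.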