In [1]:
# %load_ext autoreload
# %autoreload 2
import pandas as pd
from atrax import Dataset, Series
from atrax import Atrax as tx
tx.version


'0.0.33'

# Building your own library

We are going to build our own version of pandas. We will start with the Series object, and for every feature we add, we will compare it to the actual pandas equivalent.

## other

### csv

In [7]:
CSV_DATA = """id,name,sales,date
1,Alice,100.5,2024-07-01
2,Bob,200.0,2024-07-02
3,Charlie,150.25,2024-07-03
"""
ds = tx.read_csv(CSV_DATA, from_string=True)

In [8]:
ds

Unnamed: 0,id,name,sales,date
0,1,Alice,100.5,2024-07-01
1,2,Bob,200.0,2024-07-02
2,3,Charlie,150.25,2024-07-03


In [6]:
ds[1]['sales']

200.0

In [9]:
ds.data

[{'id': 1, 'name': 'Alice', 'sales': 100.5, 'date': '2024-07-01'},
 {'id': 2, 'name': 'Bob', 'sales': 200.0, 'date': '2024-07-02'},
 {'id': 3, 'name': 'Charlie', 'sales': 150.25, 'date': '2024-07-03'}]

### to_datetime

In [2]:
dte = tx.to_datetime('2025-01-01')
type(dte)

datetime.datetime

In [3]:
dte

datetime.datetime(2025, 1, 1, 0, 0)

In [4]:
dte = tx.to_datetime('1/1/2025', fmt='%m/%d/%Y')
dte

datetime.datetime(2025, 1, 1, 0, 0)

### date_range

In [3]:
r1 = tx.date_range('1/1/2025', '1/5/2025', fmt='%m/%d/%Y')
r1

[datetime.datetime(2025, 1, 1, 0, 0),
 datetime.datetime(2025, 1, 2, 0, 0),
 datetime.datetime(2025, 1, 3, 0, 0),
 datetime.datetime(2025, 1, 4, 0, 0),
 datetime.datetime(2025, 1, 5, 0, 0)]

In [3]:
r2 = tx.date_range(start='2025-01-01', periods=3, freq='D')
r2

[datetime.datetime(2025, 1, 1, 0, 0),
 datetime.datetime(2025, 1, 2, 0, 0),
 datetime.datetime(2025, 1, 3, 0, 0)]

## Series

In [2]:
s = Series([1,2,3,4, 5, 6, 7, 8, 9, 10, 11], name='nums')
ps = pd.Series([1,2,3,4, 5, 6, 7, 8, 9, 10, 11], name='pands nums')

### Basic Implementation

##### what triggers _repr_html_

When you evaluate an object at the end of a cell in Jupyter or IPython, the display machinery kicks in and calls one of the following (in priority order):

- _repr_mimebundle_
- _repr_html_
- _repr_svg_
- _repr_png_
- _repr_jpeg_
- _repr_latex_
- _repr_json_
- _repr_javascript_
- _repr_markdown_
- _repr_pdf_
- __repr__ or __str__

#### displaying a Series

In [3]:
s

0,1
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10


In [4]:
ps

0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
Name: pands nums, dtype: int64

In [5]:
data = [1,2,3,4]
all(isinstance(x, int) for x in data)


True

In [7]:
s.dtype

'int64'

#### head and tail

In [5]:
s.head(2)

0,1
0,1
1,2
"Name: nums, dtype: int64","Name: nums, dtype: int64"


In [6]:
s.tail(3)

0,1
8,9
9,10
10,11
"Name: nums, dtype: int64","Name: nums, dtype: int64"


### Getting values by indexers

#### iloc

In [8]:
s.iloc[3]

4

In [9]:
s = Series([1,2,3], name='example', index=['a', 'b', 'c'])
s.iloc[1:3]


0,1
b,2
c,3
"Name: example, dtype: int64","Name: example, dtype: int64"


In [10]:
ps.iloc[3]

np.int64(4)

In [11]:
s.iloc[0:3]

0,1
a,1
b,2
c,3
"Name: example, dtype: int64","Name: example, dtype: int64"


In [12]:
ps.iloc[0:3]

0    1
1    2
2    3
Name: pands nums, dtype: int64

#### loc

In [13]:
named_series = Series([1,2,3,4], name='named_series', index=['a', 'b', 'c', 'd'])
pd_named_series = pd.Series([1,2,3,4], name='pd named_series', index=['a', 'b', 'c', 'd'])


In [14]:
named_series

0,1
a,1
b,2
c,3
d,4
"Name: named_series, dtype: int64","Name: named_series, dtype: int64"


In [15]:
pd_named_series

a    1
b    2
c    3
d    4
Name: pd named_series, dtype: int64

In [16]:
pd_named_series.loc['a']

np.int64(1)

In [17]:
named_series.loc['a']

1

In [18]:
pd_named_series.loc['a': 'c']

a    1
b    2
c    3
Name: pd named_series, dtype: int64

In [19]:
named_series.loc['a': 'c']

0,1
a,1
b,2
c,3
"Name: named_series, dtype: int64","Name: named_series, dtype: int64"


In [20]:
pd_named_series.loc[['a', 'c']]

a    1
c    3
Name: pd named_series, dtype: int64

In [7]:
named_series.loc[['a', 'c']]

0,1
a,1
c,3
"Name: named_series, dtype: int64","Name: named_series, dtype: int64"


##### list exploration

In [10]:
# Values, and the labels that index them position-for-position.
my_list = [1, 2, 3, 4, 5]
my_indexes = ['a', 'b', 'c', 'd', 'e']
# Labels to look up, in the order we want the values back.
key = ['a', 'c']

# Pair each label with the value at the same position: label -> value.
index_map = dict(zip(my_indexes, my_list))
print(index_map)
# Label-based selection is then one dict lookup per requested label.
print([index_map[label] for label in key])


{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
[1, 3]


##### slice exploration

In [8]:
my_slice = slice(1,5)
print(my_slice)

slice(1, 5, None)


In [9]:
my_slice.start

1

In [11]:
my_slice.stop

5

### Basic Math

#### add

In [3]:
s = Series([1,2,3,4], name='nums', index=['a', 'b', 'c', 'd'])
s1 = Series([5,6,7,8], name='other', index=['e', 'f', 'g', 'h'])
s3 = Series([9, 10])

In [4]:
s + 1

0,1
0,2
1,3
2,4
3,5
"Name: nums, dtype: int64","Name: nums, dtype: int64"


In [None]:
s + s1

0,1
0,6
1,8
2,10
3,12
"Name: nums, dtype: int64","Name: nums, dtype: int64"


In [16]:
# this should throw an error because the lengths are different
#s1 + s3

#### sub

In [4]:
s - 1

0,1
0,0
1,1
2,2
3,3
"Name: nums, dtype: int64","Name: nums, dtype: int64"


In [5]:
s - s1

0,1
0,-4
1,-4
2,-4
3,-4
"Name: nums, dtype: int64","Name: nums, dtype: int64"


#### lambda exploration

In [9]:
adder = lambda a, b: a + b

In [10]:
print(adder(2,3))

5


In [11]:
data = [1, 2, 3]
other = [1, 2, 3]
def bin_op(other, op):
    """Combine the notebook-level ``data`` list with ``other`` element by element.

    Args:
        other: iterable whose items are used as the right-hand operands.
        op: two-argument callable, called as ``op(data_item, other_item)``.

    Returns:
        A list with one result per pair; pairing stops at the shorter input.
    """
    # map() with two iterables walks them in lockstep, just like zip().
    return list(map(op, data, other))

bin_op(data, adder)

[2, 4, 6]

In [14]:
bin_op(data, lambda a,b: a + b)

[2, 4, 6]

### Reverse Math

In [3]:
s = Series([1,2,3,4], name='rv', index=['a', 'b', 'c', 'd'])

In [4]:
s + 1

0,1
0,2
1,3
2,4
3,5
"Name: rv, dtype: int64","Name: rv, dtype: int64"


In [5]:
1 + s

0,1
0,2
1,3
2,4
3,5
"Name: rv, dtype: int64","Name: rv, dtype: int64"


### Comparisons

In [3]:
s = Series([1, 2, 3, 4, 5, 6, 7, 8, 9], name='nums')
ps = pd.Series([1,2,3,4,5,6,7, 8, 9], name='pd_nums')

In [4]:
s > 4

0,1
0,False
1,False
2,False
3,False
4,True
5,True
6,True
7,True
8,True
"Name: nums > 4, dtype: bool","Name: nums > 4, dtype: bool"


In [5]:
ps > 4

0    False
1    False
2    False
3    False
4     True
5     True
6     True
7     True
8     True
Name: pd_nums, dtype: bool

### Chaining

In [37]:
s = Series([1,2,3,4,5,6,7,8,9])
pd_s = pd.Series([1,2,3,4,5,6,7,8,9])

In [26]:
(s > 2) & (s < 8)


0,1
0,False
1,False
2,True
3,True
4,True
5,True
6,True
7,False
8,False
"Name: , dtype: bool","Name: , dtype: bool"


In [27]:
(pd_s > 2) & (pd_s < 8)

0    False
1    False
2     True
3     True
4     True
5     True
6     True
7    False
8    False
dtype: bool

In [None]:
(pd_s > 1) & 4 # we don't want this; it's the bitwise operator being applied

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
dtype: bool

In [29]:
(s > 1) & 4

TypeError: Operand must be a Series

In [30]:
pd_s2 = pd.Series([1,2,3])
(pd_s > 1) & (pd_s2 < 3)

0    False
1     True
2    False
3    False
4    False
5    False
6    False
7    False
8    False
dtype: bool

In [31]:
s2 = Series([1,2])
(s > 1) & (s2 < 3)

ValueError: Cannot perform operation. Series must have the same length

In [38]:
(s > 5) | (s == 2)

0,1
0,False
1,True
2,False
3,False
4,False
5,True
6,True
7,True
8,True
"Name: , dtype: bool","Name: , dtype: bool"


### Unique

In [9]:
s = Series([1, 2, 2, 3, 3, 3, 4, 5, 6, 6, 7, 8, 9])
u = s.unique()

In [11]:
u.to_list()

AttributeError: 'numpy.ndarray' object has no attribute 'to_list'

#### unique

In [4]:
pd_s = pd.Series([1, 2, 2, 3, 3, 3, 4, 5, 6, 6, 7, 8, 9])
pd_s.unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [5]:
s.nunique()

9

#### nunique

number of unique records

In [7]:
pd_s.nunique()

9

### apply

In [3]:
def square(x):
    """Return ``x`` raised to the power of two."""
    return pow(x, 2)

In [4]:
s = Series([1,2,3])
pd_s = pd.Series([1,2,3])

##### mine

In [7]:
s_apply = s.apply(square)

In [8]:
type(s_apply)

src.Series.series.Series

In [9]:
s_apply

0,1
0,1
1,4
2,9
"Name: , dtype: int64","Name: , dtype: int64"


##### pandas

In [10]:
pd_apply = pd_s.apply(square)

In [11]:
type(pd_apply)

pandas.core.series.Series

In [12]:
pd_apply

0    1
1    4
2    9
dtype: int64

##### extras

In [14]:
s

0,1
0,1
1,2
2,3
"Name: , dtype: int64","Name: , dtype: int64"


In [15]:
s.apply(lambda x: x > 2)

0,1
0,False
1,False
2,True
"Name: , dtype: bool","Name: , dtype: bool"


### map

In [5]:
s = Series([1,2,3], name='example', index=['a', 'b', 'c'])
mapping = {1: 'one', 2: 'two', 3: 'three'}
mapped_s = s.map(mapping)
mapped_s

0,1
a,one
b,two
c,three
"Name: example, dtype: str","Name: example, dtype: str"


In [6]:
mapped_s.data

['one', 'two', 'three']

In [4]:
s.map(lambda x: x * 2)

0,1
a,2
b,4
c,6
"Name: example, dtype: int64","Name: example, dtype: int64"


### astype

In [3]:
s = Series([1,2,3])
c = s.astype('float')
c

0,1
0,1.0
1,2.0
2,3.0
"Name: , dtype: float64","Name: , dtype: float64"


In [4]:
ps = pd.Series([1,2,3])
c = ps.astype('float')
c

0    1.0
1    2.0
2    3.0
dtype: float64

In [6]:
float(1)

1.0

### stats

In [2]:
s = Series([1,2,3, 4])


In [4]:
s.mean()

2.5

In [5]:
round(s.std(),2)

1.29

In [6]:
s.var()

1.6666666666666667

In [7]:
s.var(sample=False)

1.25

In [3]:
s.prod()

24

In [4]:
s.cumsum().data

[1, 3, 6, 10]

In [6]:
s.cumprod().data

[1, 2, 6, 24]

In [10]:
s1 = Series([3, 2, 5, 1, 4])
s1.cummin().data

[3, 2, 2, 1, 1]

In [11]:
s1.cummax().data

[3, 3, 5, 5, 5]

## Dataset

#### imports and data

In [2]:

from test_data import data

In [5]:
#data

### instantiation

In [None]:
#data = {'a': [1, 2, 3], 'b': [4, 5, 6]}

In [3]:
ds = Dataset(data)
ds

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats
5,20000200000,Sirloin Steak,21.99,19.99,meat,3,lb,Matts meats
6,7780522546,American Cheese,4.99,4.49,dairy,4,each,Milky Moes
7,7780580235,1 Gallon Milk,3.99,2.99,dairy,4,each,Milky Moes


In [4]:
#ds.data

In [28]:
ds.columns

['product_code',
 'description',
 'price',
 'cost',
 'department',
 'department_id',
 'type',
 'vendor_name']

In [10]:
data = [{'a': i, 'b': i * 2} for i in range(15)]
ds = Dataset(data)
output = repr(ds)
output

'<Dataset />\na, b\n0, 0\n1, 2\n2, 4\n3, 6\n4, 8\n5, 10\n6, 12\n7, 14\n8, 16\n9, 18\n...(15) total rows'

In [7]:
# Column-oriented input: each key is a column name, each value is that column's list of values.
# These are the values the outputs recorded below were produced from
# (rows (1, 4), (2, 5), (3, 6)); every column must have the same length.
data = {'a': [1, 2, 3], 'b': [4, 5, 6]}
ds = Dataset(data)

In [6]:
pds = pd.DataFrame(data)
pds

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [4]:
ds.len()

8

In [5]:
len(ds)

8

#### exploration

##### check the lengths

In [37]:
# check to make sure all columns have same length
len(set([len(v) for v in data.values()]))

1

##### create the dataset

In [6]:
# each key represents a column and each value is a list of column values
keys = list(data.keys())
keys

['a', 'b']

In [None]:
data.values()

dict_values([[1, 2, 3], [4, 5, 6]])

In [None]:
values = zip(*data.values())
values

<zip at 0x2a9806f3a40>

In [None]:
# *data.values() unpacks this into separate arguments to zip
zip([1, 2, 3], [4, 5, 6])

<zip at 0x2a9806f0140>

this should look like this:

values = [
    (1,4),
    (2,5),
    (3,6)
]

In [None]:
output = [dict(zip(keys, row)) for row in values]
print(output)
# this zips ['a', 'b'] with (1,4) -> {'a': 1, 'b': 4}, etc

[{'a': 1, 'b': 4}, {'a': 2, 'b': 5}, {'a': 3, 'b': 6}]


In [None]:
# simplified version
columns = {'a': [1, 2, 3], 'b': [4, 5, 6]}
rows = []
# Unpacking the column lists into zip() walks them in lockstep: one tuple per row.
for row_values in zip(*columns.values()):
    print(row_values)
    # Re-attach the column names to this row's values.
    row_dict = {name: value for name, value in zip(columns, row_values)}
    rows.append(row_dict)

rows

(1, 4)
(2, 5)
(3, 6)


[{'a': 1, 'b': 4}, {'a': 2, 'b': 5}, {'a': 3, 'b': 6}]

### stats

In [9]:
ds = Dataset(data)
pds = pd.DataFrame(data)

In [19]:
#ds

##### head and tail

In [5]:
ds.head(2)

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us


In [6]:
ds.tail(2)

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
6,7780522546,American Cheese,4.99,4.49,diary,4,each,Milky Moes
7,7780580235,1 Gallon Mile,3.99,2.99,diary,4,each,Milky Moes


In [9]:
pds = pd.DataFrame(data)
pds.head()

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats


In [10]:
pds.tail()

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats
5,20000200000,Sirloin Steak,21.99,19.99,meat,3,lb,Matts meats
6,7780522546,American Cheese,4.99,4.49,diary,4,each,Milky Moes
7,7780580235,1 Gallon Mile,3.99,2.99,diary,4,each,Milky Moes


##### describe

In [10]:
ds.shape()

(8, 8)

In [11]:
pds.shape

(8, 8)

In [23]:
ds.columns

['product_code',
 'description',
 'price',
 'cost',
 'department',
 'department_id',
 'type',
 'vendor_name']

In [25]:
pds.columns

Index(['product_code', 'description', 'price', 'cost', 'department',
       'department_id', 'type', 'vendor_name'],
      dtype='object')

In [4]:
ds.describe()

Unnamed: 0,stat,product_code,description,price,cost,department,department_id,type,vendor_name
0,mean,,,7.7,6.87,,2.5,,
1,std,,,6.33,5.98,,1.2,,
2,min,,,2.49,1.99,,1.0,,
3,Q1,,,4.95,3.99,,2.0,,
4,median,,,4.97,4.24,,2.5,,
5,Q3,,,6.99,6.49,,3.0,,
6,max,,,21.99,19.99,,4.0,,
7,count,,,8.0,8.0,,8.0,,


In [5]:
ds.describe(numeric_only=True)

Unnamed: 0,stat,price,cost,department_id
0,mean,7.7,6.87,2.5
1,std,6.33,5.98,1.2
2,min,2.49,1.99,1.0
3,Q1,4.95,3.99,2.0
4,median,4.97,4.24,2.5
5,Q3,6.99,6.49,3.0
6,max,21.99,19.99,4.0
7,count,8.0,8.0,8.0


In [26]:
pds.describe()

Unnamed: 0,price,cost,department_id
count,8.0,8.0,8.0
mean,7.7,6.865,2.5
std,6.332321,5.980623,1.195229
min,2.49,1.99,1.0
25%,4.71,3.74,1.75
50%,4.97,4.24,2.5
75%,8.055,7.615,3.25
max,21.99,19.99,4.0


In [16]:
ds.info()

<class 'Dataset'>
Range Index: 8 entries
Data columns (total 8 columns):
Column          | Dtype      | Non-Null   | Total     
--------------------------------------------------
product_code    | str        | 8          | 8         
description     | str        | 8          | 8         
price           | float      | 8          | 8         
cost            | float      | 8          | 8         
department      | str        | 8          | 8         
department_id   | int        | 8          | 8         
type            | str        | 8          | 8         
vendor_name     | str        | 8          | 8         


In [10]:
pds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_code   8 non-null      object 
 1   description    8 non-null      object 
 2   price          8 non-null      float64
 3   cost           8 non-null      float64
 4   department     8 non-null      object 
 5   department_id  8 non-null      int64  
 6   type           8 non-null      object 
 7   vendor_name    8 non-null      object 
dtypes: float64(2), int64(1), object(5)
memory usage: 644.0+ bytes


In [19]:
ds = Dataset([
    {"a": 1}, {"a": 2}, {"a": 3}
])
head = ds.head()
head.shape()

(3, 1)

In [21]:
ds = Dataset([{'x': 1}, {'x': 3}, {'x': 5}])
summary = ds.describe()
summary.data

[{'stat': 'mean', 'x': 3},
 {'stat': 'std', 'x': 2.0},
 {'stat': 'min', 'x': 1},
 {'stat': 'Q1', 'x': 1},
 {'stat': 'median', 'x': 3},
 {'stat': 'Q3', 'x': 5},
 {'stat': 'max', 'x': 5},
 {'stat': 'count', 'x': 3}]

In [24]:
ds = Dataset([
    {
        'id': 1,
        'name': 'sally'
    },
    {
        'id': 2,
        'name': 'billy'
    }
])
summary = ds.describe(numeric_only=True)
summary.data

[{'stat': 'mean', 'id': 1.5},
 {'stat': 'std', 'id': 0.71},
 {'stat': 'min', 'id': 1},
 {'stat': 'Q1', 'id': 1},
 {'stat': 'median', 'id': 1.5},
 {'stat': 'Q3', 'id': 2},
 {'stat': 'max', 'id': 2},
 {'stat': 'count', 'id': 2}]

In [25]:
ds = Dataset([
    {'a': 1, 'b': 'hello'},
    {'a': 2, 'b': 'world'},
    {'a': None, 'b': '!'}
])
ds.info()

<class 'Dataset'>
Range Index: 3 entries
Data columns (total 2 columns):
Column          | Dtype      | Non-Null   | Total     
--------------------------------------------------
a               | int        | 2          | 3         
b               | str        | 3          | 3         


In [27]:
ds = Dataset([])
ds.info()

<class 'Dataset'>
Range Index: 0 entries
Data columns (total 0 columns):
   No data available


In [28]:
from datetime import datetime
ds = Dataset([
    {
        'id': 1,
        'name': 'mike',
        'successfull': False,
        'died': datetime(2026, 1, 1),
        'max_pee_amount_in_gallons': 12.25
    }
]) 
ds.info() 

<class 'Dataset'>
Range Index: 1 entries
Data columns (total 5 columns):
Column          | Dtype      | Non-Null   | Total     
--------------------------------------------------
id              | int        | 1          | 1         
name            | str        | 1          | 1         
successfull     | bool       | 1          | 1         
died            | datetime   | 1          | 1         
max_pee_amount_in_gallons | float      | 1          | 1         


##### exploration

In [7]:
{
    col: [row[col] for row in data if isinstance(row.get(col), (int, float))] for col in ds.columns
}

{'product_code': [],
 'description': [],
 'price': [4.95, 4.95, 6.99, 2.49, 11.25, 21.99, 4.99, 3.99],
 'cost': [3.99, 3.99, 6.49, 1.99, 10.99, 19.99, 4.49, 2.99],
 'department': [],
 'department_id': [1, 1, 2, 2, 3, 3, 4, 4],
 'type': [],
 'vendor_name': []}

In [9]:
# Build {column name: list of that column's numeric values}, leaving out
# columns that contain no int/float values at all.
numeric_data = {
    # keep a row's value only when the value itself is an int or a float
    col: [row[col] for row in data if isinstance(row.get(col), (int, float))]
    for col in ds.columns
    # without this filter, non-numeric columns would still appear, mapped to []
    if any(isinstance(row.get(col), (int, float)) for row in data)
}
# NOTE(review): bool is a subclass of int in Python, so True/False values also
# pass the isinstance check above — confirm that is intended.
numeric_data

{'price': [4.95, 4.95, 6.99, 2.49, 11.25, 21.99, 4.99, 3.99],
 'cost': [3.99, 3.99, 6.49, 1.99, 10.99, 19.99, 4.49, 2.99],
 'department_id': [1, 1, 2, 2, 3, 3, 4, 4]}

### indexers

In [3]:
ds = Dataset(data)
ds.head()

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats


##### iloc

In [None]:
# give me the second row
ds.iloc[1] # single row

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us


In [None]:
ds.iloc[1:3] # rows 1..2, all columns

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys


In [54]:
ds.iloc[:,0] # all rows, first column

Unnamed: 0,product_code
0,2663380147
1,2663358001
2,3644920475
3,3644989541
4,20000100000
5,20000200000
6,7780522546
7,7780580235


In [39]:
ds.iloc[[0, 3], [1]] # rows 0 and 3, 2nd column

IndexError: list index out of range

In [None]:
ds.iloc[:, [0, 1, 2]] # all rows, and just the first 3 columns

Unnamed: 0,product_code,description,price
0,2663380147,Salt and Pepper Chips,4.95
1,2663358001,Barbecue Chips,4.95
2,3644920475,Toilet Paper,6.99
3,3644989541,Tissue Paper,2.49
4,20000100000,Pork Chops,11.25
5,20000200000,Sirloin Steak,21.99
6,7780522546,American Cheese,4.99
7,7780580235,1 Gallon Mile,3.99


In [None]:
ds.iloc[:4, [0, 1, 2]] # give me the first 4 rows and only these columns

Unnamed: 0,product_code,description,price
0,2663380147,Salt and Pepper Chips,4.95
1,2663358001,Barbecue Chips,4.95
2,3644920475,Toilet Paper,6.99
3,3644989541,Tissue Paper,2.49


In [None]:
ds.iloc[:4, (0, 1, 2)] # this uses a tuple to do the same thing

Unnamed: 0,product_code,description,price
0,2663380147,Salt and Pepper Chips,4.95
1,2663358001,Barbecue Chips,4.95
2,3644920475,Toilet Paper,6.99
3,3644989541,Tissue Paper,2.49


##### test data

In [31]:
ds = Dataset({
    'name': ['alice', 'bob', 'charlie'],
    'age': [25, 30, 35],
    'score': [88.5, 92.0, 95.0]
}, index=['a', 'b', 'c'])

In [40]:
ds.head()

Unnamed: 0,name,age,score
0,alice,25,88.5
1,bob,30,92.0
2,charlie,35,95.0


In [34]:
r = ds.iloc[0]
r.data

[{'name': 'alice', 'age': 25, 'score': 88.5}]

In [36]:
r = ds.iloc[1:3] # rows 1..2, all columns
r.data

[{'name': 'bob', 'age': 30, 'score': 92.0},
 {'name': 'charlie', 'age': 35, 'score': 95.0}]

In [37]:
r = ds.iloc[:,0] # all rows, first column
r.data

[{'name': 'alice'}, {'name': 'bob'}, {'name': 'charlie'}]

In [41]:
r = ds.iloc[[0, 2], [1]] # rows 0 and 2, 2nd column
r.data

[{'age': 25}, {'age': 35}]

In [42]:
r = ds.iloc[:, [0, 1]] # all rows, and just the first 2 columns
r.data

[{'name': 'alice', 'age': 25},
 {'name': 'bob', 'age': 30},
 {'name': 'charlie', 'age': 35}]

In [45]:
r = ds.iloc[:, (0, 1)] # this uses a tuple to do the same thing
r.data

[{'name': 'alice', 'age': 25},
 {'name': 'bob', 'age': 30},
 {'name': 'charlie', 'age': 35}]

##### loc

In [4]:
ds.loc['0']

In [2]:
ds = Dataset({
    'name': ['alice', 'bob', 'charlie'],
    'age': [25, 30, 35],
    'score': [88.5, 92.0, 95.0]
}, index=['a', 'b', 'c'])

In [3]:
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'score': [88.5, 92.0, 95.0]
}, index=['a', 'b', 'c'])

In [4]:
ds.head()

Unnamed: 0,name,age,score
0,alice,25,88.5
1,bob,30,92.0
2,charlie,35,95.0


In [5]:
ds.loc['a']

Unnamed: 0,name,age,score
a,alice,25,88.5


In [46]:
# test
r = ds.loc['a']
r.data

[{'name': 'alice', 'age': 25, 'score': 88.5}]

In [6]:
df.loc['a']

name     Alice
age         25
score     88.5
Name: a, dtype: object

In [7]:
ds.loc[['a', 'c']]

Unnamed: 0,name,age,score
a,alice,25,88.5
c,charlie,35,95.0


In [47]:
# test
r = ds.loc[['a', 'c']]
r.data

[{'name': 'alice', 'age': 25, 'score': 88.5},
 {'name': 'charlie', 'age': 35, 'score': 95.0}]

In [8]:
df.loc[['a', 'c']]

Unnamed: 0,name,age,score
a,Alice,25,88.5
c,Charlie,35,95.0


In [9]:
ds.loc['b', 'score']

Unnamed: 0,score
b,92.0


In [48]:
# test
r = ds.loc['b', 'score']
r.data

[{'score': 92.0}]

In [35]:
df.loc['b', 'score']

np.float64(92.0)

In [11]:
ds.loc['a': 'd']

Unnamed: 0,name,age,score
a,alice,25,88.5
b,bob,30,92.0
c,charlie,35,95.0


In [49]:
# test
r = ds.loc['a': 'c']
r.data

[{'name': 'alice', 'age': 25, 'score': 88.5},
 {'name': 'bob', 'age': 30, 'score': 92.0},
 {'name': 'charlie', 'age': 35, 'score': 95.0}]

In [12]:
df.loc['a': 'd']

Unnamed: 0,name,age,score
a,Alice,25,88.5
b,Bob,30,92.0
c,Charlie,35,95.0


In [14]:
ds.loc[['a', 'b'], ['name', 'score']]

Unnamed: 0,name,score
a,alice,88.5
b,bob,92.0


In [50]:
# test
r = ds.loc[['a', 'b'], ['name', 'score']]
r.data

[{'name': 'alice', 'score': 88.5}, {'name': 'bob', 'score': 92.0}]

In [15]:
df.loc[['a', 'c'], ['name', 'score']]

Unnamed: 0,name,score
a,Alice,88.5
c,Charlie,95.0


In [56]:
r = ds.loc[['a'], None]
r.columns

['name', 'age', 'score']

In [57]:
ds.loc[lambda row: row['price'] > 5]

KeyError: 'price'

In [18]:
ds = Dataset(data)
ds.head()

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats


In [19]:
ds.loc['0']

In [20]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats


### get_item

In [4]:
ds

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats
5,20000200000,Sirloin Steak,21.99,19.99,meat,3,lb,Matts meats
6,7780522546,American Cheese,4.99,4.49,dairy,4,each,Milky Moes
7,7780580235,1 Gallon Milk,3.99,2.99,dairy,4,each,Milky Moes


In [7]:
mask = ds['department'] == 'meat'
mask

0,1
0,False
1,False
2,False
3,False
4,True
5,True
6,False
7,False
"Name: department, dtype: bool","Name: department, dtype: bool"


In [None]:
ds.loc[mask, ['product_code', 'description', 'price']]

Unnamed: 0,product_code,description,price
4,20000100000,Pork Chops,11.25
5,20000200000,Sirloin Steak,21.99


In [8]:
r = ds.loc[mask, ['product_code', 'description', 'price']]
r.data

[{'product_code': '0020000100000',
  'description': 'Pork Chops',
  'price': 11.25},
 {'product_code': '0020000200000',
  'description': 'Sirloin Steak',
  'price': 21.99}]

In [8]:
ds.loc[ds['price'] > 5, ['product_code', 'description', 'price']]

Unnamed: 0,product_code,description,price
2,3644920475,Toilet Paper,6.99
4,20000100000,Pork Chops,11.25
5,20000200000,Sirloin Steak,21.99


In [5]:
mask1 = ds['department'] == 'meat'
mask2 = ds['price'] > 5
ds.loc[mask1 & mask2, ['product_code', 'description', 'price']]

Unnamed: 0,product_code,description,price
4,20000100000,Pork Chops,11.25
5,20000200000,Sirloin Steak,21.99


### set item

In [35]:
ds.head()

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name,total,overall,sale_type
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us,5.445,overall,normal
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us,5.445,overall,normal
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys,7.689000000000001,overall,expensive
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys,2.7390000000000003,overall,normal
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats,12.375000000000002,overall,holy shit


In [18]:
ds['total'] = lambda row: row['price'] * 1.1

In [23]:
ds['sale_type'] = 'normal'

In [31]:
ds.loc[ds['price'] > 5, 'sale_type'] = 'expensive'

In [33]:
# Assign through a callable for the rows where price > 10. The callable is
# given the row but ignores it here, so every matching row gets the same
# constant label (a plain string literal; no f-string is needed).
ds.loc[ds['price'] > 10, 'sale_type'] = lambda row: "holy shit"

In [34]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name,total,overall,sale_type
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us,5.445,overall,normal
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us,5.445,overall,normal
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys,7.689,overall,expensive
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys,2.739,overall,normal
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats,12.375,overall,holy shit


In [29]:
df.loc[df['price'] > 5, 'sale_type'] = 'expensive'

In [30]:
df.head()

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name,total,overall,sale_type
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us,5.445,overall,normal
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us,5.445,overall,normal
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys,7.689,overall,expensive
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys,2.739,overall,normal
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats,12.375,overall,expensive


##### tests

In [36]:
ds = Dataset([
    {'a': 1}, {'a': 2}, {'a': 3}
]) 
mask = Series([True, False, True])
ds.loc[mask, 'flag'] = ['x', 'y']
ds

Unnamed: 0,a,flag
0,1,x
1,2,
2,3,y


In [5]:
ds = Dataset([
        {'a': 10}, {'a': 20}, {'a': 30}
    ])

In [6]:
mask = Series([False, True, True])

In [7]:
val_series = Series([100, 200])
ds.loc[mask, 'bonus'] = val_series

### set index

In [37]:
data = [
    {'id': 1, 'cat': 'A', 'val': 10},
    {'id': 2, 'cat': 'B', 'val': 20},
    {'id': 3, 'cat': 'A', 'val': 30},
]
ds = Dataset(data)

In [42]:
ds.head()

Unnamed: 0,cat,val
0,A,10
1,B,20
2,A,30


In [39]:
ret = ds.set_index('id', inplace=True)
ret

In [40]:
ds._index

[1, 2, 3]

In [27]:
ds._index_name

'id'

In [14]:
f = lambda row: row["cat"].lower()

In [15]:
ds.set_index(f, inplace=True)

In [16]:
ds._index

['a', 'b', 'a']

### drop

In [5]:
ds.head()

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats


In [6]:
# go clean this up in the code to make it work
ds.info()

<class 'Dataset'>
Range Index: 8 entries
Data columns (total 8 columns):
Column          | Dtype      | Non-Null   | Total     
--------------------------------------------------
product_code    | str        | 8          | 8         
description     | str        | 8          | 8         
price           | float      | 8          | 8         
cost            | float      | 8          | 8         
department      | str        | 8          | 8         
department_id   | int        | 8          | 8         
type            | str        | 8          | 8         
vendor_name     | str        | 8          | 8         


In [2]:
data = [
    {"id": 1, "cat": "A", "val": 10},
    {"id": 2, "cat": "B", "val": 20},
    {"id": 3, "cat": "C", "val": 30},
]
ds = Dataset(data)
ds.head()

Unnamed: 0,id,cat,val
0,1,A,10
1,2,B,20
2,3,C,30


In [3]:
ds.set_index("id", inplace=True)

In [4]:
# test 1
ds.drop(index=[1], inplace=True)
ds

keep_mask: [True, False, True]


id,cat,val
1,A,10
3,C,30


In [5]:
ds._index

[1, 3]

In [15]:
# test 2
ds1 = ds.drop(index=[1], inplace=False)
ds1

id,cat,val
1,A,10
2,C,30


In [16]:
ds

id,cat,val
1,A,10
2,B,20
3,C,30


In [17]:
# test drop columns without modifying ds (inplace=False)
ds2 = ds.drop(columns=['cat'], inplace=False)
ds2


id,val
1,10
2,20
3,30


In [18]:
ds.drop(columns=['cat'], inplace=True)
ds

id,val
1,10
2,20
3,30


In [21]:
# test drop columns and rows
ds.drop(columns=['cat'], index=[1], inplace=True)
ds

id,val
1,10
2,30


In [8]:
my_list = [1, 2, 3, 4, 5, 6]
del_list = [1, 3]
# drop the elements of my_list whose 0-based position is listed in del_list.
# enumerate() supplies the real position, so this works for any values
# (comparing `value - 1` only works when the values happen to be 1..n).

kept = [value for position, value in enumerate(my_list) if position not in del_list]
kept

[1, 3, 5, 6]

In [6]:
data = [
    {'id': 1, 'cat': 'A', 'val': 10},
    {'id': 2, 'cat': 'B', 'val': 20},
    {'id': 3, 'cat': 'C', 'val': 30},
]
ds = Dataset(data)
ds.set_index('id', inplace=True)
ds.columns


['cat', 'val']

### rename

In [2]:
ds = Dataset([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
ds.rename(columns={'a': 'x', 'b': 'y'}, inplace=True)
ds

Unnamed: 0,x,y
0,1,2
1,3,4


### groupby

In [2]:
ds = Dataset([
    {'store': 'A', 'sales': 100, 'returns': 5},
    {'store': 'A', 'sales': 150, 'returns': 3},
    {'store': 'B', 'sales': 200, 'returns': 2},
    {'store': 'B', 'sales': 100, 'returns': 4},
    {'store': 'B', 'sales': 150, 'returns': 1} 
])

In [3]:
type(ds)

atrax.Dataset.dataset.Dataset

In [4]:
ds.groupby('store')

<atrax.Dataset.group.GroupBy at 0x22426a22ed0>

In [8]:
result = ds.groupby('store').size()
result

Unnamed: 0,store,size
0,A,2
1,B,3


In [5]:
ds = Dataset(data)
ds.head()

Unnamed: 0,product_code,description,price,cost,department,department_id,type,vendor_name
0,2663380147,Salt and Pepper Chips,4.95,3.99,grocery,1,each,Chips r Us
1,2663358001,Barbecue Chips,4.95,3.99,grocery,1,each,Chips r Us
2,3644920475,Toilet Paper,6.99,6.49,non food,2,each,Home Comfys
3,3644989541,Tissue Paper,2.49,1.99,non food,2,each,Home Comfys
4,20000100000,Pork Chops,11.25,10.99,meat,3,lb,Matts meats


In [14]:
ds.groupby(by=['product_code', 'description'], sort=True).sum()

Unnamed: 0,price_sum,department_id_sum,cost_sum,product_code,description
0,4.95,1,3.99,2663358001,Barbecue Chips
1,4.95,1,3.99,2663380147,Salt and Pepper Chips
2,6.99,2,6.49,3644920475,Toilet Paper
3,2.49,2,1.99,3644989541,Tissue Paper
4,4.99,4,4.49,7780522546,American Cheese
5,3.99,4,2.99,7780580235,1 Gallon Milk
6,11.25,3,10.99,20000100000,Pork Chops
7,21.99,3,19.99,20000200000,Sirloin Steak


In [10]:
ds.groupby(by=['product_code', 'description']).agg(
    price_total=('price', 'sum'),
    cost_total=('cost','avg')
    )

Unnamed: 0,price_total,cost_total,product_code,description
0,4.95,3.99,2663380147,Salt and Pepper Chips
1,4.95,3.99,2663358001,Barbecue Chips
2,6.99,6.49,3644920475,Toilet Paper
3,2.49,1.99,3644989541,Tissue Paper
4,11.25,10.99,20000100000,Pork Chops
5,21.99,19.99,20000200000,Sirloin Steak
6,4.99,4.49,7780522546,American Cheese
7,3.99,2.99,7780580235,1 Gallon Milk


In [17]:
def custom(rows):
    """Summarise one group of rows as squared totals.

    Args:
        rows: re-iterable collection of row mappings, each holding numeric
            'price' and 'cost' entries.

    Returns:
        dict with 'custom_price' and 'custom_cost' (in that order): the square
        of the summed price / cost over ``rows``, rounded to 2 decimal places.
    """
    # (output key, source column) pairs, in the order the keys must appear.
    targets = (('custom_price', 'price'), ('custom_cost', 'cost'))
    summary = {}
    for out_key, column in targets:
        column_total = sum(row[column] for row in rows)
        summary[out_key] = round(column_total ** 2, 2)
    return summary

In [18]:
ds.groupby(by=['product_code', 'description']).apply(custom)

Unnamed: 0,custom_price,custom_cost,product_code,description
0,24.5,15.92,2663380147,Salt and Pepper Chips
1,24.5,15.92,2663358001,Barbecue Chips
2,48.86,42.12,3644920475,Toilet Paper
3,6.2,3.96,3644989541,Tissue Paper
4,126.56,120.78,20000100000,Pork Chops
5,483.56,399.6,20000200000,Sirloin Steak
6,24.9,20.16,7780522546,American Cheese
7,15.92,8.94,7780580235,1 Gallon Milk


In [9]:
def add_bonus(rows):
    """Return one record per input row carrying its price and a 10% bonus.

    Args:
        rows: iterable of row mappings with a numeric 'price' entry.

    Returns:
        list of ``{'price': ..., 'bonus': ...}`` dicts in input order, where
        bonus is ``price * 0.1`` rounded to 2 decimal places.
    """
    records = []
    for row in rows:
        price = row['price']
        records.append({'price': price, 'bonus': round(price * 0.1, 2)})
    return records

In [10]:
ds.groupby(by=['product_code', 'description']).transform(add_bonus)

Unnamed: 0,price,bonus,product_code,description
0,4.95,0.5,2663380147,Salt and Pepper Chips
1,4.95,0.5,2663358001,Barbecue Chips
2,6.99,0.7,3644920475,Toilet Paper
3,2.49,0.25,3644989541,Tissue Paper
4,11.25,1.12,20000100000,Pork Chops
5,21.99,2.2,20000200000,Sirloin Steak
6,4.99,0.5,7780522546,American Cheese
7,3.99,0.4,7780580235,1 Gallon Milk


In [11]:
ds.groupby(by=['product_code', 'description']).describe()

Unnamed: 0,price_count,price_mean,price_min,price_max,department_id_count,department_id_mean,department_id_min,department_id_max,cost_count,cost_mean,cost_min,cost_max,product_code,description
0,1,4.95,4.95,4.95,1,1.0,1,1,1,3.99,3.99,3.99,2663380147,Salt and Pepper Chips
1,1,4.95,4.95,4.95,1,1.0,1,1,1,3.99,3.99,3.99,2663358001,Barbecue Chips
2,1,6.99,6.99,6.99,1,2.0,2,2,1,6.49,6.49,6.49,3644920475,Toilet Paper
3,1,2.49,2.49,2.49,1,2.0,2,2,1,1.99,1.99,1.99,3644989541,Tissue Paper
4,1,11.25,11.25,11.25,1,3.0,3,3,1,10.99,10.99,10.99,20000100000,Pork Chops
5,1,21.99,21.99,21.99,1,3.0,3,3,1,19.99,19.99,19.99,20000200000,Sirloin Steak
6,1,4.99,4.99,4.99,1,4.0,4,4,1,4.49,4.49,4.49,7780522546,American Cheese
7,1,3.99,3.99,3.99,1,4.0,4,4,1,2.99,2.99,2.99,7780580235,1 Gallon Milk


### sort

In [3]:
# Four id/name records (ids 1-4 paired with their English names) for the
# sort_values examples.
data = [
    {'id': number, 'name': word}
    for number, word in enumerate(('one', 'two', 'three', 'four'), start=1)
]

In [4]:
ds = Dataset(data)
ds.head()

Unnamed: 0,id,name
0,1,one
1,2,two
2,3,three
3,4,four


In [6]:
ds.sort_values(by='id', ascending=True)

Unnamed: 0,id,name
0,1,one
1,2,two
2,3,three
3,4,four


In [2]:
# Store records for sorting on a string key; the store C row has sales=None.
data = [
    dict(store=store, sales=sales, profit=profit)
    for store, sales, profit in (
        ("B", 100, 20),
        ("A", 200, 40),
        ("C", None, 10),
        ("A", 200, 30),
    )
]

In [3]:
ds = Dataset(data)
ds.head()

Unnamed: 0,store,sales,profit
0,B,100.0,20
1,A,200.0,40
2,C,,10
3,A,200.0,30


In [5]:
ds.sort_values(by='store', ascending=True)

Unnamed: 0,store,sales,profit
0,A,200.0,40
1,A,200.0,30
2,B,100.0,20
3,C,,10


### mean

In [None]:
# Small 3x3 numeric dataset (columns a, b, c) for the mean examples.
ds = Dataset([
    {'a': 1, 'b': 2, 'c': 3},
    {'a': 4, 'b': 5, 'c': 6},
    {'a': 7, 'b': 8, 'c': 9}
])

# axis=0 gives one mean per column, axis=1 one mean per row; the recorded
# output shows whole-number means as ints rather than floats.
print(ds.mean(axis=0))  # {'a': 4, 'b': 5, 'c': 6}
print(ds.mean(axis=1))  # [2, 5, 8]

{'a': 4, 'b': 5, 'c': 6}
[2, 5, 8]


### preprocessing

In [2]:
ds = tx.read_csv('sales_multi_store.csv')

In [3]:
ds.head()

Unnamed: 0,store_number,store_name,sale_date,total_sales
0,1,IGA 1,2025-06-01,23651.720000000696
1,2,IGA #002,2025-06-01,12867.159999999953
2,440,Buy Low #440,2025-06-01,31503.62999999985
3,1,IGA 1,2025-06-02,22397.15000000042
4,2,IGA #002,2025-06-02,8687.549999999876


In [4]:
# Pass the sale_date column through tx.to_datetime and store the result back.
# NOTE(review): an assignment displays nothing, so the list recorded below this
# cell (total_sales_sum per store_number) looks like stale output from another
# cell, presumably a groupby('store_number') sum -- re-run and confirm.
ds['sale_date'] = tx.to_datetime(ds['sale_date'])

[{'total_sales_sum': 370595.01000000443,
  'store_number_sum': 0,
  'store_number': 1},
 {'total_sales_sum': 195088.4299999987,
  'store_number_sum': 0,
  'store_number': 2},
 {'total_sales_sum': 443000.4000000093,
  'store_number_sum': 0,
  'store_number': 440}]

In [6]:
# Intended: 7-row rolling mean of total_sales within each store.
# Fails as recorded below: the transform callback is handed a plain list, which
# has no .rolling(), hence the AttributeError.
ds.groupby('store_number')['total_sales'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

AttributeError: 'list' object has no attribute 'rolling'

In [None]:
# Same rolling-mean transform as the previous cell, assigned to a new 'trend' column.
# NOTE(review): never executed (In [None]); the lambda is identical to the one
# that raised AttributeError above, so this presumably fails the same way.
ds['trend'] = ds.groupby('store_number')['total_sales'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

In [2]:
# Two-store sample (A: 2 rows, B: 3 rows). As recorded below, last() reports the
# final row of each store group, with every column suffixed '_last' (including
# the grouping column itself) plus the plain 'store' key.
ds = Dataset([
    dict(store='A', sales=100, returns=5),
    dict(store='A', sales=150, returns=3),
    dict(store='B', sales=200, returns=2),
    dict(store='B', sales=100, returns=4),
    dict(store='B', sales=150, returns=1),
])
grouped = ds.groupby('store')
d = grouped.last().data
d

[{'store_last': 'A', 'sales_last': 150, 'returns_last': 3, 'store': 'A'},
 {'store_last': 'B', 'sales_last': 150, 'returns_last': 1, 'store': 'B'}]

In [3]:
d[0]['sales_last']

150

In [4]:
data = [
    {'store_number': 1, 'total_sales': 100, 'items_sold': 10},
    {'store_number': 1, 'total_sales': 150, 'items_sold': 15},
    {'store_number': 2, 'total_sales': 200, 'items_sold': 20},
    {'store_number': 2, 'total_sales': 50,  'items_sold': 5}
]  
ds = Dataset(data)
result = ds.groupby('store_number')['total_sales'].sum()
result

{1: 250, 2: 250}

In [5]:
result[1]

250

In [6]:
result = ds.groupby('store_number')['total_sales'].agg('mean')
result

{1: 125.0, 2: 125.0}

In [7]:
result = ds.groupby('store_number')['total_sales'].agg(lambda x: max(x) - min(x))
result

{1: 50, 2: 150}

In [18]:
def _share_of_group(values):
    """Return each value as a fraction of its group's total.

    Args:
        values: the list of total_sales values for one store group.

    Returns:
        list of ``value / sum(values)`` in input order.
    """
    group_total = sum(values)  # computed once per group, not once per value
    return [val / group_total for val in values]


result = ds.groupby('store_number')['total_sales'].transform(_share_of_group)
result.data

[{'total_sales': 0.4, 'store_number': 1},
 {'total_sales': 0.6, 'store_number': 1},
 {'total_sales': 0.8, 'store_number': 2},
 {'total_sales': 0.2, 'store_number': 2}]

In [17]:
# First transformed row's share of its store total.
# NOTE(review): the 1.0 recorded below predates the In [18] run shown above
# (this cell is In [17]); that run lists 0.4 for row 0 -- re-execute to refresh.
result.data[0]['total_sales']

1.0

In [9]:
# Selecting 'some_column', which is not a key of the four-row store sample
# (assuming ds is still that dataset). As recorded below, this still returns a
# GroupBy object instead of raising.
# NOTE(review): confirm whether failing at selection time is the intended behaviour.
ds.groupby('store_number')['some_column']

<atrax.Dataset.group.GroupBy at 0x13003ed0110>

## Date Accessor

In [2]:
# Three id/sale_date records: ids 1-3 dated 1/1/2025 through 1/3/2025, written
# as M/D/YYYY strings for the date-accessor examples.
test_data = [
    {'id': day, 'sale_date': f'1/{day}/2025'}
    for day in range(1, 4)
]

In [3]:
ds = Dataset(test_data)
ds.head()

Unnamed: 0,id,sale_date
0,1,1/1/2025
1,2,1/2/2025
2,3,1/3/2025


In [6]:
ds['weekday'] = ds['sale_date'].dt.weekday
ds

Unnamed: 0,id,sale_date,weekday
0,1,1/1/2025,2
1,2,1/2/2025,3
2,3,1/3/2025,4


In [7]:
ds['is_weekend'] = ds['sale_date'].dt.is_weekend.astype(int)
ds

Unnamed: 0,id,sale_date,weekday,is_weekend
0,1,1/1/2025,2,0
1,2,1/2/2025,3,0
2,3,1/3/2025,4,0


In [8]:
ds['month'] = ds['sale_date'].dt.month
ds

Unnamed: 0,id,sale_date,weekday,is_weekend,month
0,1,1/1/2025,2,0,1
1,2,1/2/2025,3,0,1
2,3,1/3/2025,4,0,1


In [9]:
ds['day'] = ds['sale_date'].dt.day
ds['year'] = ds['sale_date'].dt.year
ds

Unnamed: 0,id,sale_date,weekday,is_weekend,month,day,year
0,1,1/1/2025,2,0,1,1,2025
1,2,1/2/2025,3,0,1,2,2025
2,3,1/3/2025,4,0,1,3,2025


In [10]:
from datetime import datetime

In [11]:
d1 = datetime(2025,1,1)
d2 = datetime(2025,1,2)
s = tx.Series([d1, d2])
s.dt.day.data

[1, 2]

In [4]:
ds['dayofweek'] = ds['sale_date'].dt.dayofweek
ds

Unnamed: 0,id,sale_date,dayofweek
0,1,1/1/2025,2
1,2,1/2/2025,3
2,3,1/3/2025,4
