In [1]:
%load_ext autoreload
%autoreload 2


In [1]:
from atrax import Atrax as tx
import random

ModuleNotFoundError: No module named 'core'

#### fake data

In [3]:
items = [
    {
        'transaction_id':1,
        'product_code': '4011',
        'product_description': 'Bananas',
        'qty': 5,
        'sales': 10.00
    },
    {
        'transaction_id':1,
        'product_code': '1',
        'product_description': 'Bread',
        'qty': 5,
        'sales': 10.00
    },
    {
        'transaction_id':1,
        'product_code': '2',
        'product_description': 'Milk',
        'qty': 5,
        'sales': 10.00
    },
    {
        'transaction_id':1,
        'product_code': '3',
        'product_description': 'Cookies',
        'qty': 5,
        'sales': 10.00
    },  
    {
        'transaction_id':1,
        'product_code': '4',
        'product_description': 'Hot Dogs',
        'qty': 5,
        'sales': 10.00
    }, 
    {
        'transaction_id':1,
        'product_code': '5',
        'product_description': 'HD Buns',
        'qty': 5,
        'sales': 10.00
    }, 
    {
        'transaction_id':1,
        'product_code': '6',
        'product_description': 'Ketchup',
        'qty': 5,
        'sales': 10.00
    }, 
    {
        'transaction_id':1,
        'product_code': '7',
        'product_description': 'Mustard',
        'qty': 5,
        'sales': 10.00
    },  
    {
        'transaction_id':1,
        'product_code': '8',
        'product_description': 'Pickles',
        'qty': 5,
        'sales': 10.00
    },    
    {
        'transaction_id':1,
        'product_code': '9',
        'product_description': 'Relish',
        'qty': 5,
        'sales': 10.00
    },  
    {
        'transaction_id':1,
        'product_code': '10',
        'product_description': 'HB Buns',
        'qty': 5,
        'sales': 10.00
    },    
    {
        'transaction_id':1,
        'product_code': '11',
        'product_description': 'Hamburgers',
        'qty': 5,
        'sales': 10.00
    },   
    {
        'transaction_id':1,
        'product_code': '12',
        'product_description': 'Cheese',
        'qty': 5,
        'sales': 10.00
    }, 
    {
        'transaction_id':1,
        'product_code': '13',
        'product_description': 'Beer',
        'qty': 5,
        'sales': 10.00
    },    
    {
        'transaction_id':1,
        'product_code': '14',
        'product_description': 'Wine',
        'qty': 5,
        'sales': 10.00
    },   
    {
        'transaction_id':1,
        'product_code': '15',
        'product_description': 'Diapers',
        'qty': 5,
        'sales': 10.00
    },                                    

]

#### generate transactions

In [4]:
def generate_transactions(base_items, dtc=10, max_items=8, mqpi=10, minia=.99, maxia=5.99):
    """Generate a random set of transactions using a base dataset of items to choose from.
    
    Args:
        base_items (list): list containing the base dataset of items.
        dtc (int): Desired Transaction Count - Number of transactions to generate.
        max_items (int): Maximum number of items per transaction.
        mqpi (int): Maximum quantity per item.
        minia (float): Minimum item price for each item.
        maxia (float): Maximum item price for each item.

    """

    transaction_list = []
    for tx_id in range(1, dtc + 1):
        num_items = random.randint(1,max_items)

        for _ in range(num_items):
            item = random.choice(base_items)
            qty = random.randint(1, mqpi)
            price_per_unit = round(random.uniform(minia, maxia), 2)
            sales = round(qty * price_per_unit, 2)

            transaction_list.append({
                'transaction_id': tx_id,
                'product_code': item['product_code'],
                'product_description': item['product_description'],
                'qty': qty,
                'price_per_unit': price_per_unit,
                'sales': sales
            })
    return tx.DataSet(transaction_list)

### cut

- What it does: Unlike qcut, which ensures equal-sized quantile groups, cut splits the range into equal-width intervals.
- Cool use: Helps in scenarios where absolute value ranges matter more than distribution.

In [5]:
from atrax.core.cut import cut

💹 #### Example 1: Segmenting customers by age range

In [4]:
ages = [19, 23, 37, 45, 50, 61, 70, 82]
age_bins = [0, 20, 50, 100]
age_labels = ['young', 'middle-ages', 'senior']

In [6]:
cut(ages, bins=age_bins, labels=age_labels, tie_breaker='upper')

['young',
 'middle-ages',
 'middle-ages',
 'middle-ages',
 'senior',
 'senior',
 'senior',
 'senior']

💹 #### Example 2: Equal-width binning for normalization prep

This is good for histogram prep without needing scaling

In [7]:
sales = [0, 20, 50, 75, 110, 130, 170, 200]
binned_sales = cut(sales, bins=4)
binned_sales

[0, 0, 1, 1, 2, 2, 3, 3]

💹 #### Example 3: Risk Level Classification

In [8]:
cholesterol = [120, 140, 160, 190, 210, 250]
risk_bins = [0, 160, 20, 300]
risk_labels = ['Low', 'Moderate', 'High']
cut(cholesterol, bins=risk_bins, labels=risk_labels, tie_breaker='lower')

['Low', 'Low', 'Low', 'High', 'High', 'High']

#### cut inside of Series

In [10]:
s = tx.Series([19, 23, 37, 45, 50, 61, 70, 82], name="age")
bins = [0, 30, 50, 100]
labels = ['young', 'middle-ages', 'senior']
s_cut = s.cut(bins=bins, labels=labels, tie_breaker='upper')
s_cut

0,1
0,young
1,young
2,middle-ages
3,middle-ages
4,senior
5,senior
6,senior
7,senior
"Name: age, dtype: object","Name: age, dtype: object"


In [6]:
transactions = generate_transactions(items, dtc=50)

In [9]:
transactions.info()

<class 'atrax.Atrax'>
columns (total 6):
total rows: 225
Column          | Type       | Non-Null   | Total     
--------------------------------------------------
transaction_id  | int        | 225        | 225
product_code    | str        | 225        | 225
product_description | str        | 225        | 225
qty             | int        | 225        | 225
price_per_unit  | float      | 225        | 225
sales           | float      | 225        | 225


In [10]:

transactions.head()

transaction_id,product_code,product_description,qty,price_per_unit,sales
1,15,Diapers,1,1.46,1.46
2,15,Diapers,10,1.7,17.0
3,7,Mustard,8,4.28,34.24
3,12,Cheese,5,3.98,19.9
3,9,Relish,9,5.06,45.54


In [11]:
def tame_sum(series):
    return round(sum([v if v is not None else 0 for v in series]), 2)

In [50]:
g1 = transactions.groupby(by=['product_code']).agg({
    'product_description': 'first',
    'qty': 'sum',
    'sales': tame_sum,
})

In [51]:
# group by product_code and sum the sales and qty
g2 = transactions.groupby('product_code').agg(
    description= ('product_description', 'first'),
    qty= ('qty', 'sum'),
    sales= ('sales', tame_sum),
    max_sales=('sales', 'max'),
    min_sales=('sales', 'min'),
    mean_sales=('sales', 'mean'),    
)

In [55]:
g2['segment'] = cut(
    g2['sales'], 
    bins=4, 
    labels=['Poor Mover', 'Average Mover', 'Decent Mover', 'Excellent Mover'], 
    tie_breaker='upper')


In [56]:
g2.sort(by='sales', ascending=False)

description,qty,sales,max_sales,min_sales,mean_sales,product_code,segment
Wine,103,447.98,49.23,2.52,26.351764705882356,14,Excellent Mover
Beer,121,400.77,39.48,2.32,18.216818181818184,13,Excellent Mover
Relish,84,369.06,46.17,16.8,30.755,9,Decent Mover
Cheese,104,363.98,46.32,5.06,21.41058823529412,12,Decent Mover
Milk,103,322.22,40.5,2.82,24.786153846153844,2,Decent Mover
Pickles,83,320.46,56.9,3.33,24.650769230769235,8,Decent Mover
Mustard,99,300.42,45.5,1.83,15.021,7,Average Mover
Bread,83,273.51,58.6,4.76,19.53642857142857,1,Average Mover
Hamburgers,85,267.36,45.6,4.18,20.56615384615385,11,Average Mover
HB Buns,53,263.68,57.2,8.88,32.96,10,Average Mover


### qcut

In [35]:
from atrax.core.qcut import qcut

In [36]:
data = [1, 2, 3, 4, 5, 6, 7, 8]
result = qcut(data, q=4)
result

[0, 0, 0, 1, 1, 2, 2, 3]

#### rank

In [57]:
s = tx.Series([50, 20, 20, 100])

s.rank(method='average')

0,1
0,3.0
1,1.5
2,1.5
3,4.0
"Name: _rank, dtype: float","Name: _rank, dtype: float"


In [58]:
s.rank(method='min')

0,1
0,3
1,1
2,1
3,4
"Name: _rank, dtype: int","Name: _rank, dtype: int"


In [59]:
s.rank(method='max')

0,1
0,3
1,2
2,2
3,4
"Name: _rank, dtype: int","Name: _rank, dtype: int"


In [60]:
s.rank(method='first')

0,1
0,3
1,1
2,2
3,4
"Name: _rank, dtype: int","Name: _rank, dtype: int"


In [61]:
s.rank(method='dense')

0,1
0,2
1,1
2,1
3,3
"Name: _rank, dtype: int","Name: _rank, dtype: int"


#### map

In [75]:
s = tx.Series([1,2,3,4], name="nums")
squared = s.map(lambda x: x**2)
squared

0,1
0,1
1,4
2,9
3,16
"Name: nums_mapped, dtype: int","Name: nums_mapped, dtype: int"


In [76]:
labels = s.map({1: 'A', 2: 'B', 3: 'C', 4: 'D'})
labels

0,1
0,A
1,B
2,C
3,D
"Name: nums_mapped, dtype: object","Name: nums_mapped, dtype: object"


In [77]:
s = tx.Series([1, 2, 3], name="x")
result = s.map(lambda x: x * 10)
result

0,1
0,10
1,20
2,30
"Name: x_mapped, dtype: int","Name: x_mapped, dtype: int"


#### quantile

In [80]:
s = tx.Series([10, 20, 30, 40, 50])
s.quantile(q=0.5)
s.percentile(p=25)

20.0