A column of binary numbers can be treated as bitfields, where each bit acts as a binary flag representing a feature, and bitwise operations with bitmasks allow efficient querying and manipulation of these flags for analysis.

In [1]:
import pandas as pd

In [None]:
def decode_flags_from_flags_dict(bitfield: int, flags: dict[str, int]) -> list[str]:
    return [name for name, value in flags.items() if bitfield & value]

In [None]:
from enum import IntFlag

def decode_flags(bitfield: int, flags: IntFlag) -> list[str]:
    return [name for name, value in flags.__members__.items() if bitfield & value]

### spec

In [None]:
df = pd.read_csv("input/titanic.csv")
print(f"{df.shape}")
print(df.iloc[0, :].to_string())

(891, 15)
survived                 0
pclass                   3
sex                   male
age                   22.0
sibsp                    1
parch                    0
fare                  7.25
embarked                 S
class                Third
who                    man
adult_male            True
deck                   NaN
embark_town    Southampton
alive                   no
alone                False


### binary flag

In [5]:
# define bit masks
has_survived = 0b0001
is_alone = 0b0010
is_male = 0b0100
age_below_18 = 0b1000

In [6]:
flag = 0b0110

In [7]:
# & : Bitwise AND
print(f"check: {flag & has_survived = }")
print(f"check: {flag & is_alone = }")
print(f"check: {flag & is_male = }")
print(f"check: {flag & age_below_18 = }")

check: flag & has_survived = 0
check: flag & is_alone = 2
check: flag & is_male = 4
check: flag & age_below_18 = 0


In [8]:
# | : Bitwise OR, set
bin(flag | has_survived)

'0b111'

In [9]:
# ~ : Bitwise NOT (inverts all bits)
bin(flag & ~0b0010)

'0b100'

In [10]:
bin(~0b0101)# display as negative binary

'-0b110'

In [11]:
# ^ : Bitwise XOR (exclusive OR), flips bits where mask is 1
bin(flag ^ 0b0100)

'0b10'

In [12]:
# << : Left shift
bin(flag << 1)

'0b1100'

In [13]:
bin(1 << 2)

'0b100'

In [14]:
# >> : Right shift
bin(flag >> 1)

'0b11'

In [15]:
bin(0b100 >> 2)

'0b1'

In [16]:
# operator precedence
# https://docs.python.org/3/reference/expressions.html#operator-precedence

# basics
# expo: **
# sign, bitnot: +x, -x, ~
# mul, divide: *, /, //, %
# plus minus: +, -
# bit shift, and, xor, or: <<, >>, &, ^, |
# comparison, inequality: in, not in, is, is not, <, >
# boolean: not, and, or

# basics for bitwise
# ~
# *, /, +, -
# shift, &, |
# comparison, inequality

In [17]:
bin(0b101 + 0b1 & 0b111 | 0b100 > 0b110)

'0b0'

### df

In [18]:
# define mapping
from enum import IntFlag

class Flags(IntFlag):
    HAS_SURVIVED = 1 << 0
    IS_ALONE = 1 << 1
    IS_MALE = 1 << 2
    AGE_BELOW_18 = 1 << 3

for flag in Flags:
    print(f"{flag.name}: {flag.value}")

HAS_SURVIVED: 1
IS_ALONE: 2
IS_MALE: 4
AGE_BELOW_18: 8


In [19]:
list(Flags)

[<Flags.HAS_SURVIVED: 1>,
 <Flags.IS_ALONE: 2>,
 <Flags.IS_MALE: 4>,
 <Flags.AGE_BELOW_18: 8>]

In [52]:
type(Flags.HAS_SURVIVED)

<flag 'Flags'>

In [20]:
# flag construction
df["flags"] = 0
df["flags"] |= df["survived"].mul(Flags.HAS_SURVIVED)
df["flags"] |= df["alone"].astype(bool).mul(Flags.IS_ALONE)
df["flags"] |= df["sex"].map({"male": 1, "female": 0}).mul(Flags.IS_MALE)
df["flags"] |= df["age"].lt(18).mul(Flags.AGE_BELOW_18)

df["flags"].value_counts().sort_index()

flags
0      39
1     105
2      25
3      90
4      97
5      23
6     336
7      63
8      15
9      29
10      2
11      9
12     24
13     22
14     11
15      1
Name: count, dtype: int64

In [21]:
# check
df['has_survived'] = df["survived"]
df['is_alone'] = df["alone"]
df['is_male'] = df["sex"].map({"male": 1, "female": 0})
df['age_below_18'] = df["age"].lt(18)

In [22]:
# display by decoding
df["flags_bin_display"] = df["flags"].apply(lambda x: bin(x)[2:].zfill(4))
df["flags_decoded"] = df["flags"].apply(lambda x: decode_flags(x, Flags))
sdf = (
    df.assign(flags_decoded_str=lambda x: x["flags_decoded"].astype(str))
    .loc[:, ["flags", "flags_bin_display", "flags_decoded_str"]]
    .value_counts()
    .sort_index()
    .reset_index()
)
sdf

Unnamed: 0,flags,flags_bin_display,flags_decoded_str,count
0,0,0,[],39
1,1,1,['HAS_SURVIVED'],105
2,2,10,['IS_ALONE'],25
3,3,11,"['HAS_SURVIVED', 'IS_ALONE']",90
4,4,100,['IS_MALE'],97
5,5,101,"['HAS_SURVIVED', 'IS_MALE']",23
6,6,110,"['IS_ALONE', 'IS_MALE']",336
7,7,111,"['HAS_SURVIVED', 'IS_ALONE', 'IS_MALE']",63
8,8,1000,['AGE_BELOW_18'],15
9,9,1001,"['HAS_SURVIVED', 'AGE_BELOW_18']",29


In [None]:
# filtering

In [54]:
res = df.loc[lambda x: x['has_survived'] == 1,:].shape
res3 = df.loc[lambda x: x["flags"] & int(Flags.HAS_SURVIVED) > 0, :].shape
assert res == res3

In [55]:
res = df.loc[lambda x:(x['has_survived'] == 1) & (x['is_male'] == 1),:].shape
res2 = df.loc[
    lambda x: x["flags"] & int(Flags.HAS_SURVIVED | Flags.IS_MALE) == Flags.HAS_SURVIVED | Flags.IS_MALE,
    :,
].shape
assert (res == res2)

In [56]:
res = df.loc[lambda x: (x["has_survived"] == 1) | (x["is_male"] == 1), :].shape
res3 = df.loc[
    lambda x: x["flags"] & int(Flags.HAS_SURVIVED | Flags.IS_MALE) > 0,
    :,
].shape
assert (res == res3)

In [57]:
res = df.loc[lambda x: (x["has_survived"] == 0), :].shape
res3 = df.loc[lambda x: x["flags"] & int(Flags.HAS_SURVIVED) == 0, :].shape
assert (res == res3)

### flagging funcs and actions

In [45]:
def action_1(df, res_col, flag_col):
    mask = lambda x: (x["age"] > 50) & (x["sex"] == "female")
    df.loc[mask, res_col] = 1
    df[flag_col] = 0
    df.loc[mask, flag_col] = 1


def action_2(df, res_col, flag_col):
    mask = lambda x: (x["age"] < 18) & (x["sex"] == "male")
    df.loc[mask, res_col] = 0
    df[flag_col] = 0
    df.loc[mask, flag_col] = 1

In [None]:
actions_sequence = [action_1, action_2]
res_col = "survived_2"
flag_col_suffix = "flag_"

# Apply actions
df[res_col] = df["survived"]
for i, action in enumerate(actions_sequence):
    action(df, res_col=res_col, flag_col=flag_col_suffix + str(i))

df[["survived", "survived_2", "flag_0", "flag_1"]].value_counts().reset_index()

Unnamed: 0,survived,survived_2,flag_0,flag_1,count
0,0,0,0,0,513
1,1,1,0,0,303
2,0,0,0,1,35
3,1,0,0,1,23
4,1,1,1,0,16
5,0,1,1,0,1


In [2]:
# store states as decimals
import pandas as pd

# Sample data
df = pd.DataFrame({'transition': [123, 321, 456, 654, 111]})

# Parameters
stage = 1  # e.g. extract the second stage from the right (stage 1)

# Extract stage using string slicing
df[f'stage_{stage}'] = df['transition'].astype(str).str.zfill(stage + 1).str[::-1].str[stage].astype(int)

df.head()

Unnamed: 0,transition,stage_1
0,123,2
1,321,2
2,456,5
3,654,5
4,111,1


In [4]:
# 1 bitfield to represent stages of cats
import pandas as pd

df = pd.DataFrame({'transition': [243, 321, 456, 654, 111]})
max_stages = df['transition'].astype(str).str.len().max()
bits_per_stage = 4  # supports up to 16 categories per stage

# Extract digits per stage
digits = df['transition'].astype(str).str.zfill(max_stages).str[::-1].apply(list)

def encode_composite_mask(digits, bits_per_stage):
    mask = 0
    for i, d in enumerate(digits):
        mask |= (1 << int(d)) << (bits_per_stage * i)
    return mask

df['composite_mask'] = digits.apply(lambda x: encode_composite_mask(x, bits_per_stage))
df.head()

Unnamed: 0,transition,composite_mask
0,243,1288
1,321,2114
2,456,4672
3,654,16912
4,111,546


In [5]:
# Check if stage 1 has category 4
stage = 1
category = 4
bit = (1 << category) << (bits_per_stage * stage)
df[df['composite_mask'] & bit > 0]

# Check if stage 0 has category 3 or 6
stage = 0
bit = ((1 << 3) | (1 << 6)) << (bits_per_stage * stage)
df[df['composite_mask'] & bit > 0]

Unnamed: 0,transition,composite_mask
0,243,1288
1,321,2114
2,456,4672
