In [1]:
import os

import pandas as pd

# Read the raw players CSV into a DataFrame.
df = pd.read_csv("../data/raw/players.csv")

# Preview the first rows and the column index. A single print call with a
# newline separator emits the same text as printing each object on its own.
print(df.head(), df.columns, sep="\n")



   player_id  timestamp   action behavior_type
0          1          1      die       quitter
1          2          1    fight      explorer
2          2          2    fight      explorer
3          2          3  explore      explorer
4          2          4  explore      explorer
Index(['player_id', 'timestamp', 'action', 'behavior_type'], dtype='object')


In [2]:
# Order rows by player first, then by timestamp within each player (both
# ascending), so that later per-player grouping sees actions in time order.
df = df.sort_values(
    ["player_id", "timestamp"],
    ascending=True,
)


In [3]:
# Collect each player's actions into one Python list, indexed by player_id.
# The order inside each list follows the current row order of `df`.
player_sequences = df["action"].groupby(df["player_id"]).apply(list)

# Bare last expression: shows the first few per-player action lists.
player_sequences.head()


player_id
1                                                [die]
2    [fight, fight, explore, explore, move, move, m...
3                                   [move, shoot, die]
4                                                [win]
5                                                [die]
Name: action, dtype: object

In [4]:
# Id 0 is reserved for padding, so real action ids start at 1.
PAD_TOKEN = 0

# Action vocabulary; the position in this list (plus one) is the action's id.
ACTIONS = [
    "move",
    "shoot",
    "hide",
    "reload",
    "explore",
    "loot",
    "fight",
    "die",
    "win",
    "quit",
]

# Forward lookup (action name -> id) and its inverse (id -> action name).
ACTION_TO_ID = dict(zip(ACTIONS, range(1, len(ACTIONS) + 1)))
ID_TO_ACTION = {token: name for name, token in ACTION_TO_ID.items()}


In [5]:
# Translate every action name in each player's list to its integer id.
# An action missing from ACTION_TO_ID raises KeyError.
encoded_sequences = player_sequences.apply(
    lambda actions: list(map(ACTION_TO_ID.__getitem__, actions))
)

print(encoded_sequences.head())


player_id
1                                  [8]
2    [7, 7, 5, 5, 1, 1, 1, 6, 7, 5, 9]
3                            [1, 2, 8]
4                                  [9]
5                                  [8]
Name: action, dtype: object


In [6]:
# Number of per-player sequences (one entry per player_id).
num_players = encoded_sequences.shape[0]

# Fixed width that every sequence is padded or truncated to.
MAX_LEN = 20


In [7]:
import numpy as np

# Build a fixed-width (number of players, MAX_LEN) matrix of action ids.
# The matrix is pre-filled with PAD_TOKEN rather than a literal zero, because
# the mask is derived by comparing against PAD_TOKEN; filling with the same
# constant keeps the two in agreement if the pad value is ever changed.
# The dtype is pinned to int64 so the saved array matches the mask's dtype
# instead of depending on what the platform maps plain `int` to.
padded_sequences = np.full(
    (len(encoded_sequences), MAX_LEN), PAD_TOKEN, dtype=np.int64
)

for i, seq in enumerate(encoded_sequences):
    # Shorter sequences keep PAD_TOKEN on the right; sequences longer than
    # MAX_LEN are truncated to their first MAX_LEN actions.
    length = min(len(seq), MAX_LEN)
    padded_sequences[i, :length] = seq[:length]


In [8]:
# 1 where a position holds a real action id, 0 where it holds PAD_TOKEN.
mask = np.not_equal(padded_sequences, PAD_TOKEN).astype(np.int64)


In [9]:
# Sanity check: report both array shapes and show the first row of each.
# Each "label value" pair is printed on its own line.
print(
    *(
        f"{label} {value}"
        for label, value in (
            ("Padded shape:", padded_sequences.shape),
            ("Mask shape:", mask.shape),
            ("Sample padded sequence:", padded_sequences[0]),
            ("Sample mask:", mask[0]),
        )
    ),
    sep="\n",
)


Padded shape: (100, 20)
Mask shape: (100, 20)
Sample padded sequence: [8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Sample mask: [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [10]:
# np.save does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there). Without
# this, a fresh checkout with no processed-data folder fails here.
os.makedirs("../data/processed", exist_ok=True)

# Persist the padded id matrix and its mask as .npy files.
np.save("../data/processed/sequences.npy", padded_sequences)
np.save("../data/processed/masks.npy", mask)

print("Saved sequences.npy and masks.npy")


Saved sequences.npy and masks.npy
