In [12]:
import pandas as pd

df = pd.read_csv("../data/raw/players.csv")

print(df.head())
print(df.columns)



   player_id  timestamp   action
0          1          1     move
1          1          2     quit
2          2          1  explore
3          2          2     loot
4          2          3    fight
Index(['player_id', 'timestamp', 'action'], dtype='object')


In [4]:
df = df.sort_values(by=["player_id", "timestamp"])


In [5]:
player_sequences = (
    df.groupby("player_id")["action"]
      .apply(list)
)

player_sequences.head()


player_id
1                                         [move, quit]
2    [explore, loot, fight, explore, explore, move,...
3                             [loot, loot, move, quit]
4                                          [move, die]
5    [explore, explore, move, explore, explore, fig...
Name: action, dtype: object

In [14]:
ACTIONS = [
    "move", "shoot", "hide", "reload",
    "explore", "loot", "fight",
    "die", "win", "quit"
]

ACTION_TO_ID = {action: idx + 1 for idx, action in enumerate(ACTIONS)}
ID_TO_ACTION = {idx + 1: action for idx, action in enumerate(ACTIONS)}

PAD_TOKEN = 0


In [15]:
encoded_sequences = player_sequences.apply(
    lambda seq: [ACTION_TO_ID[a] for a in seq]
)

print(encoded_sequences.head())


player_id
1                                       [1, 10]
2       [5, 6, 7, 5, 5, 1, 7, 5, 5, 7, 5, 7, 9]
3                                 [6, 6, 1, 10]
4                                        [1, 8]
5    [5, 5, 1, 5, 5, 7, 5, 6, 5, 1, 6, 7, 6, 9]
Name: action, dtype: object


In [16]:
MAX_LEN = 20
num_players = len(encoded_sequences)


In [9]:
import numpy as np

padded_sequences = np.zeros((len(encoded_sequences), MAX_LEN), dtype=int)

for i, seq in enumerate(encoded_sequences):
    length = min(len(seq), MAX_LEN)
    padded_sequences[i, :length] = seq[:length]


In [17]:
mask = (padded_sequences != PAD_TOKEN).astype(np.int64)


In [18]:
print("Padded shape:", padded_sequences.shape)
print("Mask shape:", mask.shape)

print("Sample padded sequence:", padded_sequences[0])
print("Sample mask:", mask[0])


Padded shape: (100, 20)
Mask shape: (100, 20)
Sample padded sequence: [0 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Sample mask: [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [19]:
np.save("../data/processed/sequences.npy", padded_sequences)
np.save("../data/processed/masks.npy", mask)

print("Saved sequences.npy and masks.npy")


Saved sequences.npy and masks.npy
