In [380]:
import gzip
import json
import os
import time
import pandas as pd
from tqdm import tqdm
import glob

In [387]:
# Specify the directory containing .gz files
directory_path = 'C:\\Users\\alect\\Documents\\Python\\Fireball\\Data\\extracted\\anonymized\\data\\*.gz'

# Use glob to find all .gz files in the directory. glob returns matches in
# arbitrary order, so the list is sorted to keep the recorded file indices
# reproducible from one run to the next.
gz_files = sorted(glob.glob(directory_path))

# Select a slice of files if desired
# gz_files = gz_files[:1000]


def _read_events(gz_path):
    """Reads a gzipped JSON-lines file and returns its records as a list.

    Each non-blank line of the file is parsed with json.loads. Blank lines
    (e.g. a trailing newline) are skipped instead of raising a decode error.
    """
    events = []
    with gzip.open(gz_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                events.append(json.loads(line))
    return events


def _total_level(entity):
    """Returns entity['levels']['total_level'], or None if it is unavailable."""
    levels = entity.get('levels')
    if not isinstance(levels, dict):
        return None
    return levels.get('total_level')


def _extract_combat_features(events):
    """Computes the party / monster / length features for a single combat.

    Parameters:
        events: list of event records (dicts) read from one combat file.

    Returns:
        A dict with the keys 'party_size', 'party_avg_level',
        'monster_party_size', 'monster_avg_level' and 'combat_length', or
        None when the combat should be skipped: it never reached a
        'combat_end' event, it has no identifiable monster or character, or
        no combat state update recorded a round number.
    """
    events = [event for event in events if isinstance(event, dict)]

    # Checks if the combat ended, skipping this file if not.
    if not any(event.get('event_type') == 'combat_end' for event in events):
        return None

    # Sets are used for monsters and characters so each instance counts once.
    characters = set()
    monsters = set()

    # Reset for every combat. Without this, a file with no state updates
    # would silently reuse the round count of the previously processed file.
    num_rounds = None

    for event in events:
        # Casters (whoever performed some action) carry level info for the
        # players, which the combatant lists in the state updates do not.
        caster = event.get('caster')
        if isinstance(caster, dict):
            level = _total_level(caster)
            # Entries without a usable level are skipped; a None level would
            # otherwise break the averages below. Level 0 is a valid value.
            if level is not None:
                if caster.get('monster_name'):
                    monsters.add((caster.get('name'),
                                  caster.get('monster_name'), level))
                else:
                    characters.add((caster.get('name'), level))

        if event.get('event_type') != 'combat_state_update':
            continue

        state = event.get('data')
        if not isinstance(state, dict):
            continue

        # The last recorded round wins, i.e. the final round of the combat.
        if state.get('round') is not None:
            num_rounds = state['round']

        # Picks up monsters that never acted and so never appeared as casters.
        # A combatant with level info in this list is treated as a monster.
        for combatant in state.get('combatants') or []:
            if not isinstance(combatant, dict):
                continue
            level = _total_level(combatant)
            if level is not None:
                monsters.add((combatant.get('name'),
                              combatant.get('monster_name'), level))

    # Check that at least one monster and character exists, and that the
    # combat length is actually known.
    if not monsters or not characters or num_rounds is None:
        return None

    return {
        'party_size': len(characters),
        'party_avg_level': sum(level for _, level in characters) / len(characters),
        'monster_party_size': len(monsters),
        'monster_avg_level': sum(level for _, _, level in monsters) / len(monsters),
        # Could alter this to include some sort of fractional part?
        'combat_length': num_rounds,
    }


# Initializes feature lists
indices = []
party_sizes = []
party_avg_levels = []
monster_sizes = []
monster_avg_levels = []
combat_lengths = []

# Loops through files. enumerate supplies the position of each file directly,
# avoiding a linear list search on every iteration.
for index, gz_file in enumerate(tqdm(gz_files, desc="Processing files")):
    combat = _extract_combat_features(_read_events(gz_file))

    if combat is None:
        continue

    party_sizes.append(combat['party_size'])
    party_avg_levels.append(combat['party_avg_level'])
    monster_sizes.append(combat['monster_party_size'])
    monster_avg_levels.append(combat['monster_avg_level'])
    combat_lengths.append(combat['combat_length'])

    # Appends the index, so we can investigate anomalous rows and troubleshoot.
    indices.append(index)

# Creates the dataframe of extracted features using the feature lists.
features = {'index': indices, 'party_size': party_sizes,
            'party_avg_level': party_avg_levels,
            'monster_party_size': monster_sizes,
            'monster_avg_level': monster_avg_levels,
            'combat_length': combat_lengths}

combat_info = pd.DataFrame(features)

Processing files: 100%|██████████| 24748/24748 [24:30<00:00, 16.83it/s]  


       index  party_size  party_avg_level  monster_party_size  monster_avg_level  combat_length
0          1           1        10.000000                   1           4.000000              1
1          2           1        10.000000                   1           0.000000              0
2          3           1        10.000000                  11           0.136364              1
3          9           2         9.000000                   1           5.000000              0
4         13           1         6.000000                   4           0.125000              4
5         14           1         5.000000                   3           0.833333              2
6         15           3         9.000000                   1           4.000000              4
7         18           9         7.555556                   5           1.200000              6
8         19           5         6.800000                  15           0.600000             13
9         21           9        11.33333

In [389]:
# Persists the extracted combat features to disk as a CSV file.
output_csv_path = 'C:\\Users\\alect\\Documents\\Python\\Fireball\\combat_info.csv'
combat_info.to_csv(output_csv_path)