In [1]:
# Data processing
import pandas as pd
import numpy as np
import functools

# Set pandas to show all columns in the dataframes
pd.set_option("display.max_columns", None)

In [2]:
cols = ["Name", "Value", "Start", "Size"]
data = pd.read_csv("outputs/SAVEDATA1000.csv", dtype=str, usecols=cols)
data

Unnamed: 0,Name,Value,Start,Size
0,struct mhw_ib_save mhw_save,,0h,AC30E0h
1,struct mhw_save_header header,,0h,40h
2,u32 magic,1,0h,4h
3,u32 unknown0,538379545,4h,4h
4,u32 unknown1,32,8h,4h
...,...,...,...,...
4687083,u64 zero[215540],0,AC30B8h,8h
4687084,u64 zero[215541],0,AC30C0h,8h
4687085,u64 zero[215542],0,AC30C8h,8h
4687086,u64 zero[215543],0,AC30D0h,8h


In [3]:
def hex2int(x : str, default = -1):
    return int(str(x)[0:-1], 16)
def int2hex(x : int):
    return f"{x:X}h"


data["Start"] = data["Start"].apply(hex2int)
data["Size"] = data["Size"].apply(hex2int)
data

Unnamed: 0,Name,Value,Start,Size
0,struct mhw_ib_save mhw_save,,0,11284704
1,struct mhw_save_header header,,0,64
2,u32 magic,1,0,4
3,u32 unknown0,538379545,4,4
4,u32 unknown1,32,8,4
...,...,...,...,...
4687083,u64 zero[215540],0,11284664,8
4687084,u64 zero[215541],0,11284672,8
4687085,u64 zero[215542],0,11284680,8
4687086,u64 zero[215543],0,11284688,8


In [4]:
# Remove overarching structures 
data["Remove"] = data["Start"] + data["Size"] > data["Start"].shift(-1)
data.drop(data[data["Remove"]].index, inplace=True)
data.drop(["Remove"], axis=1, inplace=True)

In [5]:
def intdef(x : str, default = None):
    try: return int(x)
    except: return default

# Get whether the structure is unknown or known as zero
data["Unknown"] = data["Name"].str.contains(r"unknown|data\[")
data["Zero"] = data["Name"].str.contains("zero")
unknown_fill=data["Unknown"].iloc[-1]

# Get all blocks of consecutive values
data["Value"] = data["Value"].apply(intdef).astype("float")
data["Group"] = (data["Unknown"] == data["Unknown"].shift(1, fill_value=unknown_fill)
                 & (data["Value"].diff(1) != 0)).cumsum()
data["Consecutive"] = data.groupby("Group")["Group"].transform("size")

# Get sub-dataframes, to work with
unknown = data[data["Unknown"]]
zero = data[data["Zero"]]
data

Unnamed: 0,Name,Value,Start,Size,Unknown,Zero,Group,Consecutive
2,u32 magic,1.0,0,4,False,False,1,2
3,u32 unknown0,538379545.0,4,4,True,False,1,2
4,u32 unknown1,32.0,8,4,True,False,2,2
6,u8 hash[0],9.0,12,1,False,False,2,2
7,u8 hash[1],195.0,13,1,False,False,3,1
...,...,...,...,...,...,...,...,...
4687083,u64 zero[215540],0.0,11284664,8,False,True,3502117,1
4687084,u64 zero[215541],0.0,11284672,8,False,True,3502118,1
4687085,u64 zero[215542],0.0,11284680,8,False,True,3502119,1
4687086,u64 zero[215543],0.0,11284688,8,False,True,3502120,1


In [6]:
# Check for consecutive unknown values
min_consecutive = 10000

unknown_groups = pd.DataFrame(
    np.array([
       [v["Group"].iloc[0], v["Value"].iloc[0], v["Start"].iloc[0], v["Start"].iloc[-1] + v["Size"].iloc[-1], v["Consecutive"].iloc[0]]
       for k, v in unknown[unknown["Consecutive"] >= min_consecutive].groupby("Group")
    ]),
    columns=["Group", "Value", "Start", "End", "Consecutive"],
    dtype=int
)
unknown_groups["Length"] = unknown_groups["End"] - unknown_groups["Start"]
unknown_groups["Start"] = unknown_groups["Start"].apply(int2hex)
unknown_groups["End"] = unknown_groups["End"].apply(int2hex)

print(unknown_groups.shape)
unknown_groups

(6, 6)


Unnamed: 0,Group,Value,Start,End,Consecutive,Length
0,1381413,0,4EC46Bh,4F39F7h,30085,30092
1,1381415,0,4F3B87h,5072FAh,79731,79731
2,2333710,0,6F5F2Bh,6FD4B7h,30085,30092
3,2333712,0,6FD647h,710DBAh,79731,79731
4,3285556,0,8FF9EBh,906F77h,30085,30092
5,3285558,0,907107h,91A87Ah,79731,79731


In [7]:
unknown_size = unknown["Size"].sum()
zero_size = zero["Size"].sum()
data_size = data["Size"].sum()
total_mapped_size = data_size - unknown_size;
nonzero_mapped_size = data_size - zero_size - unknown_size
print(f"Unknown: {unknown_size}/{data_size} = {unknown_size/data_size:.8%}")
print(f"Zero: {zero_size}/{data_size} = {zero_size/data_size:.8%}")
print(f"Nonzero Mapped: {nonzero_mapped_size}/{data_size} = {nonzero_mapped_size/data_size:.8%}")
print(f"Total Mapped: {total_mapped_size}/{data_size} = {total_mapped_size/data_size:.8%}")

Unknown: 1250523/11284704 = 11.08157556%
Zero: 4845890/11284704 = 42.94210996%
Nonzero Mapped: 5188291/11284704 = 45.97631449%
Total Mapped: 10034181/11284704 = 88.91842444%
