In [None]:
import json
import pandas as pd
from pathlib import Path

import json
import pandas as pd
from pathlib import Path

def stream_jsonl_to_parquet(jsonl_path, output_folder, batch_size=10000):
    buffer = []
    batch_num = 0

    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    print(f"Processing {jsonl_path}...")

    with open(jsonl_path, "r") as f:
        for i, line in enumerate(f):
            try:
                obj = json.loads(line)
                if obj.get("label") in [0, 1]:
                    buffer.append(obj)

                if len(buffer) >= batch_size:
                    df = pd.DataFrame(buffer)
                    output_path = output_folder / f"batch_{batch_num}.parquet"
                    df.to_parquet(output_path, index=False, engine="pyarrow", compression='snappy')
                    print(f"  Saved batch {batch_num} with {len(df)} rows to {output_path}")
                    buffer.clear()
                    batch_num += 1

                    break

            except json.JSONDecodeError as e:
                print(f"  Skipping line {i} due to JSON error: {e}")

    # Save any leftover rows
    if buffer:
        df = pd.DataFrame(buffer)
        output_path = output_folder / f"batch_{batch_num}.parquet"
        df.to_parquet(output_path, index=False, engine="pyarrow", compression='snappy')
        print(f"  Saved final batch {batch_num} with {len(df)} rows to {output_path}")

    print(f"Finished processing {jsonl_path}")


# Open and process the JSONL file
input_file = "F://ember//train_features_1.jsonl"
output_dir = "F://ember//parquet_batches"

stream_jsonl_to_parquet(input_file, output_dir)




In [None]:
#df = pd.read_parquet("F://ember2018//parquet_batches_0//batch_0.parquet")
df = pd.read_parquet("F://ember//parquet_batches//batch_0.parquet")
row = df.iloc[0]

print(df.columns.tolist())
print(row["header"]["optional"].keys())

In [None]:
import matplotlib.pyplot as plt

df["label"].value_counts().plot(kind="bar", title="Malware vs Benign Distribution")
plt.xticks(ticks=[0, 1], labels=["Benign (0)", "Malware (1)"])
plt.ylabel("Count")
plt.show()

In [None]:
print(row["general"].keys())

In [None]:
input_file = "F://ember2018//train_features_1.jsonl"
output_dir = "F://ember2018//parquet_batches_1"

stream_jsonl_to_parquet(input_file, output_dir)

In [None]:
import matplotlib.pyplot as plt


df = pd.read_parquet("F://ember2018//parquet_batches_1//batch_0.parquet")
df["label"].value_counts().plot(kind="bar", title="Malware vs Benign Distribution")
plt.xticks(ticks=[0, 1], labels=["Benign (0)", "Malware (1)"])
plt.ylabel("Count")
plt.show()


In [None]:
import lief
import numpy as np
from feature_extractors.general_info import GeneralFileInfo
from feature_extractors.header_info import HeaderFileInfo
from feature_extractors.byte_entropy_histogram import ByteEntropyHistogram
from feature_extractors.imports import ImportsInfo
from feature_extractors.sections import SectionInfo


# Replace with your actual path to an .exe or .dll
#pe_path = "C://Users//Alisa//Desktop//Unity app//CatKyu!! v3//CatKyu!!.exe"
pe_path = "E://SteamLibrary//steamapps//common//Blasphemous 2//Blasphemous 2.exe"
# Read raw bytes
with open(pe_path, "rb") as f:
    bytez = f.read()

# Try to parse with LIEF
try:
    lief_binary = lief.PE.parse(list(bytez))  # turn bytez into a list for LIEF
except Exception as e:
    print(f"Failed to parse with LIEF: {e}")
    lief_binary = None


# Initialize your feature extractor
feat_extractor = GeneralFileInfo()
head_extractor = HeaderFileInfo()
entro = ByteEntropyHistogram()
imports_extractor = ImportsInfo()
sections_extractor = SectionInfo()

# Get raw and processed features
raw = feat_extractor.raw_features(bytez, lief_binary)
vector = feat_extractor.process_raw_features(raw)

# Print results
print("Raw features:")
print(raw)

print("\nProcessed vector:")
print(vector)
print("Shape:", vector.shape)


raw = head_extractor.raw_features(bytez, lief_binary)
vector = head_extractor.process_raw_features(raw)

print("Raw features:")
print(raw)

print("\nProcessed vector:")
print(vector)
print("Shape:", vector.shape)

raw = entro.raw_features(bytez, lief_binary)
vector = entro.process_raw_features(raw)

print("Raw features:")
print(raw)

print("\nProcessed vector:")
print(vector)
print("Shape:", vector.shape)

raw = imports_extractor.raw_features(bytez, lief_binary)
vector = imports_extractor.process_raw_features(raw)

print("Raw features:")
print(raw)

print("\nProcessed vector:")
print(vector)
print("Shape:", vector.shape)

raw = sections_extractor.raw_features(bytez, lief_binary)
vector = sections_extractor.process_raw_features(raw)

print("Raw features:")
print(raw)

print("\nProcessed vector:")
print(vector)
print("Shape:", vector.shape)


In [1]:
from feature_extractors.pe_feature import PEFeatureExtractor


pe_path = "E://SteamLibrary//steamapps//common//Blasphemous 2//Blasphemous 2.exe"
 # Assuming your class is saved in this file

extractor = PEFeatureExtractor()

# Load a PE (.exe) file
with open(pe_path, "rb") as f:
    bytez = f.read()

# Extract the raw feature dict
raw = extractor.raw_features(bytez)

# Turn it into a flat vector (normalized)
vector = extractor.process_raw_features(raw)

print("Feature vector shape:", vector.shape)
print("SHA256 of file:", raw['sha256'])
print("First 10 values of vector:", vector[:100])


Feature vector shape: (2351,)
SHA256 of file: 5da46b251971f2ce199029648370a085e9f96c590d1383efc8b12365e6f5fcb0
First 10 values of vector: [0.23798606 0.04504423 0.03002949 0.02433224 0.02012315 0.01815932
 0.01617102 0.01413224 0.01381717 0.01223724 0.01147098 0.01051506
 0.00966009 0.00922114 0.00881277 0.00958515 0.00900854 0.00795474
 0.00724507 0.00740872 0.00726036 0.00705695 0.0061928  0.00595726
 0.00618362 0.00573549 0.00553207 0.00503805 0.00480404 0.00458533
 0.00423508 0.00406226 0.00530418 0.00374413 0.00374872 0.00352388
 0.00456392 0.00317975 0.00287233 0.00278821 0.00302069 0.00250679
 0.00230796 0.0024242  0.00222537 0.00284633 0.00224066 0.00203266
 0.00238902 0.00192407 0.00185677 0.00226666 0.00183689 0.001687
 0.00168088 0.00153864 0.00190418 0.00167323 0.00164876 0.00208925
 0.00159217 0.00156311 0.00187359 0.00216878 0.00305281 0.00212595
 0.00135816 0.00142546 0.001843   0.00165335 0.00121286 0.0012404
 0.00660575 0.0018583  0.00109968 0.00120522 0.0019623  0.001

In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from feature_extractors.pe_feature import PEFeatureExtractor
df = pd.read_parquet("F://ember//parquet_batches//batch_0.parquet")

    # Create the extractor
extractor = PEFeatureExtractor()

feature_vectors = []
vectors = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        # row is a Series (like a dict), compatible with extractor
        vector = extractor.process_raw_features(row)
        vectors.append(vector)
    except Exception as e:
        print(f"Error processing row: {e}")
        vectors.append(None)  # or skip

# Filter out any None values (optional)
vectors = [v for v in vectors if v is not None]
idk = np.array(vectors, dtype=np.float32)
print(idk.shape)

In [None]:
print(df.columns.tolist())
print(df.iloc[0])

In [1]:
import multiprocessing
import os
import tqdm 
import numpy as np
import pandas as pd
import json
from feature_extractors.pe_feature import PEFeatureExtractor


def raw_feature_iterator(file_paths):
    """
    Yield raw feature strings from the inputed file paths
    """
    for path in file_paths:
        with open(path, "r") as fin:
            for line in fin:
                try:
                    obj = json.loads(line)
                    if obj.get("label") in (0, 1):
                        yield line
                except Exception as e:
                    print(f"Skipping invalid line: {e}")

def count_filtered_lines(file_paths):
    count = 0
    for _ in raw_feature_iterator(file_paths):
        count += 1
    return count

def write_filtered_features(raw_feature_paths, output_path):
    with open(output_path, "w") as fout:
        for fp in raw_feature_paths:
            with open(fp, "r") as fin:
                for line in fin:
                    try:
                        obj = json.loads(line)
                        if obj.get("label") in (0, 1):
                            fout.write(line)
                    except Exception as e:
                        print(f"Skipping invalid line: {e}")


def vectorize(irow, raw_features_string, X_path, y_path, extractor, nrows):
    """
    Vectorize a single sample of raw features and write to a large numpy file
    """
    raw_features = json.loads(raw_features_string)
    feature_vector = extractor.process_raw_features(raw_features)

    y = np.memmap(y_path, dtype=np.float32, mode="r+", shape=nrows)
    y[irow] = raw_features["label"]

    X = np.memmap(X_path, dtype=np.float32, mode="r+", shape=(nrows, extractor.dim))
    X[irow] = feature_vector





def vectorize_unpack(args):
    """
    Pass through function for unpacking vectorize arguments
    """
    return vectorize(*args)

def vectorize_subset(X_path, y_path, raw_feature_paths, extractor, nrows):
    """
    Vectorize a subset of data and write it to disk
    """
    # Create space on disk to write features to
    X = np.memmap(X_path, dtype=np.float32, mode="w+", shape=(nrows, extractor.dim))
    y = np.memmap(y_path, dtype=np.float32, mode="w+", shape=nrows)
    del X, y

    # Distribute the vectorization work
    pool = multiprocessing.Pool()
    argument_iterator = ((irow, raw_features_string, X_path, y_path, extractor, nrows)
                         for irow, raw_features_string in enumerate(raw_feature_iterator(raw_feature_paths)))
    for args in tqdm.tqdm(argument_iterator, total=nrows):
        vectorize_unpack(args)




def create_vectorized_features(data_dir):
    """
    Create feature vectors from raw features and write them to disk
    """
    extractor = PEFeatureExtractor()
    raw_feature_paths = [os.path.join(data_dir, f"output.jsonl") for i in range(1)]
    filtered_path = os.path.join(data_dir, "filtered_train.jsonl")

    # Only run this once if needed
    write_filtered_features(raw_feature_paths, filtered_path)

    # Use the filtered file
    nrows = sum(1 for _ in open(filtered_path)) 
    print("Vectorizing training set")
    X_path = os.path.join(data_dir, "X.dat")
    y_path = os.path.join(data_dir, "y.dat")
    #raw_feature_paths = [os.path.join(data_dir, "output.jsonl".format(i)) for i in range(1)]
    #nrows = count_filtered_lines(raw_feature_paths)
    vectorize_subset(X_path, y_path, raw_feature_paths, extractor, nrows)


def read_vectorized_features(data_dir, subset=None):
    """
    Read vectorized features into memory mapped numpy arrays
    """
    if subset is not None and subset not in ["train", "test"]:
        return None

    extractor = PEFeatureExtractor()
    ndim = extractor.dim
    X_train = None
    y_train = None
    X_test = None
    y_test = None

    if subset is None or subset == "train":
        X_train_path = os.path.join(data_dir, "X.dat")
        y_train_path = os.path.join(data_dir, "y.dat")
        y_train = np.memmap(y_train_path, dtype=np.float32, mode="r")
        N = y_train.shape[0]
        X_train = np.memmap(X_train_path, dtype=np.float32, mode="r", shape=(N, ndim))
        if subset == "train":
            return X_train, y_train

    
    return X_train, y_train


In [None]:
import feature_extractors
data_dir = "F://ember//idk"

feature_extractors.create_vectorized_features(data_dir)
X_train, y_train = feature_extractors.read_vectorized_features(data_dir)


Vectorizing training set


100%|██████████| 7019/7019 [00:09<00:00, 757.58it/s]


(7019, 2351)