# In [7]:
import os
import json
import lief
import hashlib
from tqdm import tqdm  # Optional for progress bars
from feature_extractors.pe_feature import PEFeatureExtractor  # Import your feature extractor

def process_safe_executables(file_path, output_file):
    """Extract raw features from executables and append them as JSON lines.

    Args:
        file_path: Path to a single file, or to a directory. For a directory,
            only its direct entries whose names end in ``.exe`` or ``.dll``
            (case-insensitive) are processed, in sorted order.
        output_file: Path of the file opened in append mode; one
            ``json.dumps``-encoded feature record is written per line for
            every input that was processed successfully.

    Raises:
        ValueError: If ``file_path`` is neither an existing file nor an
            existing directory. The output file is not touched in that case.

    Inputs that cannot be read or parsed are reported on stdout and skipped;
    they do not abort the run.
    """
    # Resolve the inputs first so an invalid path fails before the output
    # file is created/opened.
    if os.path.isfile(file_path):
        files_to_process = [file_path]
    elif os.path.isdir(file_path):
        # os.listdir order is arbitrary; sort for reproducible output order.
        files_to_process = sorted(
            os.path.join(file_path, name)
            for name in os.listdir(file_path)
            if name.lower().endswith(('.exe', '.dll'))
        )
    else:
        raise ValueError(f"Path {file_path} is neither file nor directory")

    feature_extractor = PEFeatureExtractor()

    # Some LIEF releases do not expose `lief.bad_format`; looking it up once
    # here avoids an AttributeError being raised from inside the handler.
    bad_format = getattr(lief, "bad_format", None)

    processed = 0
    with open(output_file, 'a') as out_f:
        for exe_path in files_to_process:
            try:
                with open(exe_path, 'rb') as f:
                    bytez = f.read()

                # Extract features
                features = feature_extractor.raw_features(bytez)

                out_f.write(json.dumps(features) + '\n')
            except Exception as e:
                # Best-effort batch: report the failure and keep going.
                if bad_format is not None and isinstance(e, bad_format):
                    print(f"Skipping {exe_path}: Invalid PE format")
                else:
                    print(f"Error processing {exe_path}: {str(e)}")
            else:
                processed += 1

    print(f"Processed {processed} files, saved to {output_file}")


# Append the feature record of one locally installed executable to the dataset.
# Arguments are (file_path, output_file).
process_safe_executables(
    "C://Program Files (x86)//Steam//steamapps//common//Blasphemous 2//Blasphemous 2.exe",
    "new_dataset_2351.jsonl",
)

# Output: Processed  files, saved to new_dataset_2351.jsonl
