Skip to content

hashdb 3.1.0 and bulk_extractor

Bruce Allen edited this page May 22, 2017 · 1 revision

bulk_extractor works with hashdb v3.0.0. For compatibility with hashdb v3.1.0, replace the file bulk_extractor/src/scan_hashdb.cpp with the following:

// Author:  Bruce Allen <bdallen@nps.edu>
// Created: 2/25/2013
//
// The software provided here is released by the Naval Postgraduate
// School, an agency of the U.S. Department of Navy.  The software
// bears no warranty, either expressed or implied. NPS does not assume
// legal liability nor responsibility for a User's use of the software
// or the results of such use.
//
// Please note that within the United States, copyright protection,
// under Section 105 of the United States Code, Title 17, is not
// available for any work of the United States Government and/or for
// any works created by United States Government employees. User
// acknowledges that this software contains work which was created by
// NPS government employees and is therefore in the public domain and
// not subject to copyright.
//
// Released into the public domain on February 25, 2013 by Bruce Allen.

/**
 * \file
 * Generates MD5 hashes of hashdb_block_size-byte blocks taken every
 * hashdb_step_size bytes, and either imports them into a new hash database
 * or scans for matches against an existing one, depending on hashdb_mode.
 *
 * Note that the hash database may be accessed locally through the
 * file system or remotely through a socket.
 */

#include "config.h"
#include "bulk_extractor.h"

#ifdef HAVE_HASHDB

//#define DEBUG_V2_OUT

#include "hashdb.hpp"
#include <dfxml/src/hash_t.h>

#include <iostream>
#include <cmath>
#include <unistd.h>	// for getpid
#include <sys/types.h>	// for getpid

// user settings
// The values below are the defaults.  scan_hashdb() hands the address of
// each one to sp.info->get_config() during PHASE_STARTUP so that the
// bulk_extractor configuration can override them.
static std::string hashdb_mode="none";                                 // import or scan
static uint32_t hashdb_block_size=512;                                 // import or scan
static uint32_t hashdb_step_size=512;                                  // import or scan
static std::string hashdb_scan_path="your_hashdb_directory";           // scan only
static std::string hashdb_repository_name="default_repository";        // import only
static uint32_t hashdb_max_feature_file_lines=0;                       // scan only for feature file

// runtime modes
// scanner mode
// "mode" is derived from the hashdb_mode string during PHASE_INIT and
// selects what PHASE_SCAN and PHASE_SHUTDOWN do.
enum mode_type_t {MODE_NONE, MODE_SCAN, MODE_IMPORT};
static mode_type_t mode = MODE_NONE;

// global state

// hashdb directory, import only
// set during PHASE_INIT to "<outdir>/hashdb.hdb"
static std::string hashdb_dir;

// hash type
// every block hash and the per-page source hash are produced by this generator
typedef md5_generator hash_generator;

// hashdb manager
// Allocated with new during PHASE_INIT (import_manager in import mode,
// scan_manager in scan mode) and deleted during PHASE_SHUTDOWN.
static hashdb::import_manager_t* import_manager;
static hashdb::scan_manager_t* scan_manager;

// forward declarations of the per-mode PHASE_SCAN workers defined below
static void do_import(const class scanner_params &sp,
                      const recursion_control_block &rcb);
static void do_scan(const class scanner_params &sp,
                    const recursion_control_block &rcb);


// Hash exactly hashdb_block_size bytes starting at the beginning of sbuf.
// When sbuf holds fewer bytes than that, the missing tail is hashed as
// zeros so that nothing is read past the end of the buffer.
inline const md5_t hash_one_block(const sbuf_t &sbuf)
{
    if (sbuf.bufsize < hashdb_block_size) {
        // short buffer: hash what is available, then pad with zeros
        const size_t padding = hashdb_block_size - sbuf.bufsize;
        std::vector<uint8_t> zero_fill(padding);

        hash_generator padded;
        padded.update(sbuf.buf, sbuf.bufsize);
        padded.update(&zero_fill[0], padding);
        return padded.final();
    }

    // a full block is available, hash it in one call
    return hash_generator::hash_buf(sbuf.buf, hashdb_block_size);
}

// rules for determining if a block should be ignored

// Ramp rule: compares each 32-bit word with the word that follows it and
// reports true when more than pagesize/8 of those pairs increase by
// exactly one.
static bool ramp_trait(const sbuf_t &sbuf)
{
    const size_t page_bytes = sbuf.pagesize;
    if (page_bytes < 8) {
        // too short to hold a pair of words
        return false;
    }

    const size_t limit = page_bytes - 8;
    uint32_t ramp_steps = 0;
    size_t pos = 0;
    while (pos < limit) {
        // note that little endian is detected and big endian is not detected
        const uint32_t current = sbuf.get32u(pos);
        const uint32_t following = sbuf.get32u(pos + 4);
        if (current + 1 == following) {
            ++ramp_steps;
        }
        pos += 4;
    }
    return ramp_steps > page_bytes / 8;
}

/**
 * Histogram rule: builds a histogram of the big-endian 32-bit words of the
 * page and reports true when the block is dominated by repeated values.
 *
 * Returns true when fewer than three distinct words were seen, or when any
 * single word occurs more than hashdb_block_size/16 times.
 * Returns false for pages shorter than hashdb_block_size and for pages
 * shorter than one 32-bit word.
 *
 * The word starting at pagesize-4 is not counted; this matches the
 * long-standing loop bound and is kept so classifications do not change.
 */
static bool hist_trait(const sbuf_t &sbuf)
{
    if (sbuf.pagesize < hashdb_block_size) {
        // do not perform any histogram analysis on short blocks
        return false;
    }
    if (sbuf.pagesize < 4) {
        // Not even one 32-bit word.  Reachable only when hashdb_block_size
        // is configured below 4; without this guard the unsigned loop bound
        // would wrap around and read far past the buffer.
        return false;
    }

    std::map<uint32_t,uint32_t> hist;
    // additive bound avoids unsigned underflow of pagesize-4
    for (size_t i=0; i+4<sbuf.pagesize; i+=4) {
        hist[sbuf.get32uBE(i)] += 1;
    }
    if (hist.size() < 3) return true;

    const uint32_t max_repeats = hashdb_block_size/16;
    for (std::map<uint32_t,uint32_t>::const_iterator it = hist.begin(); it != hist.end(); ++it) {
        if (it->second > max_repeats) {
            return true;
        }
    }
    return false;
}

// Whitespace rule: true when at least three quarters of the bytes in the
// page are whitespace according to isspace().
static bool whitespace_trait(const sbuf_t &sbuf)
{
    const size_t page_bytes = sbuf.pagesize;
    size_t blanks = 0;
    for (size_t pos = 0; pos != page_bytes; ++pos) {
        if (isspace(sbuf[pos])) {
            ++blanks;
        }
    }
    const size_t threshold = (page_bytes * 3) / 4;
    return blanks >= threshold;
}

// Monotonic rule: compares each 32-bit word with the one after it and
// reports true when the increasing, decreasing, or equal comparisons make
// up at least 75% of pagesize/4.
static bool monotonic_trait(const sbuf_t &sbuf)
{
    if (sbuf.pagesize < 16) {
        // too little data to judge
        return false;
    }

    int rising = 0;
    int falling = 0;
    int flat = 0;
    for (size_t pos = 0; pos + 8 < sbuf.pagesize; pos += 4) {
        const uint32_t here = sbuf.get32u(pos);
        const uint32_t next = sbuf.get32u(pos + 4);
        if (next > here) {
            ++rising;
        } else if (next < here) {
            ++falling;
        } else {
            ++flat;
        }
    }

    const double words = sbuf.pagesize / 4.0;
    return (rising / words >= 0.75)
        || (falling / words >= 0.75)
        || (flat / words >= 0.75);
}

// Reports whether every byte in the buffer equals the first byte.
// Buffers holding zero or one byte are reported as all the same.
inline bool empty_sbuf(const sbuf_t &sbuf)
{
    if (sbuf.bufsize < 2) {
        // nothing to compare against
        return true;
    }

    size_t pos = 1;
    while (pos < sbuf.bufsize) {
        if (sbuf[pos] != sbuf[0]) {
            // found a byte that differs from the first one
            return false;
        }
        ++pos;
    }
    return true;
}

/**
 * bulk_extractor scanner entry point for the hashdb scanner.
 *
 * Dispatches on sp.phase:
 *  - PHASE_STARTUP:  registers scanner metadata, reads the hashdb_*
 *                    settings through get_config, and registers the
 *                    "identified_blocks" feature file when in scan mode.
 *  - PHASE_INIT:     validates the settings (exits the process with status 1
 *                    on an invalid value), prints the effective settings,
 *                    and opens the import or scan manager.
 *  - PHASE_SCAN:     forwards to do_import() or do_scan() based on the mode.
 *  - PHASE_SHUTDOWN: releases the manager opened during PHASE_INIT.
 *  - other phases:   no action.
 *
 * \param sp  scanner parameters for the current phase
 * \param rcb recursion control block, passed through to the workers
 */
extern "C"
void scan_hashdb(const class scanner_params &sp,
                 const recursion_control_block &rcb) {

    switch(sp.phase) {
        // startup
        case scanner_params::PHASE_STARTUP: {

            // set properties for this scanner
            std::string desc = "Search cryptographic hash IDs against hashes in a hashdb block hash database";
            // note the space after "version" so the text reads "version 3.1.0"
            desc += std::string(" (hashdb version ") + std::string(hashdb_version()) + std::string(")");

            sp.info->name        = "hashdb";
            sp.info->author      = "Bruce Allen";
            sp.info->description = desc;
            sp.info->flags       = scanner_info::SCANNER_DISABLED;

            // hashdb_mode
            std::stringstream ss_hashdb_mode;
            ss_hashdb_mode << "Operational mode [none|import|scan]\n"
                << "        none    - The scanner is active but performs no action.\n"
                << "        import  - Import block hashes.\n"
                << "        scan    - Scan for matching block hashes.";
            sp.info->get_config("hashdb_mode", &hashdb_mode, ss_hashdb_mode.str());

            // hashdb_block_size
            sp.info->get_config("hashdb_block_size", &hashdb_block_size,
                         "Selects the block size to hash, in bytes.");

            // hashdb_step_size
            std::stringstream ss_hashdb_step_size;
            ss_hashdb_step_size
                << "Selects the step size.  Scans and imports along\n"
                << "      this step value.";
            sp.info->get_config("hashdb_step_size", &hashdb_step_size,
                                ss_hashdb_step_size.str());


            // hashdb_scan_path
            std::stringstream ss_hashdb_scan_path;
            ss_hashdb_scan_path
                << "File path to a hash database to scan against.\n"
                << "      Valid only in scan mode.";
            sp.info->get_config("hashdb_scan_path", &hashdb_scan_path,
                                ss_hashdb_scan_path.str());

            // hashdb_repository_name
            std::stringstream ss_hashdb_import_repository_name;
            ss_hashdb_import_repository_name
                << "Sets the repository name to\n"
                << "      attribute the import to.  Valid only in import mode.";
            sp.info->get_config("hashdb_repository_name",
                                &hashdb_repository_name,
                                ss_hashdb_import_repository_name.str());

            // configure the feature file to accept scan features
            // but only if in scan mode
            if (hashdb_mode == "scan") {
                sp.info->feature_names.insert("identified_blocks");
#ifdef DEBUG_V2_OUT
                sp.info->feature_names.insert("identified_blocks2");
#endif
            }

            // hashdb_max_feature_file_lines
            std::stringstream ss_hashdb_max_feature_file_lines;
            ss_hashdb_max_feature_file_lines
                << "The maximum number of features lines to record\n"
                << "      or 0 for no limit.  Valid only in scan mode.";
            sp.info->get_config("hashdb_max_feature_file_lines", &hashdb_max_feature_file_lines,
                                ss_hashdb_max_feature_file_lines.str());


            return;
        }

        // init
        case scanner_params::PHASE_INIT: {
            // validate the input parameters

            // hashdb_mode
            if (hashdb_mode == "none") {
                mode = MODE_NONE;
            } else if (hashdb_mode == "import") {
                mode = MODE_IMPORT;
            } else if (hashdb_mode == "scan") {
                mode = MODE_SCAN;
            } else {
                // bad mode
                std::cerr << "Error.  Parameter 'hashdb_mode' value '"
                          << hashdb_mode << "' must be [none|import|scan].\n"
                          << "Cannot continue.\n";
                exit(1);
            }

            // hashdb_block_size
            if (hashdb_block_size == 0) {
                std::cerr << "Error.  Value for parameter 'hashdb_block_size' is invalid.\n"
                          << "Cannot continue.\n";
                exit(1);
            }

            // hashdb_step_size
            // a step of zero would make the import and scan loops never advance
            if (hashdb_step_size == 0) {
                std::cerr << "Error.  Value for parameter 'hashdb_step_size' is invalid.\n"
                          << "Cannot continue.\n";
                exit(1);
            }

            // indicate hashdb version
            std::cout << "hashdb: hashdb_version=" << hashdb_version() << "\n";

            // perform setup based on mode
            switch(mode) {
                case MODE_IMPORT: {
                    // set the path to the hashdb
                    hashdb_dir = sp.fs.get_outdir() + "/" + "hashdb.hdb";

                    // show relevant settable options
                    std::cout << "hashdb: hashdb_mode=" << hashdb_mode << "\n"
                              << "hashdb: hashdb_block_size=" << hashdb_block_size << "\n"
                              << "hashdb: hashdb_step_size= " << hashdb_step_size << "\n"
                              << "hashdb: hashdb_repository_name= " << hashdb_repository_name << "\n"
                              << "hashdb: Creating hashdb directory " << hashdb_dir << "\n";

                    // open hashdb for importing
                    // currently, hashdb_dir is required to not exist
                    hashdb::settings_t settings;
                    settings.block_size = hashdb_block_size;
                    std::string error_message = hashdb::create_hashdb(hashdb_dir, settings, "");
                    if (error_message.size() != 0) {
                        std::cerr << "Error: " << error_message << "\n";
                        exit(1);
                    }
                    import_manager = new hashdb::import_manager_t(hashdb_dir, "");
                    return;
                }

                case MODE_SCAN: {
                    // show relevant settable options
                    std::cout << "hashdb: hashdb_mode=" << hashdb_mode << "\n"
                              << "hashdb: hashdb_block_size=" << hashdb_block_size << "\n"
                              << "hashdb: hashdb_step_size= " << hashdb_step_size << "\n"
                              << "hashdb: hashdb_scan_path=" << hashdb_scan_path << "\n"
                              << "hashdb: hashdb_max_feature_file_lines=" << hashdb_max_feature_file_lines
                              << "\n";

                    // open hashdb for scanning
                    scan_manager = new hashdb::scan_manager_t(hashdb_scan_path);

                    // set the feature recorder to leave context alone but fix invalid utf8
                    sp.fs.get_name("identified_blocks")->set_flag(feature_recorder::FLAG_XML);
#ifdef DEBUG_V2_OUT
                    sp.fs.get_name("identified_blocks2")->set_flag(feature_recorder::FLAG_XML);
#endif

                    return;
                }

                case MODE_NONE: {
                    // show relevant settable options
                    std::cout << "hashdb: hashdb_mode=" << hashdb_mode << "\n"
                              << "WARNING: the hashdb scanner is enabled but it will not perform any action\n"
                              << "because no mode has been selected.  Please either select a hashdb mode or\n"
                              << "leave the hashdb scanner disabled to avoid this warning.\n";

                    // no action
                    return;
                }

                default: {
                    // program error
                    assert(0);
                    // when assertions are compiled out, do not fall through
                    // into the PHASE_SCAN case below
                    return;
                }
            }
        }

        // scan
        case scanner_params::PHASE_SCAN: {
            switch(mode) {
                case MODE_IMPORT:
                    do_import(sp, rcb);
                    return;

                case MODE_SCAN:
                    do_scan(sp, rcb);
                    return;

                default:
                    // the user should have just left the scanner disabled.
                    // no action.
                    return;
            }
        }

        // shutdown
        case scanner_params::PHASE_SHUTDOWN: {
            switch(mode) {
                case MODE_IMPORT:
                    delete import_manager;
                    // do not leave a dangling pointer behind
                    import_manager = nullptr;
                    return;

                case MODE_SCAN:
                    delete scan_manager;
                    // do not leave a dangling pointer behind
                    scan_manager = nullptr;
                    return;

                default:
                    // the user should have just left the scanner disabled.
                    // no action.
                    return;
            }
        }

        // there are no other bulk_extractor scanner state actions
        default: {
            // no action for other bulk_extractor scanner states
            return;
        }
    }
}

// perform import
static void do_import(const class scanner_params &sp,
                      const recursion_control_block &rcb) {

    // get the sbuf
    const sbuf_t& sbuf = sp.sbuf;

    // get the filename from sbuf without the sbuf map file delimiter
    std::string path_without_map_file_delimiter =
              (sbuf.pos0.path.size() > 4) ?
              std::string(sbuf.pos0.path, 0, sbuf.pos0.path.size() - 4) : "";
 
    // get the filename to use as the source filename
    std::stringstream ss;
    const size_t p=sbuf.pos0.path.find('/');
    if (p==std::string::npos) {
        // no directory in forensic path so explicitly include the filename
        ss << sp.fs.get_input_fname();
        if (sbuf.pos0.isRecursive()) {
            // forensic path is recursive so add "/" + forensic path
            ss << "/" << path_without_map_file_delimiter;
        }
    } else {
        // directory in forensic path so print forensic path as is
        ss << path_without_map_file_delimiter;
    }
    std::string source_filename = ss.str();

    // calculate the file hash using the sbuf page
    const md5_t sbuf_hash = hash_generator::hash_buf(sbuf.buf, sbuf.pagesize);
    const std::string file_binary_hash =
               std::string(reinterpret_cast<const char*>(sbuf_hash.digest), 16);

    // track count values
    size_t zero_count = 0;
    size_t nonprobative_count = 0;

    // import the cryptograph hash values from all the blocks in sbuf
    for (size_t offset=0; offset<sbuf.pagesize; offset+=hashdb_step_size) {

        // Create a child sbuf of what we would hash
        const sbuf_t sbuf_to_hash(sbuf,offset,hashdb_block_size);

        // ignore empty blocks
        if (empty_sbuf(sbuf_to_hash)){
            ++zero_count;
            continue;
        }

        // calculate the hash for this import-sector-aligned hash block
        const md5_t hash = hash_one_block(sbuf_to_hash);
        const std::string binary_hash(reinterpret_cast<const char*>(hash.digest), 16);

        // put together any block classification labels
        // set flags based on specific tests on the block
        // Construct an sbuf from the block and subject it to the other tests
        const sbuf_t s(sbuf, offset, hashdb_block_size);
        std::stringstream ss_flags;
        if (ramp_trait(s))       ss_flags << "R";
        if (hist_trait(s))       ss_flags << "H";
        if (whitespace_trait(s)) ss_flags << "W";
        if (monotonic_trait(s))  ss_flags << "M";

        // NOTE: shannon16 is Disabled because its results were not useful
        // and because it needs fixed to not generate sbuf read exception.
        //if (ss_flags.str().size() > 0) ss_flags << "," << shannon16(s);

        // flags means nonprobative
        if (ss_flags.str().size() > 0) {
            ++nonprobative_count;
        }

        // import the hash
        import_manager->insert_hash(binary_hash,
                                    0,    // entropy
                                    ss_flags.str(),
                                    file_binary_hash);
    }

    // insert the source name pair
    import_manager->insert_source_name(file_binary_hash,
                              hashdb_repository_name, source_filename);

    // insert the source data
    import_manager->insert_source_data(file_binary_hash,
                                       sbuf.pagesize,
                                       "", // file type
                                       zero_count,
                                       nonprobative_count);
}

// perform scan
static void do_scan(const class scanner_params &sp,
                    const recursion_control_block &rcb) {

    // get the feature recorder
    feature_recorder* identified_blocks_recorder = sp.fs.get_name("identified_blocks");
#ifdef DEBUG_V2_OUT
    feature_recorder* identified_blocks_recorder2 = sp.fs.get_name("identified_blocks2");
#endif

    // get the sbuf
    const sbuf_t& sbuf = sp.sbuf;

    // process cryptographic hash values for blocks along sector boundaries
    for (size_t offset=0; offset<sbuf.pagesize; offset+=hashdb_step_size) {

        // stop recording if feature file line count is at requested max
        if (hashdb_max_feature_file_lines > 0 && identified_blocks_recorder->count() >=
                                                   hashdb_max_feature_file_lines) {
            break;
        }

        // Create a child sbuf of the block
        const sbuf_t sbuf_to_hash(sbuf, offset, hashdb_block_size);

        // ignore empty blocks
        if (empty_sbuf(sbuf_to_hash)){
            continue;
        }

        // calculate the hash for this sector-aligned hash block
        const md5_t hash = hash_one_block(sbuf_to_hash);
        const std::string binary_hash =
               std::string(reinterpret_cast<const char*>(hash.digest), 16);

        // scan for the hash
        std::string json_text = scan_manager->find_hash_json(
                      hashdb::scan_mode_t::EXPANDED_OPTIMIZED, binary_hash);

        if (json_text.size() == 0) {
          // hash not found
          continue;
        }

        // prepare fields to record the feature

        // get hash_string from hash
        std::string hash_string = hash.hexdigest();

        // record the feature, there is no context field
        identified_blocks_recorder->write(sbuf.pos0+offset, hash_string, json_text);

#ifdef DEBUG_V2_OUT
        size_t count = scan_manager->find_hash_count(binary_hash);

        // build context field
        std::stringstream ss;
        ss << "{\"count\":" << count << "}";

        // record the feature
        identified_blocks_recorder2->write(sbuf.pos0+offset, hash_string, ss.str());
#endif

    }
}

#endif