In [2]:
import sys
!{sys.executable} -m pip install prettytable



In [3]:
'''
Copyright (c) Chet Hosmer 2019-2020

Permission is hereby granted, free of charge, to any person obtaining a copy of this software
and associated documentation files (the "Software"), to deal in the Software without restriction, 
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 
subject to the following conditions:

Objective is to utilize a Nearest Neighbor ML Model to determine the general type of a file.  This
assumes that the model was trained with a wide range of known file types (based on their extension).

The initial FileType Categories are:

Images
Multimedia
Generic Binary
Executable
Documents
Code
Archive
Database
Certs

The initial set of features or characteristics include:

File Header (First 8 Bytes)
File Footer (Last 8 Bytes)
Distribution of binary vs non-binary values found in the first N bytes of file

'''

# Script Module Importing

# Python Standard Library Modules
import os           # Operating/Filesystem Module
import time         # Basic Time Module
import logging      # Script Logging
import struct       # Binary Structure
import sys          # System Specifics
import pickle       # Pickle Module
import re           # Regular Expressions
from prettytable import PrettyTable  # pip install prettytable

# Import 3rd Party Modules

import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# Script Constants

'''
Python does not support constants directly
however, by initializing variables here and
specifying them as UPPER_CASE you can make your
intent known
'''
# General Constants
# Banner lines printed at the start of the main script.
SCRIPT_NAME    = "File Feature Predictions using ML Nearest Neighbors Model"
SCRIPT_VERSION = "Version 1.0"
SCRIPT_AUTHOR  = "Author: C. Hosmer, Python Forensics\n"
# Despite the name, GetFrequencyDist() uses this as the MAXIMUM number of
# leading bytes sampled from each file (smaller files are read whole).
MIN_SIZE       = 8192
# File that LoadModel() opens and unpickles; a bare name, so it is resolved
# against the current working directory.
MODEL_NAME     = "FileCategoryModel.sav"


# End of Script Constants  

def LoadModel(modelPath=None):
    '''
    Load the pickled model from disk and return it.

    modelPath: optional path of the pickled model file.  When omitted
               (the original calling convention) the module-level
               MODEL_NAME is used.

    Returns:   the unpickled object.
    Raises:    SystemExit (via sys.exit) with an "Error Loading the Model"
               message if the file cannot be opened or unpickled.

    SECURITY NOTE: pickle.load() can execute arbitrary code contained in
    the file.  Only load model files obtained from a trusted source.
    '''
    try:
        if modelPath is None:
            modelPath = MODEL_NAME
        # The context manager closes the handle even when unpickling
        # fails; the previous open() call was never closed.
        with open(modelPath, 'rb') as modelFile:
            return pickle.load(modelFile)
    except Exception as err:
        sys.exit("\n\nError Loading the Model: "+str(err))

def progbar(curr, total, full_progbar):
    '''
    Simple Console based Progress Bar

    curr:         number of items completed so far
    total:        number of items expected
    full_progbar: width of the bar in characters

    Rewrites the current console line (leading carriage return, no
    trailing newline) with a bar of '#' and '-' characters followed by
    the percentage complete.

    The completed fraction is clamped to the range 0..1 so that the bar
    never grows past full_progbar when curr exceeds total, and a
    non-positive total is drawn as 0% instead of raising
    ZeroDivisionError.
    '''
    if total > 0:
        frac = min(max(curr/total, 0.0), 1.0)
    else:
        # Progress is undefined without a positive total.
        frac = 0.0
    filled_progbar = round(frac*full_progbar)
    print('\r', '#'*filled_progbar + '-'*(full_progbar-filled_progbar), '[{:>7.2%}]'.format(frac), end='')

def GetSamples(path, minSize=256):
    '''
    Extract the file header and footer as unsigned 64-bit integers.

    path:    file to sample
    minSize: files smaller than this many bytes are not sampled
             (default 256, the original hard-coded threshold)

    Returns: (hdrInt, endInt) where hdrInt is the first 8 bytes and
             endInt the last 8 bytes of the file, each decoded as a
             big-endian unsigned long long ('>Q').
             (-1, -1) is returned when the file is smaller than minSize
             or cannot be read.  Decoded values are unsigned, so -1 can
             never be a real sample.
    '''
    try:
        if os.path.getsize(path) < minSize:
            return -1, -1

        with open(path, 'rb') as inFile:
            hdrInt = struct.unpack('>Q', inFile.read(8))[0]

            # Position relative to the end of the file rather than a
            # size measured earlier.
            inFile.seek(-8, os.SEEK_END)
            endInt = struct.unpack('>Q', inFile.read(8))[0]

        return hdrInt, endInt

    except Exception:
        # "except Exception" (not a bare except) so that Ctrl-C and
        # sys.exit() are no longer swallowed and reported as a skipped file.
        return -1, -1

def GetFrequencyDist(fileName, sampleSize=None):
    '''
    Calculate the Frequency of binary vs non-binary data

    fileName:   file to sample
    sampleSize: maximum number of leading bytes to examine; when omitted
                the module-level MIN_SIZE is used (original behaviour)

    Returns: the fraction (0.0 - 1.0) of sampled bytes counted as
             "binary", or -1 when the file is empty or cannot be read.

    A byte is counted as binary when it is NOT matched by the character
    class below, or when it is a literal '@' (the replacement marker is
    itself counted).

    NOTE: inside the character class, "*-_" is a RANGE covering
    0x2A-0x5F, so characters such as '=', '@', '[', '\\' and ']' are
    treated as non-binary, while space and newline are treated as binary.
    NOTE(review): the pattern is kept byte-for-byte on purpose;
    presumably the model was trained on features produced by this same
    expression, so changing it would shift the feature values - confirm
    against the training script before altering it.
    '''
    try:
        limit = MIN_SIZE if sampleSize is None else sampleSize
        with open(fileName, 'rb') as inFile:
            # read(limit) returns the whole file when it is shorter
            raw = inFile.read(limit)
    except Exception:
        return -1

    total = len(raw)
    if total == 0:
        # Explicit empty-file handling; this used to be reached only
        # through a swallowed ZeroDivisionError.
        return -1

    # latin-1 maps every byte to the code point of the same value,
    # which is exactly what "".join(map(chr, raw)) produced.
    chars = raw.decode('latin-1')
    txt = re.sub(r"[^a-zA-Z0123456789!#$&%^*-_{}:;,./<>?+']",'@', chars)
    return txt.count('@')/total

# End of Script Functions



if __name__ == '__main__':
    # Main Script Starts Here
    #
    # Flow: print banner -> load model -> prompt for a directory ->
    # count files (for the progress bar) -> extract features from each
    # file and predict its category -> print the results table.

    # Print Basic Script Information
    print()
    print(SCRIPT_NAME)
    print(SCRIPT_VERSION)
    print(SCRIPT_AUTHOR)

    # load the model from disk
    try:
        model = LoadModel()
        print("ML Model Loaded\n")
        print(model)
    except Exception as err:
        sys.exit("Script Aborted: "+str(err))

    # Prompt user for a Starting Path to Analyze
    myRoot = input("\nEnter Starting Path to predict file types: ")

    if os.path.isdir(myRoot):
        myRoot = os.path.abspath(myRoot)
    else:
        sys.exit("\n\nPath is not a directory ... Script Exit\n\n")

    # Setup the Prettytable output
    resultTable = PrettyTable(['Prediction', 'FilePath'])

    # Setup for Progress Display: a first walk only counts the files
    totalFiles     = sum(len(files) for _, _, files in os.walk(myRoot))
    filesProcessed = 0

    # Use the os.walk method to walk the path from
    # root to bottom
    for root, _dirs, files in os.walk(myRoot):

        # For each file extract the key features and predict its category
        for eachFile in files:

            filesProcessed += 1
            # Files may have appeared since the counting walk; never
            # report more work done than the total being displayed.
            progbar(filesProcessed, max(totalFiles, filesProcessed), 20)

            # Get the absolute path of the target file
            absPath = os.path.abspath(os.path.join(root, eachFile))

            try:
                # Obtain the header / footer of the file
                hdrInt, endInt = GetSamples(absPath)

                # Calculate the frequency distribution
                dist = GetFrequencyDist(absPath)

                # if any of these returns an error skip the file
                if hdrInt == -1 or endInt == -1 or dist == -1:
                    resultTable.add_row( ["Z-SKIPPED FILE", absPath] )
                    continue

                # make predictions
                prediction = model.predict([[hdrInt, endInt, dist]])
                resultTable.add_row( [prediction[0], absPath] )

            except Exception:
                # Report the file as skipped instead of silently dropping
                # it, so every file visited appears in the results table.
                resultTable.add_row( ["Z-SKIPPED FILE", absPath] )

    # Once all the files are processed display pretty table results
    print("\n\n")
    resultTable.align = "l"
    print(resultTable.get_string(sortby="Prediction"))

    print("\n\nScript Completed")
# End of Script Main

    



File Feature Predictions using ML Nearest Neighbors Model
Version 1.0
Author: C. Hosmer, Python Forensics

ML Model Loaded

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

Enter Starting Path to predict file types: c:\tst
 #################### [100.00%]


+------------+------------------------+
| Prediction | FilePath               |
+------------+------------------------+
| Image      | c:\tst\Turtle.jpg      |
| Image      | c:\tst\dscn0011.jpg    |
| Image      | c:\tst\kinderscout.jpg |
| Image      | c:\tst\zzz.jpg         |
+------------+------------------------+


Script Completed
