In [2]:
# Install prettytable with the same interpreter the notebook kernel is running
# (sys.executable), so the package lands in the kernel's own environment.
# NOTE(review): the model-building cell visible in this notebook does not
# import prettytable -- confirm the install is still needed, and pin a version
# if it is.
import sys
!{sys.executable} -m pip install prettytable

Collecting prettytable
  Downloading https://files.pythonhosted.org/packages/ef/30/4b0746848746ed5941f052479e7c23d2b56d174b82f4fd34a25e389831f5/prettytable-0.7.2.tar.bz2
Building wheels for collected packages: prettytable
  Building wheel for prettytable (setup.py): started
  Building wheel for prettytable (setup.py): finished with status 'done'
  Created wheel for prettytable: filename=prettytable-0.7.2-cp37-none-any.whl size=13706 sha256=c8666a27e5673a96b89f2deb145e051515139bdfca8c1917c8790793bcb19399
  Stored in directory: C:\Users\cdhsl\AppData\Local\pip\Cache\wheels\80\34\1c\3967380d9676d162cb59513bd9dc862d0584e045a162095606
Successfully built prettytable
Installing collected packages: prettytable
Successfully installed prettytable-0.7.2


In [1]:
'''
Copyright (c) Chet Hosmer 2019-2020

Permission is hereby granted, free of charge, to any person obtaining a copy of this software
and associated documentation files (the "Software"), to deal in the Software without restriction, 
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 
subject to the following conditions:

Objective: create a K Nearest Neighbor model for file type categorization.
The model is initially built from 3 features extracted from each file in the
training set: the file header, the file footer, and the frequency distribution
of binary vs. non-binary bytes found in the first 8k bytes of the file (or
fewer for small files).

The initial Categories are:

Images
Multimedia
Generic Binary
Executable
Documents
Code
Archive
Database
Certs

'''

# Script Module Importing

# Python Standard Library Modules
import os           # Operating/Filesystem Module
import time         # Basic Time Module
import logging      # Script Logging
import struct       # Binary Structure
import sys          # System Specifics
import pickle       # Pickle Module
import re           # regular expressions

# Import 3rd Party Modules

import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# Script Constants

'''
Python does not support constants directly
however, by initializing variables here and
specifying them as UPPER_CASE you can make your
intent known
'''
# General Constants
# Banner lines printed when the script starts
SCRIPT_NAME    = "File Feature Model Creation ... Using Nearest Neighbor Method"
SCRIPT_VERSION = "Version 1.1"
SCRIPT_AUTHOR  = "Author: C. Hosmer, Python Forensics"
# File name the fitted model is pickled to by SaveModel
MODEL_NAME     = "FileCategoryModel.sav"

# Usage text printed after the banner
INSTRUCTIONS = '''

To build an ML model of file categories
enter path to a folder that provides
a baseline of known good files that
all have valid extensions that define
their type. This will create a ML
Nearest Neighbor Model of the 
samples provided. A simple
supervised method of creating a
model.

'''
MIN_SIZE = 8192  # Upper bound on bytes sampled by GetFrequencyDist; smaller files are read whole

# End of Script Constants

def progbar(curr, total, full_progbar):
    '''
    Simple console based progress bar.

    Redraws one status line in place: the output starts with a carriage
    return and is written without a trailing newline, so successive calls
    overwrite each other.

    curr         : number of items processed so far
    total        : number of items expected; when 0 the bar is drawn empty
                   at 0.00% instead of raising ZeroDivisionError
    full_progbar : width of the bar in characters
    '''
    # Guard the division so an empty work list cannot crash the caller
    frac = curr/total if total else 0.0
    filled_progbar = round(frac*full_progbar)
    print('\r', '#'*filled_progbar + '-'*(full_progbar-filled_progbar), '[{:>7.2%}]'.format(frac), end='')

def SaveModel(theModel):
    """
    Save the Nearest Neighbor Model

    Pickles theModel to the file named by MODEL_NAME (opened in binary
    write mode, so an existing file of that name is overwritten).

    The file is opened in a with-block so the handle is flushed and closed
    even if pickling raises part way through.
    """
    # save the model to disk
    with open(MODEL_NAME, 'wb') as outFile:
        pickle.dump(theModel, outFile)

def GetFrequencyDist(fileName):
    '''
    Return the fraction of sampled characters counted as "binary".

    Up to MIN_SIZE bytes are sampled from the start of the file (the whole
    file when it is not larger than MIN_SIZE).  Each byte is mapped to the
    character with the same code point, every character outside the allowed
    set is replaced by '@', and the number of '@' characters is divided by
    the sample length.

    Returns -1 on any failure, including an empty file (division by zero).

    NOTE(review): inside the character class, *-_ is a range (0x2A-0x5F),
    not three literal characters, so '=', '@', '[', '\\' and ']' are also
    accepted.  Because '@' doubles as the replacement marker, literal '@'
    characters in the sample are included in the count.  Whitespace is not
    in the allowed set.  Confirm this is the intended feature definition
    before changing it; doing so alters the values the model is trained on.
    '''
    try:
        with open(fileName, 'rb') as inFile:
            # Big files are sampled from the front; read(-1) takes everything
            readLimit = MIN_SIZE if os.path.getsize(fileName) > MIN_SIZE else -1
            sample = inFile.read(readLimit)

        # One character per byte, code points 0-255
        text = "".join(chr(byteValue) for byteValue in sample)
        masked = re.sub(r"[^a-zA-Z0123456789!#$&%^*-_{}:;,./<>?+']", '@', text)
        return masked.count('@') / len(text)

    except Exception:
        return -1
    
def GetType(ext):
    '''
    Classify a file based only on its extension.
    Make sure the files come from a known good source.

    ext : file extension to look up; it is compared against the
          module-level extension lists (imgList, mmList, binList, exeList,
          docList, codeList, arcList, dbList, certList) assigned in the
          main section of this script.

    Returns the category label of the first list containing the extension,
    or "Undefined" when no list contains it.

    The extension is lower-cased once before any comparison.  Previously
    only the docList test was case-insensitive, so ".DOC" matched while
    ".JPG" did not.
    '''
    ext = ext.lower()

    # Order matters: an extension present in several lists takes the label
    # of the earliest entry here (same precedence as the original chain)
    categories = (
        (imgList,  "Image"),
        (mmList,   "Multimedia"),
        (binList,  "Binary"),
        (exeList,  "Executable"),
        (docList,  "Document"),
        (codeList, "Code"),
        (arcList,  "Archive"),
        (dbList,   "Database"),
        (certList, "Certs"),
    )

    for extList, label in categories:
        if ext in extList:
            return label

    return "Undefined"

def GetSamples(path):
    '''
    Extract the file header and footer, each as an unsigned 64-bit integer.

    The first 8 and the last 8 bytes of the file are unpacked as big-endian
    unsigned long long values ('>Q').

    path : file to sample

    Returns (hdrInt, endInt).  Returns (-1, -1) when the file is smaller
    than 256 bytes or when it cannot be read or unpacked.

    Only Exception subclasses are converted to the (-1, -1) sentinel;
    KeyboardInterrupt and SystemExit propagate so a long scan can still be
    interrupted.
    '''
    try:
        fileSize = os.path.getsize(path)
        if fileSize < 256:
            return -1, -1

        with open(path, 'rb') as inFile:
            hdrInt = struct.unpack('>Q', inFile.read(8))[0]

            # Jump to the final 8 bytes of the file
            inFile.seek(fileSize-8)
            endInt = struct.unpack('>Q', inFile.read(8))[0]

        return hdrInt, endInt

    except Exception:
        return -1, -1

# End of Script Functions

# Script Classes
'''
If your script will contain classes then insert them
here, before the execution of the main script.  This
will ensure that the functions will be accessible from
anywhere in your script
'''

# End of Script Classes


# Main Script Starts Here

if __name__ == '__main__':
    
    # Current File Type Extension Lists (read as globals by GetType)
    # NOTE(review): the entries "xttf", "xeps", "au" and "stx" have no leading
    # dot, so they can never match an extension produced by os.path.splitext
    # (which keeps the dot).  ".db" is listed twice in dbList.  The lists are
    # left unchanged here because editing them changes the training labels.
    imgList  = [".jpg", ".jpeg", ".bmp", ".cur", ".wmf", ".gif", ".png", ".ogg", ".ico", ".ttf", ".svg", ".cmap", "xttf", "xeps", ".mac", ".psd", ".plt", ".tif", ".tiff", ".clips"]
    mmList   = [".mp4", ".mp3", ".avi", ".wav", ".swf", ".xmc", ".aif", ".aiff", "au", ".mov", ".qt", ".mid", ".midi", ".stx", ".lng"]
    docList  = [".xls", ".xlsx", ".doc", ".docx", ".dotx", ".xsl", ".xsd", ".cat", ".xslt", ".log", ".man", ".inx", ".micl", ".rtf", ".json", ".ppt", ".pptx",".csv", ".txt", ".url", ".pdf", ".p65", ".dat", ".htm", ".nse", ".lst", ".template", ".wk3", ".wks", ".wpd", ".wp5", ".mib", ".oid", ".config", ".conf", ".ini", ".inf", "stx", ".cfg", ".properties"]
    exeList  = [".exe", ".dll", ".bat", ".mui", ".msi", ".mo", ".crx", ".rll", ".sh", ".rsc", ".cmd", ".mst"]
    binList  = [".bin", ".raw", ".mem", ".pf", ".pma"]
    codeList = [".py", ".pyo", ".pyi", ".ps1", ".pi", ".pyc", ".pxd", ".pyd", ".qml", ".qmltypes", ".c", ".api", ".tlb", ".so", ".scheme", ".lua", ".c#", ".cpp", ".c++", ".class", ".perl", ".pl", ".php", ".pas", ".vb", ".java", ".js", ".jar", ".jsp", ".html", ".css", ".rb", ".xml", ".qm"]
    arcList  = [".zip", ".ar", ".tar", ".chm", ".arc", ".arj", ".cab", ".egg", ".whl", ".dmg", ".lha", ".gzip", ".rar", ".pak", ".sit", ".xpi", ".diz"]
    dbList   = [".db", ".db", ".dbf", ".mdb", ".dmd", ".sql", ".sqld"]
    certList = [".xrm-ms"]
    
    # Print Basic Script Information
    print()
    print(SCRIPT_NAME)
    print(SCRIPT_VERSION)
    print(SCRIPT_AUTHOR) 
    print(INSTRUCTIONS)
    
    # Prompt user for Known Good Starting Path
    myRoot = input("\nEnter Starting Path - Known Good File Extesions: ")
    
    if os.path.isdir(myRoot):
        myRoot = os.path.abspath(myRoot)
    else:
        sys.exit("\n\nPath is not a directory ... Script Exit\n\n")

    # Walk the path from root to bottom exactly once, collecting every
    # file path up front so the progress bar knows the total without a
    # second traversal of the tree
    candidates = []
    for root, dirs, files in os.walk(myRoot):
        for eachFile in files:
            candidates.append(os.path.abspath(os.path.join(root, eachFile)))
    totalFiles = len(candidates)

    featureList = []
    target      = []

    # For each file extract the key features
    for filesProcessed, absPath in enumerate(candidates, start=1):
        progbar(filesProcessed, totalFiles, 20)
        try:
            # The extension alone decides the training label
            ext = os.path.splitext(absPath)[1]
            fileType = GetType(ext.lower())

            if fileType == "Undefined":
                continue

            hdrInt, endInt = GetSamples(absPath)

            dist = GetFrequencyDist(absPath)

            # -1 is the failure sentinel of both feature extractors
            if hdrInt == -1 or endInt == -1 or dist == -1:
                continue

            featureList.append([hdrInt, endInt, dist])
            target.append(fileType)

        except Exception:
            # Best effort: a file that cannot be processed is skipped
            continue

    # Without at least one usable sample there is nothing to fit; stop with
    # a clear message instead of failing inside model.fit
    if not featureList:
        sys.exit("\n\nNo usable sample files found ... Script Exit\n\n")

    # NOTE(review): mixing the 64-bit header/footer integers with the float
    # ratio yields a float array, so the integers lose their low-order bits,
    # and the unscaled header/footer magnitudes dwarf the 0-1 ratio in any
    # distance computation -- consider scaling the features.
    features = np.array(featureList)
    print()
    print(features)
    print(target)

    # fit a k-nearest neighbor model to the data
    K = 3
    model = KNeighborsClassifier(n_neighbors = K)
    model.fit(features, target)
    print(model)

    SaveModel(model)
    
print("\n\nScript Completed")
# End of Script Main

    



File Feature Model Creation ... Using Nearest Neighbor Method
Version 1.1
Author: C. Hosmer, Python Forensics


To build an ML model of file categories
enter path to a folder that provides
a baseline of known good files that
all have valid extensions that define
their type. This will create a ML
Nearest Neighbor Model of the 
samples provided. A simple
supervised method of creating a
model.



Enter Starting Path - Known Good File Extesions: c:\tst
 #################### [100.00%]
[[1.84357664e+19 1.38185300e+19 6.83471680e-01]
 [1.84357664e+19 1.15232980e+19 7.11181641e-01]
 [1.84357664e+19 1.16697809e+19 7.01782227e-01]
 [1.84357664e+19 4.90770163e+18 6.58545521e-01]]
['Image', 'Image', 'Image', 'Image']
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')


Script Completed
