In [33]:
import numpy as np
import pandas as pd
import pickle
import pefile
import os
import hashlib
import array
import math
import csv


model=pickle.load(open('model8.pkl','rb'))



Keras model archive loading:
File Name                                             Modified             Size
variables.h5                                   2023-04-17 14:31:04       484040
config.json                                    2023-04-17 14:31:04         3546
metadata.json                                  2023-04-17 14:31:04           64
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
......dense_4
.........vars
............0
............1
......dense_5
.........vars
............0
............1
......dense_6
.........vars
............0
............1
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.......

In [34]:
def get_entropy(data):
    if len(data) == 0:
        return 0.0
    occurences = array.array('L', [0]*256)
    for x in data:
        occurences[x if isinstance(x, int) else ord(x)] += 1

    entropy = 0
    for x in occurences:
        if x:
            p_x = float(x) / len(data)
            entropy -= p_x*math.log(p_x, 2)

    return entropy

def get_resources(pe):
    """Extract resources :
    [entropy, size]"""
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
                                size = resource_lang.data.struct.Size
                                entropy = get_entropy(data)

                                resources.append([entropy, size])
        except Exception as e:
            return resources
    return resources

def get_version_info(pe):
    """Return version infos"""
    res = {}
    for fileinfo in pe.FileInfo:
        if fileinfo.Key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                for entry in st.entries.items():
                    res[entry[0]] = entry[1]
        if fileinfo.Key == 'VarFileInfo':
            for var in fileinfo.Var:
                res[var.entry.items()[0][0]] = var.entry.items()[0][1]
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
        res['os'] = pe.VS_FIXEDFILEINFO.FileOS
        res['type'] = pe.VS_FIXEDFILEINFO.FileType
        res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
        res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
        res['signature'] = pe.VS_FIXEDFILEINFO.Signature
        res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
    return res

In [35]:
def extract_features(file_path):
    
        # Load the PE file
        pe = pefile.PE(file_path)
        entropy = list(map(lambda x:x.get_entropy(), pe.sections))
        raw_sizes = list(map(lambda x:x.SizeOfRawData, pe.sections))
        virtual_sizes = list(map(lambda x:x.Misc_VirtualSize, pe.sections))
        
        # Extract features from the PE file
        features = {
            
            "Machine": pe.FILE_HEADER.Machine,
            "SizeOfOptionalHeader": pe.FILE_HEADER.SizeOfOptionalHeader,
            "Characteristics": pe.FILE_HEADER.Characteristics,
            "MajorLinkerVersion": pe.OPTIONAL_HEADER.MajorLinkerVersion,
            "MinorLinkerVersion": pe.OPTIONAL_HEADER.MinorLinkerVersion,
            "SizeOfCode": pe.OPTIONAL_HEADER.SizeOfCode,
            "SizeOfInitializedData": pe.OPTIONAL_HEADER.SizeOfInitializedData,
            "SizeOfUninitializedData": pe.OPTIONAL_HEADER.SizeOfUninitializedData,
            "AddressOfEntryPoint": pe.OPTIONAL_HEADER.AddressOfEntryPoint,
            "BaseOfCode": pe.OPTIONAL_HEADER.BaseOfCode,
            "BaseOfData": pe.OPTIONAL_HEADER.BaseOfData,
            "ImageBase":pe.OPTIONAL_HEADER.ImageBase,
            "SectionAlignment":pe.OPTIONAL_HEADER.SectionAlignment,
            "FileAlignment":pe.OPTIONAL_HEADER.FileAlignment,
            "MajorOperatingSystemVersion":pe.OPTIONAL_HEADER.MajorOperatingSystemVersion,
            "MinorOperatingSystemVersion":pe.OPTIONAL_HEADER.MinorOperatingSystemVersion,
            "MajorImageVersion":pe.OPTIONAL_HEADER.MajorImageVersion,
            "MinorImageVersion":pe.OPTIONAL_HEADER.MinorImageVersion,
            "MajorSubsystemVersion":pe.OPTIONAL_HEADER.MajorSubsystemVersion,
            "MinorSubsystemVersion":pe.OPTIONAL_HEADER.MinorSubsystemVersion,
            "SizeOfImage":pe.OPTIONAL_HEADER.SizeOfImage,
            "SizeOfHeaders": pe.OPTIONAL_HEADER.SizeOfHeaders,
            "CheckSum": pe.OPTIONAL_HEADER.CheckSum,
            "Subsystem": pe.OPTIONAL_HEADER.Subsystem,
            "DllCharacteristics": pe.OPTIONAL_HEADER.DllCharacteristics,
            "SizeOfStackReserve": pe.OPTIONAL_HEADER.SizeOfStackReserve,
            "SizeOfStackCommit": pe.OPTIONAL_HEADER.SizeOfStackCommit,
            "SizeOfHeapReserve": pe.OPTIONAL_HEADER.SizeOfHeapReserve,
            "SizeOfHeapCommit": pe.OPTIONAL_HEADER.SizeOfHeapCommit,
            "LoaderFlags": pe.OPTIONAL_HEADER.LoaderFlags,
            "NumberOfRvaAndSizes": pe.OPTIONAL_HEADER.NumberOfRvaAndSizes,
            "SectionsNb":len(pe.sections)
        }
        
        try:
            features.update({"SectionsMeanEntropy":float(sum(entropy))/len(entropy)})
            
        except:
            features.update({"SectionsMeanEntropy":0})
            
        features.update({
            "SectionsMinEntropy":min(entropy),
            "SectionsMaxEntropy":max(entropy)
        })
        
        try:
            features.update({"SectionsMeanRawsize":float(sum(raw_sizes))/len(raw_sizes)})
            
        except:
            features.update({"SectionsMeanRawsize":0})
            
        features.update({
            "SectionsMinRawsize":min(raw_sizes),
            "SectionMaxRawsize":max(raw_sizes)
        })
        
        try:
            features.update({"SectionsMeanVirtualsize": float(sum(virtual_sizes))/len(virtual_sizes)})
        except:
            features.update({"SectionsMeanVirtualsize":0})
            
        features.update({
            "SectionsMinVirtualsize":min(virtual_sizes),
            "SectionMaxVirtualsize":max(virtual_sizes)
        })
        
        
    #Imports
        try:
            imports = sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], [])
        
            features.update({
                "ImportsNbDLL":len(pe.DIRECTORY_ENTRY_IMPORT),
                "ImportsNb":len(imports),
                "ImportsNbOrdinal":len(list(filter(lambda x:x.name is None, imports)))
            })
        except AttributeError:
            features.update({
                "ImportsNbDLL":0,
                "ImportsNb":0,
                "ImportsNbOrdinal":0   
            })
        #Exports
        try:
            features.update({"ExportNb":len(pe.DIRECTORY_ENTRY_EXPORT.symbols)})
        except AttributeError:
            # No export
            features.update({"ExportNb":0})
            
        #Resources
        resources= get_resources(pe)
        features.update({"ResourcesNb":len(resources)})
        if len(resources)> 0:
            entropy = list(map(lambda x:x[0], resources))
            features.update({
                "ResourcesMeanEntropy":0 if len(entropy)==0 else float(sum(entropy))/len(entropy),
                "ResourcesMinEntropy":min(entropy),
                "ResourcesMaxEntropy":max(entropy)
            })
            sizes = list(map(lambda x:x[1], resources))
            features.update({
                "ResourcesMeanSize":0 if len(sizes)==0 else float(sum(sizes))/len(sizes),
                "ResourcesMinSize":min(sizes),
                "ResourcesMaxSize":max(sizes)
                })
        else:
            features.update({"ResourcesMeanEntropy":0,
                             "ResourcesMinEntropy":0,
                            "ResourcesMaxEntropy:":0,
                            "ResourcesMeanSize":0,
                            "ResourcesMinSize":0,
                            "ResourcesMaxSize":0
                            })
            
         # Load configuration size
        try:
            features.update({"LoadConfigurationSize":pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size})
        except AttributeError:
            features.update({'LoadConfigurationSize':0})
                            
                            
         # Version configuration size
        try:
            version_infos = get_version_info(pe)
            features.update({'VersionInformationSize':len(version_infos.keys())})
        except AttributeError:
            features.update({'VersionInformationSize':0})

        return features
    
   

    

In [36]:
# Define the directory containing the PE file to be analyzed
dir_path = r"/home/vishwanath/Desktop/fyp"

# Define the output CSV file name and path
output_file = "pe_fea.csv"

# Extract features from the PE file and write them to the output CSV file
file_path = os.path.join(dir_path, "7z2201-x64.exe")
features = extract_features(file_path)

with open(output_file, mode='w', newline='') as csv_file:
    fieldnames = ['Machine','SizeOfOptionalHeader','Characteristics','MajorLinkerVersion','MinorLinkerVersion','SizeOfCode','SizeOfInitializedData','SizeOfUninitializedData','AddressOfEntryPoint','BaseOfCode','BaseOfData','ImageBase','SectionAlignment','FileAlignment','MajorOperatingSystemVersion','MinorOperatingSystemVersion','MajorImageVersion','MinorImageVersion','MajorSubsystemVersion','MinorSubsystemVersion','SizeOfImage','SizeOfHeaders','CheckSum','Subsystem','DllCharacteristics','SizeOfStackReserve','SizeOfStackCommit','SizeOfHeapReserve','SizeOfHeapCommit','LoaderFlags','NumberOfRvaAndSizes','SectionsNb','SectionsMeanEntropy','SectionsMinEntropy','SectionsMaxEntropy','SectionsMeanRawsize','SectionsMinRawsize','SectionMaxRawsize','SectionsMeanVirtualsize','SectionsMinVirtualsize','SectionMaxVirtualsize','ImportsNbDLL','ImportsNb','ImportsNbOrdinal','ExportNb','ResourcesNb','ResourcesMeanEntropy','ResourcesMinEntropy','ResourcesMaxEntropy','ResourcesMeanSize','ResourcesMinSize','ResourcesMaxSize','LoadConfigurationSize','VersionInformationSize']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    if features:
        writer.writerow(features)
        print(f"Features extracted from {file_path} and written to {output_file}")
    else:
        print(f"Features not extracted from {file_path}")

Features extracted from /home/vishwanath/Desktop/fyp/7z2201-x64.exe and written to pe_fea.csv


In [37]:
data=pd.read_csv('pe_fea.csv')

In [38]:
z=data.values
z

array([[3.32000000e+02, 2.24000000e+02, 3.03000000e+02, 6.00000000e+00,
        0.00000000e+00, 2.66240000e+04, 1.99680000e+04, 0.00000000e+00,
        3.01000000e+04, 4.09600000e+03, 3.27680000e+04, 4.19430400e+06,
        4.09600000e+03, 5.12000000e+02, 4.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 4.00000000e+00, 0.00000000e+00,
        5.73440000e+04, 1.02400000e+03, 0.00000000e+00, 2.00000000e+00,
        2.56000000e+02, 1.04857600e+06, 4.09600000e+03, 1.04857600e+06,
        4.09600000e+03, 0.00000000e+00, 1.60000000e+01, 4.00000000e+00,
        3.91927550e+00, 2.03931352e-02, 6.59275759e+00, 9.08800000e+03,
        5.12000000e+02, 2.66240000e+04, 1.14880000e+04, 4.07200000e+03,
        2.64120000e+04, 6.00000000e+00, 7.70000000e+01, 0.00000000e+00,
        0.00000000e+00, 6.00000000e+00, 2.90770595e+00, 1.39918346e+00,
        5.38456361e+00, 6.04333333e+02, 3.40000000e+01, 1.45800000e+03,
        0.00000000e+00, 0.00000000e+00]])

In [39]:
res=model.predict(z)



In [40]:
res

array([[0., 1.]], dtype=float32)