# Comparison of TPR and FPR among three packer detection tools (PyPackerDetect, Manalyze, and pypeid)

In [1]:
from utils import *
from sklearn.metrics import confusion_matrix

In [2]:
# Load each tool's results for the "packed" data set (per the file names),
# post-processing every record with the tool-specific handler.
manalyze_result_packed = load_nested_json(
    "manalyze_result_packingdata_packed.jsonl",
    process_manalyze_result,
)
pypacker_result_packed = load_nested_json(
    "pypacker_result_packingdata_packed.jsonl",
    process_pypacker,
)
pypeid_result_packed = load_nested_json(
    "pypeid_result_packingdata_packed.jsonl",
    process_peid,
)

In [3]:
# Join the three per-tool results on the sample name (default inner join):
# first Manalyze with PyPackerDetect, then the pypeid results on top.
df_packed = pd.merge(manalyze_result_packed, pypacker_result_packed, on="name")
df_packed = pd.merge(df_packed, pypeid_result_packed, on="name")

In [4]:
# Load each tool's results for the "nonpacked" data set (per the file names),
# post-processing every record with the tool-specific handler.
manalyze_result_nonpacked = load_nested_json(
    "manalyze_result_packingdata_nonpacked.jsonl",
    process_manalyze_result,
)
pypacker_result_nonpacked = load_nested_json(
    "pypacker_result_packingdata_nonpacked.jsonl",
    process_pypacker,
)
pypeid_result_nonpacked = load_nested_json(
    "pypeid_result_packingdata_nonpacked.jsonl",
    process_peid,
)

In [5]:
# Join the three per-tool results on the sample name (default inner join):
# first Manalyze with PyPackerDetect, then the pypeid results on top.
df_nonpacked = pd.merge(manalyze_result_nonpacked, pypacker_result_nonpacked, on="name")
df_nonpacked = pd.merge(df_nonpacked, pypeid_result_nonpacked, on="name")
# Preview the first rows of the merged frame.
df_nonpacked.head()

Unnamed: 0,name,manalyze_result_level,manalyze_result_output,manalyze_result_summary,pypacker_suspicions,pypacker_detections,peid_packed,peid_PEiD
0,/home/ffri/WorkDir/FFRIDataset/PackerDetectorC...,,,,[],[],False,"[Armadillo_v171, Microsoft_Visual_Cpp_v60, Mic..."
1,/home/ffri/WorkDir/FFRIDataset/PackerDetectorC...,,,,[],[],False,"[Armadillo_v171, Microsoft_Visual_Cpp_v60, Mic..."
2,/home/ffri/WorkDir/FFRIDataset/PackerDetectorC...,,,,[],[],False,"[Armadillo_v171, Microsoft_Visual_Cpp_v60, Mic..."
3,/home/ffri/WorkDir/FFRIDataset/PackerDetectorC...,,,,[],[],False,"[VC8_Microsoft_Corporation, Microsoft_Visual_C..."
4,/home/ffri/WorkDir/FFRIDataset/PackerDetectorC...,,,,[],[],False,[]


In [6]:
# Run the helper on both merged frames, non-packed first; the later cells read
# the "pypacker_reason" / "manalyze_reason" columns, which it presumably adds.
for frame in (df_nonpacked, df_packed):
    make_detection_reasons_columns(frame)

In [7]:
# A sample counts as "detected as packed" when the tool gave at least one reason.
df_nonpacked["pypacker_packed"] = df_nonpacked["pypacker_reason"].apply(
    lambda reasons: len(reasons) > 0
)
# For Manalyze, a falsy reason value is treated as "no detection" before len() is consulted.
df_nonpacked["manalyze_packed"] = df_nonpacked["manalyze_reason"].apply(
    lambda reasons: len(reasons) > 0 if reasons else False
)

In [8]:
# A sample counts as "detected as packed" when the tool gave at least one reason.
df_packed["pypacker_packed"] = df_packed["pypacker_reason"].apply(
    lambda reasons: len(reasons) > 0
)
# For Manalyze, a falsy reason value is treated as "no detection" before len() is consulted.
df_packed["manalyze_packed"] = df_packed["manalyze_reason"].apply(
    lambda reasons: len(reasons) > 0 if reasons else False
)

In [9]:
def is_packed(flags):
    """Decide whether Manalyze flagged a sample for a reason other than an unusual section name.

    Args:
        flags: Iterable of ``ManalyzeDetectionReason`` values for one sample,
            or ``None`` when no reasons are available.

    Returns:
        ``True`` if at least one flag is ``KNOWN_SECTION_NAME``,
        ``BROKEN_RESOURCE``, ``BROKEN_RITCH_HEADER`` or ``SUMMARY_PACKED_KNOWN``;
        ``False`` otherwise, including for ``None`` or an empty iterable.
    """
    # A missing reason list means "no detection". This mirrors the falsy guard
    # used when the "manalyze_packed" column is derived, instead of raising
    # TypeError when iterating over None.
    if flags is None:
        return False
    return any(
        flag in (
            ManalyzeDetectionReason.KNOWN_SECTION_NAME,
            ManalyzeDetectionReason.BROKEN_RESOURCE,
            ManalyzeDetectionReason.BROKEN_RITCH_HEADER,
            ManalyzeDetectionReason.SUMMARY_PACKED_KNOWN,
        )
        for flag in flags
    )

# Re-derive the Manalyze verdict for both frames using is_packed, which does
# not count the UNUSUAL_SECTION_NAME reason.
df_packed["manalyze_packed_without_unusual_sect"] = df_packed["manalyze_reason"].apply(is_packed)
df_nonpacked["manalyze_packed_without_unusual_sect"] = df_nonpacked["manalyze_reason"].apply(is_packed)

In [10]:
# FPR: percentage of rows in df_nonpacked that each tool flagged as packed.
for label, column in (
    ("FPR (PyPackerDetect)", "pypacker_packed"),
    ("FPR (Manalyze) ", "manalyze_packed"),
    ("FPR (pypeid)", "peid_packed"),
):
    flagged = df_nonpacked[column]
    print(label, flagged.sum() / len(flagged) * 100.)

FPR (PyPackerDetect) 2.2271714922048997
FPR (Manalyze)  40.97995545657015
FPR (pypeid) 5.56792873051225


In [11]:
# TPR: percentage of rows in df_packed that each tool flagged as packed.
for label, column in (
    ("TPR (PyPackerDetect)", "pypacker_packed"),
    ("TPR (Manalyze) ", "manalyze_packed"),
    ("TPR (pypeid)", "peid_packed"),
):
    flagged = df_packed[column]
    print(label, flagged.sum() / len(flagged) * 100.)

TPR (PyPackerDetect) 94.76110645431685
TPR (Manalyze)  95.09639564124058
TPR (pypeid) 84.87007544006705


## Result
- Recall (TPR) is almost the same for PyPackerDetect and Manalyze: about 95% (94.8% and 95.1%, respectively).
- On the other hand, the FPR is much lower with PyPackerDetect (about 2.2%) than with Manalyze (about 41.0%).

In [12]:
from collections import defaultdict

def flatten_list(l):
    """Concatenate the elements of every iterable in ``l`` into one new list.

    Only one level of nesting is removed; the order of elements is preserved.
    """
    return [element for group in l for element in group]

def calc_hist(l):
    """Count how many times each value occurs in ``l``.

    Returns a ``defaultdict(int)`` mapping each distinct value to its number of
    occurrences, with keys in first-seen order.
    """
    counts = defaultdict(int)
    for value in l:
        counts[value] = counts.get(value, 0) + 1
    return counts

In [13]:
# Histogram of PyPackerDetect detection reasons over the false positives
# (rows of df_nonpacked that PyPackerDetect flagged as packed).
calc_hist(
    flatten_list(
        df_nonpacked.loc[df_nonpacked["pypacker_packed"], "pypacker_reason"].tolist()
    )
)

defaultdict(int,
            {<PyPackerDetectionReason.BAD_ENTRY_POINT: 4>: 5,
             <PyPackerDetectionReason.NONSTANDARD_SECTIONNAME: 2>: 1,
             <PyPackerDetectionReason.SECTION_NAME_IS_KNOWN: 3>: 9})

In [14]:
# Histogram of Manalyze detection reasons over the false positives
# (rows of df_nonpacked that Manalyze flagged as packed).
calc_hist(
    flatten_list(
        df_nonpacked.loc[df_nonpacked["manalyze_packed"], "manalyze_reason"].tolist()
    )
)

defaultdict(int,
            {<ManalyzeDetectionReason.UNUSUAL_SECTION_NAME: 1>: 190,
             <ManalyzeDetectionReason.SUMMARY_PACKED_KNOWN: 7>: 11,
             <ManalyzeDetectionReason.BROKEN_RITCH_HEADER: 5>: 5})

## Analysis of detection reasons of two packer detection tools for FP samples
- FPs in both PyPacker and Manalyze are mainly caused by "unusual section names".
  - About 98% of FPs in Manalyze
- FP can be reduced by excluding cases where the reason is "unusual section names".
  - FP can be reduced to about 1.1% for both Manalyze and PyPackerDetect.
  - But, note that excluding "unusual section names" rule could reduce the TPR.

In [15]:
# FPR and TPR for Manalyze when the "unusual section names" detection reason is excluded
for label, frame in (
    ("FPR (Manalyze, 'unusual section names' reason is excluded)", df_nonpacked),
    ("TPR (Manalyze, 'unusual section names' reason is excluded)", df_packed),
):
    flagged = frame["manalyze_packed_without_unusual_sect"]
    print(label, flagged.sum() / len(flagged) * 100.)

FPR (Manalyze, 'unusual section names' reason is excluded) 2.4498886414253898
TPR (Manalyze, 'unusual section names' reason is excluded) 47.02430846605197


- When the "unusual section names" reason is excluded from Manalyze, the FPR becomes very small (about 2.4%), but the TPR drops to about 47%.
- PyPackerDetect has a larger coverage of the "unusual section names" rule compared with Manalyze.