In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import pickle
import os
from  pdfid_v0_2_8 import pdfid
from  pdf_parser_V0_7_8 import pdf_parser
from  xml.dom import minidom
import xmltodict
import json
import subprocess


In [71]:
#Load dataset as pandas dataframe and remove any rows containing null values
df = pd.read_csv("Dataset/PDFMalware.csv")
df.dropna(how = 'any' , inplace = True)

#Get column names
print(df.columns)

Index(['Fine name', 'pdfsize', 'metadata size', 'pages', 'xref Length',
       'title characters', 'isEncrypted', 'embedded files', 'images', 'text',
       'header', 'obj', 'endobj', 'stream', 'endstream', 'xref', 'trailer',
       'startxref', 'pageno', 'encrypt', 'ObjStm', 'JS', 'Javascript', 'AA',
       'OpenAction', 'Acroform', 'JBIG2Decode', 'RichMedia', 'launch',
       'EmbeddedFile', 'XFA', 'Colors', 'Class'],
      dtype='object')


In [72]:
#Extract required features
df_features = df[["encrypt","obj","ObjStm","startxref","JBIG2Decode","Acroform","stream","Javascript","JS","OpenAction","RichMedia","launch","EmbeddedFile","XFA","AA"]]
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10023 entries, 0 to 10025
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   encrypt       10023 non-null  float64
 1   obj           10023 non-null  object 
 2   ObjStm        10023 non-null  float64
 3   startxref     10023 non-null  object 
 4   JBIG2Decode   10023 non-null  object 
 5   Acroform      10023 non-null  object 
 6   stream        10023 non-null  float64
 7   Javascript    10023 non-null  object 
 8   JS            10023 non-null  object 
 9   OpenAction    10023 non-null  object 
 10  RichMedia     10023 non-null  object 
 11  launch        10023 non-null  object 
 12  EmbeddedFile  10023 non-null  object 
 13  XFA           10023 non-null  object 
 14  AA            10023 non-null  object 
dtypes: float64(3), object(12)
memory usage: 1.2+ MB


In [73]:

#Use label encoding to encode object column to int as this is required for training the model
label_encoder = preprocessing.LabelEncoder() 

#features which need to be encoded
feature_list = ["obj","startxref","JBIG2Decode","Acroform","Javascript","JS","OpenAction","RichMedia","launch","EmbeddedFile","XFA","AA"]

#loop over features , encode them to int and store encoder models for future use
#Encoded values are stored in a new field (field_coded)
encoder_list = []
for feature_name in feature_list:
    test_feature_name = feature_name + "_coded"
    encoder = label_encoder.fit(df[feature_name]) 
    with open("Models/" + test_feature_name,'wb') as f:
        pickle.dump(encoder,f)
    df_features[test_feature_name] = label_encoder.transform(df[feature_name]) 

#Remove original un-encoded feature
df_features_transformed = df_features[['encrypt','stream','obj_coded',"ObjStm",'startxref_coded','JBIG2Decode_coded',"Acroform_coded",'Javascript_coded', 
                                 'JS_coded','OpenAction_coded','RichMedia_coded','launch_coded','EmbeddedFile_coded','XFA_coded','AA_coded']]


df_features_transformed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10023 entries, 0 to 10025
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   encrypt             10023 non-null  float64
 1   stream              10023 non-null  float64
 2   obj_coded           10023 non-null  int32  
 3   ObjStm              10023 non-null  float64
 4   startxref_coded     10023 non-null  int32  
 5   JBIG2Decode_coded   10023 non-null  int32  
 6   Acroform_coded      10023 non-null  int32  
 7   Javascript_coded    10023 non-null  int32  
 8   JS_coded            10023 non-null  int32  
 9   OpenAction_coded    10023 non-null  int32  
 10  RichMedia_coded     10023 non-null  int32  
 11  launch_coded        10023 non-null  int32  
 12  EmbeddedFile_coded  10023 non-null  int32  
 13  XFA_coded           10023 non-null  int32  
 14  AA_coded            10023 non-null  int32  
dtypes: float64(3), int32(12)
memory usage: 783.0 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features[test_feature_name] = label_encoder.transform(df[feature_name])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features[test_feature_name] = label_encoder.transform(df[feature_name])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features[test_feature_name] = label_encoder.transfor

In [74]:

df_features_transformed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10023 entries, 0 to 10025
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   encrypt             10023 non-null  float64
 1   stream              10023 non-null  float64
 2   obj_coded           10023 non-null  int32  
 3   ObjStm              10023 non-null  float64
 4   startxref_coded     10023 non-null  int32  
 5   JBIG2Decode_coded   10023 non-null  int32  
 6   Acroform_coded      10023 non-null  int32  
 7   Javascript_coded    10023 non-null  int32  
 8   JS_coded            10023 non-null  int32  
 9   OpenAction_coded    10023 non-null  int32  
 10  RichMedia_coded     10023 non-null  int32  
 11  launch_coded        10023 non-null  int32  
 12  EmbeddedFile_coded  10023 non-null  int32  
 13  XFA_coded           10023 non-null  int32  
 14  AA_coded            10023 non-null  int32  
dtypes: float64(3), int32(12)
memory usage: 783.0 KB


In [75]:
#Extract target column for source dataset
df_result = df["Class"]

#Convert target column value to int
target_series = pd.Series(df_result, dtype="category")
df_result_transformed = target_series.cat.codes

df_result.value_counts()


Malicious    5555
Benign       4468
Name: Class, dtype: int64

In [76]:
#split data in to 80% training data and 20 % test data
feature_train, feature_test, result_train, result_test = train_test_split(df_features_transformed, df_result_transformed, test_size=0.20, random_state=42)
print(f'feature_train : {feature_train.shape}')
print(f'feature_test : {feature_test.shape}')
print(f'result_train : {result_train.shape}')
print(f'result_test : {result_test.shape}')

feature_train : (8018, 15)
feature_test : (2005, 15)
result_train : (8018,)
result_test : (2005,)


In [77]:
#Generate Hyperparameter lists for tuning

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt', None]
# Maximum number of levels in tree
max_depth = [5,8,11,14]
# Minimum number of samples required to split a node
min_samples_split = [2,3,4,5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [False]

param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200], 'max_features': ['log2', 'sqrt', None], 'max_depth': [5, 8, 11, 14], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [False]}


In [78]:
#Gyper 
rfc_Model = RandomForestClassifier()
rfc_Grid = GridSearchCV(estimator = rfc_Model, param_grid = param_grid, cv = 3, verbose=2, n_jobs = 4)
rfc_Grid.fit(feature_train , result_train )
rfc_Grid.best_params_


Fitting 3 folds for each of 960 candidates, totalling 2880 fits


{'bootstrap': False,
 'max_depth': 14,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 157}

In [79]:
rfc_Grid.score(feature_train,result_train)

0.986779745572462

In [80]:
rfc_Grid.score(feature_test,result_test)

0.9785536159600997

In [81]:


classifier= RandomForestClassifier(bootstrap= rfc_Grid.best_params_["bootstrap"],
 max_depth= rfc_Grid.best_params_["max_depth"],
 max_features= rfc_Grid.best_params_["max_features"],
 min_samples_leaf= rfc_Grid.best_params_["min_samples_leaf"],
 min_samples_split= rfc_Grid.best_params_["min_samples_split"],
 n_estimators= rfc_Grid.best_params_["n_estimators"])  
pdf_checker_RT = classifier.fit(feature_train,result_train)  


In [82]:
with open("Models/pdf_checker_RT",'wb') as f:
    pickle.dump(pdf_checker_RT,f)

In [83]:


pdf_file = "basic-link-1.pdf"
result = pdfid.PDFiD(pdf_file)
data_dict = pdfid.PDFiD2JSON(result,True)
json_dict = json.loads(data_dict)

command = f'python ./pdf_parser_V0_7_8/pdf_parser.py -k /URI -O ./{pdf_file}'

p = subprocess.check_output(command, stderr=subprocess.PIPE, shell=True)

uri_list = p.decode().split("\n")
print(uri_list)
if(uri_list[0] == ''):
    uri_number = 0
else:
    uri_number = len(uri_list)
print(uri_number)   




data_list = json_dict[0]["pdfid"]["keywords"]["keyword"]
data_dict_master = {}
for data in data_list:
    data_dict = {data["name"].replace("/",""):[data["count"]]}
    data_dict_master = data_dict_master | data_dict
print(data_dict_master)

df = pd.DataFrame.from_dict(data_dict_master)

df_input = df[["Encrypt","stream","obj","ObjStm","startxref","JBIG2Decode","AcroForm","JavaScript","JS","OpenAction","RichMedia","Launch","EmbeddedFile","XFA","AA"]]

df_input.rename(columns = {'Encrypt':'encrypt','JavaScript':'Javascript','AcroForm':'Acroform','Launch':'launch'}, inplace = True)

feature_list = ["xref","obj","startxref","JBIG2Decode","Acroform","Javascript","JS","OpenAction","RichMedia","launch","EmbeddedFile","XFA","AA"]


for feature_name in feature_list:
    test_feature_name = feature_name + "_coded"
   
    with open("Models/" + test_feature_name,'rb') as f:
        model = pickle.load(f)
    try:
        df_input[test_feature_name] = model.transform( [str(df_input[feature_name][0])] )
    except Exception as e:
        list(model.classes_)
        #print(e)
        continue

df_input_predict = df_input[['encrypt','stream','obj_coded','ObjStm','startxref_coded','JBIG2Decode_coded','Acroform_coded','Javascript_coded', 'JS_coded','OpenAction_coded','RichMedia_coded','launch_coded','EmbeddedFile_coded','XFA_coded','AA_coded']]

['  /URI (https://www.antennahouse.com/)\r', '  /URI (https://www.antennahouse.com/)\r', '  /URI (https://www.antennahouse.com/)\r', '  /URI (https://www.antennahouse.com/)\r', '']
5
{'obj': [45], 'endobj': [45], 'stream': [22], 'endstream': [22], 'xref': [0], 'trailer': [0], 'startxref': [1], 'Page': [2], 'Encrypt': [0], 'ObjStm': [6], 'JS': [0], 'JavaScript': [0], 'AA': [0], 'OpenAction': [0], 'AcroForm': [0], 'JBIG2Decode': [0], 'RichMedia': [0], 'Launch': [0], 'EmbeddedFile': [0], 'XFA': [0], 'Colors > 2^24': [0]}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input.rename(columns = {'Encrypt':'encrypt','JavaScript':'Javascript','AcroForm':'Acroform','Launch':'launch'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input[test_feature_name] = model.transform( [str(df_input[feature_name][0])] )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input[test_feature_name] = model.transform( [s

In [84]:
with open("Models/pdf_checker_RT",'rb') as f:
    model = pickle.load(f)
result = model.predict(df_input_predict)
print(result)

[0]
