In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import joblib

In [2]:
# Load the raw dataset from a CSV given by a relative path.
DATA_PATH = 'Malware_Deep.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check the columns that were read.
df.head(n=5)

Unnamed: 0,hash,millisecond,classification,state,usage_counter,prio,static_prio,normal_prio,policy,vm_pgoff,...,nivcsw,min_flt,maj_flt,fs_excl_counter,lock,utime,stime,gtime,cgtime,signal_nvcsw
0,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,0,malware,0,0,3069378560,14274,0,0,0,...,0,0,120,0,3204448256,380690,4,0,0,0
1,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,1,malware,0,0,3069378560,14274,0,0,0,...,0,0,120,0,3204448256,380690,4,0,0,0
2,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,2,malware,0,0,3069378560,14274,0,0,0,...,0,0,120,0,3204448256,380690,4,0,0,0
3,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,3,malware,0,0,3069378560,14274,0,0,0,...,0,0,120,0,3204448256,380690,4,0,0,0
4,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,4,malware,0,0,3069378560,14274,0,0,0,...,0,0,120,0,3204448256,380690,4,0,0,0


In [4]:
# Encode the string class labels as integers: malware -> 1, benign -> 0.
# The explicit cast pins the column to int64 instead of relying on
# Series.replace implicitly downcasting an object column, a behaviour pandas
# has deprecated. It also raises if an unmapped, non-numeric label is still
# present rather than silently leaving strings in the target.
df['classification'] = (
    df['classification']
    .replace({'malware': 1, 'benign': 0})
    .astype('int64')
)

In [5]:
# Derive a numeric column holding the character count of each `hash` entry.
hash_column = df['hash']
df['hash_length'] = hash_column.map(len)

In [6]:
# Show the first 10000 rows; the saved output lists only the leading and
# trailing rows of that slice, which is enough to see both label values.
df.iloc[:10000]

Unnamed: 0,hash,millisecond,classification,state,usage_counter,prio,static_prio,normal_prio,policy,vm_pgoff,...,min_flt,maj_flt,fs_excl_counter,lock,utime,stime,gtime,cgtime,signal_nvcsw,hash_length
0,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,0,1,0,0,3069378560,14274,0,0,0,...,0,120,0,3204448256,380690,4,0,0,0,64
1,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,1,1,0,0,3069378560,14274,0,0,0,...,0,120,0,3204448256,380690,4,0,0,0,64
2,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,2,1,0,0,3069378560,14274,0,0,0,...,0,120,0,3204448256,380690,4,0,0,0,64
3,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,3,1,0,0,3069378560,14274,0,0,0,...,0,120,0,3204448256,380690,4,0,0,0,64
4,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,4,1,0,0,3069378560,14274,0,0,0,...,0,120,0,3204448256,380690,4,0,0,0,64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,com.vbsmojivy.mianzed.apk,995,0,1036288,0,3069939712,30736,0,0,0,...,0,114,8,3204448256,420668,5,4,0,0,25
9996,com.vbsmojivy.mianzed.apk,996,0,0,0,3069939712,30736,0,0,0,...,0,114,8,3204448256,420669,5,4,0,0,25
9997,com.vbsmojivy.mianzed.apk,997,0,0,0,3069939712,30736,0,0,0,...,130,114,8,3204448256,420668,5,4,0,0,25
9998,com.vbsmojivy.mianzed.apk,998,0,0,0,3069939712,30736,0,0,0,...,0,114,8,3204448256,420670,5,4,0,0,25


In [7]:
# Inspect the derived hash-length column as a plain NumPy array.
df['hash_length'].to_numpy()

array([64, 64, 64, ..., 64, 64, 64])

In [8]:
# Feature matrix: hash length and millisecond, in that column order.
# NOTE(review): in the rows displayed earlier, malware entries carry a
# 64-character hash while the benign entries carry a shorter package name,
# so hash_length may be acting as a proxy for the label rather than a
# behavioural signal — confirm before trusting the reported accuracy.
features = df[['hash_length', 'millisecond']]
X = features.to_numpy()
# Target vector: the integer-encoded class label.
y = df['classification'].to_numpy()

In [9]:
# Sanity check: X and y must have the same number of rows before splitting.
X.shape, y.shape

((100000, 2), (100000,))

In [10]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): rows are shuffled individually, so consecutive samples that
# share a hash can land on both sides of the split — check whether a
# group-wise split would be more appropriate.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Row counts of each split piece, in the order X_train, X_test, y_train, y_test.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(70000, 30000, 70000, 30000)

In [12]:
# Element types of the training features, test features and training labels.
tuple(arr.dtype for arr in (X_train, X_test, y_train))

(dtype('int64'), dtype('int64'), dtype('int64'))

In [13]:
# Build a 100-tree random forest with a fixed seed so the fit is repeatable.
model = RandomForestClassifier(random_state=42, n_estimators=100)
# Fit on the training split; fit() returns the estimator, which the notebook
# shows as the cell result.
model.fit(X=X_train, y=y_train)

In [14]:
# Dimensions of the training feature matrix (rows, features).
X_train.shape

(70000, 2)

In [15]:
# Dimensions of the held-out feature matrix (rows, features).
X_test.shape

(30000, 2)

In [16]:
# Predict a class label for every row of the held-out feature matrix.
y_pred = model.predict(X=X_test)

In [17]:
# Confusion matrix of true labels against predicted labels on the test split.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [18]:
# Report the accuracy (two decimals) followed by the confusion matrix, one
# item per line.
print(
    f"Accuracy : {accuracy:.2f}",
    "Confusion Matrix :",
    conf_matrix,
    sep="\n",
)

Accuracy : 1.00
Confusion Matrix :
[[14969     0]
 [    0 15031]]


In [19]:
# Element type of the test features, checked before hand-building an input row.
X_test.dtype

dtype('int64')

In [20]:
# One hand-made observation as a single-row 2-D array:
# columns are (hash_length, millisecond).
sample_input = np.array([[64, 10]])

In [21]:
# Classify the hand-made single-row input with the fitted forest.
single_pred = model.predict(X=sample_input)

In [22]:
# Show the predicted label for sample_input (1 = malware, 0 = benign).
single_pred

array([1])

In [23]:
# Spot check: true label of the held-out row at position 10000.
y_test[10000]

0

In [24]:
# Spot check: features (hash_length, millisecond) of the held-out row at
# position 10000.
X_test[10000]

array([ 29, 994])

In [25]:
# Second hand-made observation as a single-row 2-D array:
# columns are (hash_length, millisecond).
sample_input2 = np.array([[29, 994]])

In [26]:
# Classify the second hand-made single-row input with the fitted forest.
single_pred2 = model.predict(X=sample_input2)

In [27]:
# Show the predicted label for sample_input2 (1 = malware, 0 = benign).
single_pred2

array([0])

In [28]:
# Persist the fitted forest to disk; dump() returns the list of files written,
# which the notebook shows as the cell result.
joblib.dump(value=model, filename='malware_model.pkl')

['malware_model.pkl']

In [29]:
# Read the persisted forest back from disk.
# NOTE(review): joblib files are pickle-based, so only load files from a
# trusted source.
loaded_model = joblib.load(filename='malware_model.pkl')

In [30]:
# Predict with the model reloaded from disk so this cell actually verifies the
# save/load round trip. The original called the in-memory `model`, so the
# reloaded one was never used. The result should match single_pred2, the
# in-memory prediction for the same input.
loaded_model_result = loaded_model.predict(sample_input2)

In [31]:
# Show the label stored in loaded_model_result (1 = malware, 0 = benign).
loaded_model_result

array([0])