In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn import naive_bayes, neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)



In [None]:
path_file = 'Android_Malware_Benign.csv'
dataset = pd.read_csv(path_file)
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [92]:
# print(dataset.head(5))
# print(dataset.info())
# print(test)


def preprocess(dataset):
    dataset_prepprocess = dataset.copy()

    for column in dataset.columns:
        if dataset[column].nunique()== 1:
            # print(column)
            dataset_prepprocess.drop(column, axis=1, inplace=True)

    X = dataset_prepprocess.drop('Label', axis=1)

    y = dataset_prepprocess['Label']

    print(y.value_counts())
    y = y.replace('Malware', 1)
    y = y.replace('Benign', 0)

    # split in train, test and validation in 60 20 20

    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

    return X_train, X_val, X_test, y_train, y_val, y_test, X_train_val, y_train_val

In [93]:
X_train, X_val, X_test, y_train, y_val, y_test, X_train_val, y_train_val = preprocess(dataset)

Label
Malware    2533
Benign     1931
Name: count, dtype: int64


In [None]:
# Naive Bayes classifier

for model in [naive_bayes.BernoulliNB(), naive_bayes.MultinomialNB()]:
    print(model)

    cv_scores_f1 = cross_val_score(model, X_train_val, y_train_val, cv=10, scoring='f1_macro')
    cv_scores_accuracy = cross_val_score(model, X_train_val, y_train_val, cv=10, scoring='accuracy')
    print('Cross-Validation f1_score:', np.mean(cv_scores_f1))
    print('Cross-Validation Scores f1:', cv_scores_f1)
    print('\n')
    print('Cross-Validation accuracy:', np.mean(cv_scores_accuracy))
    print('Cross-Validation Scores acc:', cv_scores_accuracy)


BernoulliNB()
Cross-Validation f1_score: 0.9176573040503962
Cross-Validation Scores f1: [0.90264581 0.89632808 0.90571353 0.94018335 0.91009237 0.94240449
 0.92246185 0.90571353 0.90523115 0.94579887]


Cross-Validation accuracy: 0.9196344459571538
Cross-Validation Scores acc: [0.90502793 0.89915966 0.90756303 0.94117647 0.91316527 0.94397759
 0.92436975 0.90756303 0.90756303 0.94677871]
MultinomialNB()
Cross-Validation f1_score: 0.8691803381247499
Cross-Validation Scores f1: [0.88911711 0.83223684 0.85757785 0.89423963 0.85386575 0.88218599
 0.88864263 0.85658301 0.86138671 0.87596784]


Cross-Validation accuracy: 0.8756599846642569
Cross-Validation Scores acc: [0.89385475 0.84313725 0.8627451  0.89915966 0.8627451  0.88795518
 0.89355742 0.8627451  0.86834734 0.88235294]


In [None]:
# test

model = naive_bayes.BernoulliNB() # best one in validation

model.fit(X_train_val, y_train_val)
y_pred = model.predict(X_test)
print('Accuracy:')
print(accuracy_score(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))


Accuracy:
0.9171332586786114
Confusion Matrix:
[[358  44]
 [ 30 461]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.91       402
           1       0.91      0.94      0.93       491

    accuracy                           0.92       893
   macro avg       0.92      0.91      0.92       893
weighted avg       0.92      0.92      0.92       893



In [None]:
# random forest

results = []

for criterion in ['gini', 'entropy', 'log_loss']:
    for max_depth in [None, 10, 20, 30]:
        for n_estimators in [10, 50, 100, 200]:
            
            model = RandomForestClassifier(criterion=criterion, max_depth=max_depth, n_estimators=n_estimators)
            
            cv_scores_f1 = cross_val_score(model, X_train_val, y_train_val, cv=10, scoring='f1_macro')
            cv_scores_accuracy = cross_val_score(model, X_train_val, y_train_val, cv=10, scoring='accuracy')
            
            mean_f1 = np.mean(cv_scores_f1)
            mean_accuracy = np.mean(cv_scores_accuracy)
            
            results.append({
                'criterion': criterion,
                'max_depth': max_depth,
                'n_estimators': n_estimators,
                'mean_f1_score': mean_f1,
                'f1_scores': cv_scores_f1,
                'mean_accuracy': mean_accuracy,
                'accuracy_scores': cv_scores_accuracy
            })

results_df = pd.DataFrame(results)

print(results_df)

   criterion  max_depth  n_estimators  mean_f1_score  \
0       gini        NaN            10       0.951492   
1       gini        NaN            50       0.953152   
2       gini        NaN           100       0.953420   
3       gini        NaN           200       0.954293   
4       gini       10.0            10       0.945541   
5       gini       10.0            50       0.954666   
6       gini       10.0           100       0.956905   
7       gini       10.0           200       0.956351   
8       gini       20.0            10       0.952354   
9       gini       20.0            50       0.954882   
10      gini       20.0           100       0.955453   
11      gini       20.0           200       0.956600   
12      gini       30.0            10       0.947006   
13      gini       30.0            50       0.950353   
14      gini       30.0           100       0.954003   
15      gini       30.0           200       0.954565   
16   entropy        NaN            10       0.94

In [None]:
# Top 10 results based on mean F1 score
top_f1 = results_df.sort_values(by='mean_f1_score', ascending=False).head(10)
print("Top 10 results based on F1 score:")
print(top_f1[['criterion', 'max_depth', 'n_estimators', 'mean_f1_score']])

# Top 10 results based on mean accuracy
top_accuracy = results_df.sort_values(by='mean_accuracy', ascending=False).head(10)
print("\nTop 10 results based on Accuracy:")
print(top_accuracy[['criterion', 'max_depth', 'n_estimators', 'mean_accuracy']])


Top 10 results based on F1 score:
   criterion  max_depth  n_estimators  mean_f1_score
39  log_loss       10.0           200       0.957785
6       gini       10.0           100       0.956905
11      gini       20.0           200       0.956600
26   entropy       20.0           100       0.956594
38  log_loss       10.0           100       0.956355
7       gini       10.0           200       0.956351
22   entropy       10.0           100       0.956347
27   entropy       20.0           200       0.956026
25   entropy       20.0            50       0.956026
43  log_loss       20.0           200       0.956014

Top 10 results based on Accuracy:
   criterion  max_depth  n_estimators  mean_accuracy
27   entropy       20.0           200       0.959115
23   entropy       10.0           200       0.958557
42  log_loss       20.0           100       0.958555
6       gini       10.0           100       0.957996
11      gini       20.0           200       0.957996
41  log_loss       20.0       

In [None]:
best_configs = [
    {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 200},
    {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100},
    {'criterion': 'gini', 'max_depth': 20, 'n_estimators': 200}
]

# Test results
test_results = []

for config in best_configs:
    model = RandomForestClassifier(
        criterion=config['criterion'],
        max_depth=config['max_depth'],
        n_estimators=config['n_estimators']
    )
    model.fit(X_train_val, y_train_val) 
    
    y_test_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred, average='macro')
    class_report = classification_report(y_test, y_test_pred)
    conf_matrix = confusion_matrix(y_test, y_test_pred)
    
    test_results.append({
        'config': config,
        'accuracy': accuracy,
        'f1_score': f1,
        'classification_report': class_report,
        'confusion_matrix': conf_matrix
    })

for result in test_results:
    print(f"Configuration: {result['config']}")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print(f"F1 Score: {result['f1_score']:.4f}")
    print("Classification Report:\n", result['classification_report'])
    print("Confusion Matrix:\n", result['confusion_matrix'])
    print("\n" + "-"*50 + "\n")

Configuration: {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 200}
Accuracy: 0.9552
F1 Score: 0.9546
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       402
           1       0.95      0.97      0.96       491

    accuracy                           0.96       893
   macro avg       0.96      0.95      0.95       893
weighted avg       0.96      0.96      0.96       893

Confusion Matrix:
 [[376  26]
 [ 14 477]]

--------------------------------------------------

Configuration: {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100}
Accuracy: 0.9574
F1 Score: 0.9570
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95       402
           1       0.96      0.96      0.96       491

    accuracy                           0.96       893
   macro avg       0.96      0.96      0.96       893
weighted avg       0.96      0.96    

In [129]:
# print 10 best features names for the best model config

model = RandomForestClassifier(criterion='gini', max_depth=10, n_estimators=100)
model.fit(X_train_val, y_train_val)

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

print("Top 10 features:")
for i in range(30):
    print(f"{i+1}. {X_train_val.columns[indices[i]]} ({importances[indices[i]]})")


Top 10 features:
1. android.permission.READ_PHONE_STATE (0.23729097987323172)
2. com.google.android.c2dm.permission.RECEIVE (0.08394321672690427)
3. android.permission.RECEIVE_BOOT_COMPLETED (0.08260028636978257)
4. com.android.launcher.permission.INSTALL_SHORTCUT (0.058281860769018916)
5. android.permission.ACCESS_FINE_LOCATION (0.04676318566613568)
6. android.permission.ACCESS_COARSE_LOCATION (0.034635535410131195)
7. Ljava/net/URL;->openConnection (0.026789246906548925)
8. android.permission.WAKE_LOCK (0.02675007125228695)
9. RECEIVE_BOOT_COMPLETED (0.026501829666032872)
10. Landroid/location/LocationManager;->getLastKgoodwarewnLocation (0.024950426576223716)
11. android.permission.SEND_SMS (0.01857871894096701)
12. android.permission.READ_SMS (0.017296099112805427)
13. GET_TASKS (0.01679634326463875)
14. android.permission.READ_EXTERNAL_STORAGE (0.013566741130133113)
15. android.permission.CAMERA (0.012978117642152745)
16. android.permission.RECEIVE_SMS (0.012969360214460473)
17. a

In [130]:
# decision tree

dt_results_list = []

for criterion in ['gini', 'entropy']:
    for max_depth in [None, 10, 20, 30, 100, 200]:  
        model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth)
        
        cv_scores_f1 = cross_val_score(model, X_train_val, y_train_val, cv=10, scoring='f1_macro')
        cv_scores_accuracy = cross_val_score(model, X_train_val, y_train_val, cv=10, scoring='accuracy')
        
        mean_f1 = np.mean(cv_scores_f1)
        mean_accuracy = np.mean(cv_scores_accuracy)
        
        dt_results_list.append({
            'criterion': criterion,
            'max_depth': max_depth,
            'mean_f1_score': mean_f1,
            'mean_accuracy': mean_accuracy
        })

dt_results_df = pd.DataFrame(dt_results_list)

# print(dt_results_df)

print("\nTop 10 results by F1 score:")
print(dt_results_df.sort_values(by='mean_f1_score', ascending=False).head(10))

print("\nTop 10 results by Accuracy:")
print(dt_results_df.sort_values(by='mean_accuracy', ascending=False).head(10))



Top 10 results by F1 score:
   criterion  max_depth  mean_f1_score  mean_accuracy
7    entropy       10.0       0.942543       0.943715
1       gini       10.0       0.938012       0.940633
6    entropy        NaN       0.936443       0.935871
10   entropy      100.0       0.936157       0.935311
8    entropy       20.0       0.934177       0.940072
11   entropy      200.0       0.933856       0.935032
9    entropy       30.0       0.933315       0.936152
4       gini      100.0       0.931727       0.932236
2       gini       20.0       0.930845       0.934753
0       gini        NaN       0.930583       0.933914

Top 10 results by Accuracy:
   criterion  max_depth  mean_f1_score  mean_accuracy
7    entropy       10.0       0.942543       0.943715
1       gini       10.0       0.938012       0.940633
8    entropy       20.0       0.934177       0.940072
9    entropy       30.0       0.933315       0.936152
6    entropy        NaN       0.936443       0.935871
10   entropy      100.0 

In [None]:
# svm 

svm_results_list = []

for kernel in ['linear', 'rbf', 'poly']:
    for C in [0.1, 1, 10, 100]:  
        for gamma in ['scale', 'auto', 0.01, 0.1, 1]:
            if kernel == 'linear' and gamma != 'scale':
                continue  # Gamma not used in linear kernel

            model = SVC(kernel=kernel, C=C, gamma=gamma)
            
            cv_scores_f1 = cross_val_score(model, X_train_val, y_train_val, cv=10, scoring='f1_macro')
            cv_scores_accuracy = cross_val_score(model, X_train_val, y_train_val, cv=10, scoring='accuracy')
            
            mean_f1 = np.mean(cv_scores_f1)
            mean_accuracy = np.mean(cv_scores_accuracy)
            
            svm_results_list.append({
                'kernel': kernel,
                'C': C,
                'gamma': gamma,
                'mean_f1_score': mean_f1,
                'mean_accuracy': mean_accuracy,
            })

svm_results_df = pd.DataFrame(svm_results_list)

# Display the top 10 configurations by F1 score and accuracy
print("\nTop 5 results by F1 score:")
print(svm_results_df.sort_values(by='mean_f1_score', ascending=False).head(10))

print("\nTop 5 results by Accuracy:")
print(svm_results_df.sort_values(by='mean_accuracy', ascending=False).head(10))



Top 5 results by F1 score:
    kernel      C  gamma  mean_f1_score  mean_accuracy
16     rbf   10.0   0.01       0.959199       0.959955
14     rbf   10.0  scale       0.959131       0.959956
20     rbf  100.0   auto       0.958577       0.959395
0   linear    0.1  scale       0.956929       0.957713
21     rbf  100.0   0.01       0.956573       0.957435
15     rbf   10.0   auto       0.955738       0.956593
1   linear    1.0  scale       0.955431       0.956317
2   linear   10.0  scale       0.955026       0.956038
11     rbf    1.0   0.01       0.954564       0.955474
9      rbf    1.0  scale       0.954331       0.955193

Top 5 results by Accuracy:
    kernel      C  gamma  mean_f1_score  mean_accuracy
14     rbf   10.0  scale       0.959131       0.959956
16     rbf   10.0   0.01       0.959199       0.959955
20     rbf  100.0   auto       0.958577       0.959395
0   linear    0.1  scale       0.956929       0.957713
21     rbf  100.0   0.01       0.956573       0.957435
15     rb

In [None]:

# Top three configs 
top_parameters = [
    {'kernel': 'rbf', 'C': 10.0, 'gamma': 0.01},
    {'kernel': 'rbf', 'C': 10.0, 'gamma': 'scale'},
    {'kernel': 'rbf', 'C': 100.0, 'gamma': 'auto'}
]

for params in top_parameters:
    model = SVC(kernel=params['kernel'], C=params['C'], gamma=params['gamma'], class_weight='balanced')
    
    model.fit(X_train_val, y_train_val)
    
    y_pred = model.predict(X_test)
    
    test_f1 = f1_score(y_test, y_pred, average='macro')
    test_accuracy = accuracy_score(y_test, y_pred)
    
    print(f"Parameters: {params}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}\n")


Parameters: {'kernel': 'rbf', 'C': 10.0, 'gamma': 0.01}
Test F1 Score: 0.9638
Test Accuracy: 0.9642

Parameters: {'kernel': 'rbf', 'C': 10.0, 'gamma': 'scale'}
Test F1 Score: 0.9649
Test Accuracy: 0.9653

Parameters: {'kernel': 'rbf', 'C': 100.0, 'gamma': 'auto'}
Test F1 Score: 0.9615
Test Accuracy: 0.9619

