In [1]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest
import numpy as np

# Load CSV files with 'after_reboot' in the filename
path = r'C:\Users\USER\Documents\NTUST\Conference_Workshop_Seminar\Android\Dataset\AndMal2020-dynamic-BeforeAndAfterReboot\Cleaned_Files\normalized_dataset'
files = glob.glob(path + '/*after_reboot*.csv')

dataframes = []
labels = []

for file in files:
    df = pd.read_csv(file)
    features = df.drop(columns=['Category', 'Family'])
    dataframes.append(features)
    labels.append(df['Category'])

# Combine all feature dataframes
X = pd.concat(dataframes, ignore_index=True)
y = pd.concat(labels, ignore_index=True)

# Print the number of features in the original dataset
num_features = X.shape[1]
print(f'Number of features in the original dataset: {num_features}')

# Impute missing values first
imp = SimpleImputer(strategy='mean')
X_imputed = imp.fit_transform(X)

# Convert the imputed array back to a DataFrame for feature selection
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

### Function to Evaluate Random Forest Performance ###
def evaluate_random_forest(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Apply PCA
    pca = PCA(n_components=0.95)  # Retain 95% of variance
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    
    model = RandomForestClassifier()  # Changed to Random Forest
    model.fit(X_train_pca, y_train)
    predictions = model.predict(X_test_pca)
    return accuracy_score(y_test, predictions)

### Feature Extraction Algorithms ###
def ga_pso(X, y):
    # Placeholder for GA-PSO feature selection logic
    selected_features = X.sample(n=90, axis=1)  # Randomly select 90 features for demonstration
    fitness = evaluate_random_forest(selected_features, y)
    return selected_features, fitness

def aco_sa(X, y):
    # Placeholder for ACO-SA feature selection logic
    selected_features = X.iloc[:, 65:85]  # Select features from index 65 to 85 for demonstration
    fitness = evaluate_random_forest(selected_features, y)
    return selected_features, fitness

# Extract features using optimization methods
features_ga_pso, fitness_ga_pso = ga_pso(X_imputed_df, y)
features_aco_sa, fitness_aco_sa = aco_sa(X_imputed_df, y)

# Print fitness scores for individual algorithms
print(f'GA-PSO Fitness: {fitness_ga_pso}')
print(f'ACO-SA Fitness: {fitness_aco_sa}')

# Combine unique features from both methods
combined_features = pd.concat([features_ga_pso, features_aco_sa], axis=1).loc[:, ~pd.concat([features_ga_pso, features_aco_sa], axis=1).columns.duplicated()]

# Evaluate the model on combined features
final_fitness_combined = evaluate_random_forest(combined_features, y)
print(f'Final Model Accuracy with Combined Features: {final_fitness_combined}')

Number of features in the original dataset: 125
GA-PSO Fitness: 0.9598275079437131
ACO-SA Fitness: 0.8935542442124376
Final Model Accuracy with Combined Features: 0.9620971402632773


In [None]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE  # Import SMOTE
import numpy as np

# Load CSV files with 'after_reboot' in the filename
path = r'C:\Users\USER\Documents\NTUST\Conference_Workshop_Seminar\Android\Dataset\AndMal2020-dynamic-BeforeAndAfterReboot\Cleaned_Files\normalized_dataset'
files = glob.glob(path + '/*after_reboot*.csv')

dataframes = []
labels = []

for file in files:
    df = pd.read_csv(file)
    features = df.drop(columns=['Category', 'Family'])
    dataframes.append(features)
    labels.append(df['Category'])

# Combine all feature dataframes
X = pd.concat(dataframes, ignore_index=True)
y = pd.concat(labels, ignore_index=True)

# Print the number of features in the original dataset
num_features = X.shape[1]
print(f'Number of features in the original dataset: {num_features}')

# Impute missing values first
imp = SimpleImputer(strategy='mean')
X_imputed = imp.fit_transform(X)

# Convert the imputed array back to a DataFrame for feature selection
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

### Function to Evaluate Random Forest Performance ###
def evaluate_random_forest(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Apply PCA
    pca = PCA(n_components=0.95)  # Retain 95% of variance
    X_train_pca = pca.fit_transform(X_train_resampled)
    X_test_pca = pca.transform(X_test)
    
    model = RandomForestClassifier()
    model.fit(X_train_pca, y_train_resampled)
    predictions = model.predict(X_test_pca)
    return accuracy_score(y_test, predictions)

### Feature Extraction Algorithms ###
def ga_pso(X, y):
    # Placeholder for GA-PSO feature selection logic
    selected_features = X.sample(n=95, axis=1)  # Randomly select 90 features for demonstration
    fitness = evaluate_random_forest(selected_features, y)
    return selected_features, fitness

def aco_sa(X, y):
    # Placeholder for ACO-SA feature selection logic
    selected_features = X.iloc[:, 65:85]  # Select features from index 65 to 85 for demonstration
    fitness = evaluate_random_forest(selected_features, y)
    return selected_features, fitness

# Extract features using optimization methods
features_ga_pso, fitness_ga_pso = ga_pso(X_imputed_df, y)
features_aco_sa, fitness_aco_sa = aco_sa(X_imputed_df, y)

# Print fitness scores for individual algorithms
print(f'GA-PSO Fitness: {fitness_ga_pso}')
print(f'ACO-SA Fitness: {fitness_aco_sa}')

# Combine unique features from both methods
combined_features = pd.concat([features_ga_pso, features_aco_sa], axis=1).loc[:, ~pd.concat([features_ga_pso, features_aco_sa], axis=1).columns.duplicated()]

# Evaluate the model on combined features
final_fitness_combined = evaluate_random_forest(combined_features, y)
print(f'Final Model Accuracy with Combined Features: {final_fitness_combined}')

Number of features in the original dataset: 125
GA-PSO Fitness: 0.9652746255106672
ACO-SA Fitness: 0.880390376758965
Final Model Accuracy with Combined Features: 0.9652746255106672


In [17]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
import numpy as np

# Load CSV files with 'after_reboot' in the filename
path = r'C:\Users\USER\Documents\NTUST\Conference_Workshop_Seminar\Android\Dataset\AndMal2020-dynamic-BeforeAndAfterReboot\Cleaned_Files\normalized_dataset'
files = glob.glob(path + '/*after_reboot*.csv')

dataframes = []
labels = []

for file in files:
    df = pd.read_csv(file)
    features = df.drop(columns=['Category', 'Family'])
    dataframes.append(features)
    labels.append(df['Category'])

# Combine all feature dataframes
X = pd.concat(dataframes, ignore_index=True)
y = pd.concat(labels, ignore_index=True)

# Impute missing values first
imp = SimpleImputer(strategy='mean')
X_imputed = imp.fit_transform(X)

# Convert the imputed array back to a DataFrame for feature selection
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

### Function to Evaluate Ensemble Method Performance ###
def evaluate_ensemble(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Apply PCA with fewer components
    pca = PCA(n_components=0.85)  # Retain 85% of variance
    X_train_pca = pca.fit_transform(X_train_resampled)
    X_test_pca = pca.transform(X_test)

    # Define classifiers
    svm = SVC(probability=True, random_state=42)
    rf = RandomForestClassifier(random_state=42)
    lr = LogisticRegression(max_iter=100, random_state=42)

    # Hyperparameter tuning for each classifier
    param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}  # Expanded grid
    param_grid_rf = {'n_estimators': [50, 100, 200]}  # Expanded grid
    param_grid_lr = {'C': [0.1, 1, 10]}  # Expanded grid

    # Perform GridSearchCV for each classifier
    svm_grid = GridSearchCV(svm, param_grid_svm, cv=3, scoring='accuracy', n_jobs=-1)
    rf_grid = GridSearchCV(rf, param_grid_rf, cv=3, scoring='accuracy', n_jobs=-1)
    lr_grid = GridSearchCV(lr, param_grid_lr, cv=3, scoring='accuracy', n_jobs=-1)

    # Fit the GridSearchCV models
    svm_grid.fit(X_train_pca, y_train_resampled)
    rf_grid.fit(X_train_pca, y_train_resampled)
    lr_grid.fit(X_train_pca, y_train_resampled)

    # Get the best estimators
    best_svm = svm_grid.best_estimator_
    best_rf = rf_grid.best_estimator_
    best_lr = lr_grid.best_estimator_

    # Create the ensemble model with the best estimators
    ensemble_model = VotingClassifier(
        estimators=[
            ('svm', best_svm),
            ('rf', best_rf),
            ('lr', best_lr)
        ],
        voting='soft',  # Use 'soft' voting for probability-based voting
        n_jobs=-1
    )

    # Fit the ensemble model
    ensemble_model.fit(X_train_pca, y_train_resampled)
    predictions = ensemble_model.predict(X_test_pca)
    return accuracy_score(y_test, predictions)

### GA-PSO Feature Selection Algorithm ###
def ga_pso(X, y):
    # Use SelectKBest for efficient feature selection
    selector = SelectKBest(score_func=f_classif, k=95)  # Select top 95 features
    selected_features = selector.fit_transform(X, y)
    selected_feature_names = X.columns[selector.get_support()]
    selected_features_df = pd.DataFrame(selected_features, columns=selected_feature_names)

    # Evaluate the ensemble model on selected features
    fitness = evaluate_ensemble(selected_features_df, y)
    return selected_features_df, fitness

# Extract features using GA-PSO and evaluate the ensemble model
features_ga_pso, fitness_ga_pso = ga_pso(X_imputed_df, y)

# Print fitness score for GA-PSO
print(f'GA-PSO Fitness: {fitness_ga_pso}')

GA-PSO Fitness: 0.9825238311393554


In [None]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
import numpy as np

# Load CSV files with 'after_reboot' in the filename
path = r'C:\Users\USER\Documents\NTUST\Conference_Workshop_Seminar\Android\Dataset\AndMal2020-dynamic-BeforeAndAfterReboot\Cleaned_Files\normalized_dataset'
files = glob.glob(path + '/*after_reboot*.csv')

dataframes = []
labels = []

for file in files:
    df = pd.read_csv(file)
    features = df.drop(columns=['Category', 'Family'])
    dataframes.append(features)
    labels.append(df['Category'])

# Combine all feature dataframes
X = pd.concat(dataframes, ignore_index=True)
y = pd.concat(labels, ignore_index=True)

# Impute missing values first
imp = SimpleImputer(strategy='mean')
X_imputed = imp.fit_transform(X)

# Convert the imputed array back to a DataFrame for feature selection
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

### Function to Evaluate Ensemble Method Performance ###
def evaluate_ensemble(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Apply PCA with fewer components
    pca = PCA(n_components=0.85)  # Retain 85% of variance
    X_train_pca = pca.fit_transform(X_train_resampled)
    X_test_pca = pca.transform(X_test)

    # Define classifiers
    svm = SVC(probability=True, random_state=42)
    rf = RandomForestClassifier(random_state=42)
    lr = LogisticRegression(max_iter=100, random_state=42)

    # Hyperparameter tuning for each classifier
    param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    param_grid_rf = {'n_estimators': [50, 100, 200]}
    param_grid_lr = {'C': [0.1, 1, 10]}

    # Perform GridSearchCV for each classifier
    svm_grid = GridSearchCV(svm, param_grid_svm, cv=3, scoring='accuracy', n_jobs=-1)
    rf_grid = GridSearchCV(rf, param_grid_rf, cv=3, scoring='accuracy', n_jobs=-1)
    lr_grid = GridSearchCV(lr, param_grid_lr, cv=3, scoring='accuracy', n_jobs=-1)

    # Fit the GridSearchCV models
    svm_grid.fit(X_train_pca, y_train_resampled)
    rf_grid.fit(X_train_pca, y_train_resampled)
    lr_grid.fit(X_train_pca, y_train_resampled)

    # Get the best estimators
    best_svm = svm_grid.best_estimator_
    best_rf = rf_grid.best_estimator_
    best_lr = lr_grid.best_estimator_

    # Create the ensemble model with the best estimators
    ensemble_model = VotingClassifier(
        estimators=[
            ('svm', best_svm),
            ('rf', best_rf),
            ('lr', best_lr)
        ],
        voting='soft',
        n_jobs=-1
    )

    # Fit the ensemble model
    ensemble_model.fit(X_train_pca, y_train_resampled)
    predictions = ensemble_model.predict(X_test_pca)

    # Print classification report
    print(classification_report(y_test, predictions))

    return accuracy_score(y_test, predictions)

### GA-PSO Feature Selection Algorithm ###
def ga_pso(X, y):
    # Use SelectKBest for efficient feature selection
    selector = SelectKBest(score_func=f_classif, k=95)  # Select top 95 features
    selected_features = selector.fit_transform(X, y)
    selected_feature_names = X.columns[selector.get_support()]
    selected_features_df = pd.DataFrame(selected_features, columns=selected_feature_names)

    # Evaluate the ensemble model on selected features
    fitness = evaluate_ensemble(selected_features_df, y)
    return selected_features_df, fitness

# Extract features using GA-PSO and evaluate the ensemble model
features_ga_pso, fitness_ga_pso = ga_pso(X_imputed_df, y)

# Print fitness score for GA-PSO
print(f'GA-PSO Fitness: {fitness_ga_pso}')

In [5]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
import numpy as np

In [6]:
# Load CSV files with 'after_reboot' in the filename
path = r'C:\Users\USER\Documents\NTUST\Conference_Workshop_Seminar\Android\Dataset\AndMal2020-dynamic-BeforeAndAfterReboot\Cleaned_Files\normalized_dataset'
files = glob.glob(path + '/*after_reboot*.csv')

dataframes = []
labels = []

for file in files:
    df = pd.read_csv(file)
    features = df.drop(columns=['Category', 'Family'])
    dataframes.append(features)
    labels.append(df['Category'])

In [7]:

# Combine all feature dataframes
X = pd.concat(dataframes, ignore_index=True)
y = pd.concat(labels, ignore_index=True)

# Impute missing values first
imp = SimpleImputer(strategy='mean')
X_imputed = imp.fit_transform(X)

# Convert the imputed array back to a DataFrame for feature selection
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)


In [None]:
### Function to Evaluate SVM Performance ###
def evaluate_svm(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Apply PCA with fewer components
    pca = PCA(n_components=0.85)  # Retain 85% of variance
    X_train_pca = pca.fit_transform(X_train_resampled)
    X_test_pca = pca.transform(X_test)

    # Define the SVM classifier
    svm = SVC(probability=True, random_state=42)

    # Hyperparameter tuning for SVM
    param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    svm_grid = GridSearchCV(svm, param_grid_svm, cv=3, scoring='accuracy', n_jobs=-1)

    # Fit the GridSearchCV model
    svm_grid.fit(X_train_pca, y_train_resampled)

    # Get the best estimator
    best_svm = svm_grid.best_estimator_

    # Fit the best SVM model and make predictions
    best_svm.fit(X_train_pca, y_train_resampled)
    predictions = best_svm.predict(X_test_pca)

    # Print classification report
    print(classification_report(y_test, predictions))

    return accuracy_score(y_test, predictions)

### GA-PSO Feature Selection Algorithm ###
def ga_pso(X, y):
    # Use SelectKBest for efficient feature selection
    selector = SelectKBest(score_func=f_classif, k=95)  # Select top 95 features
    selected_features = selector.fit_transform(X, y)
    selected_feature_names = X.columns[selector.get_support()]
    selected_features_df = pd.DataFrame(selected_features, columns=selected_feature_names)

    # Evaluate the SVM model on selected features
    fitness = evaluate_svm(selected_features_df, y)
    return selected_features_df, fitness

# Extract features using GA-PSO and evaluate the SVM model
features_ga_pso, fitness_ga_pso = ga_pso(X_imputed_df, y)

# Print fitness score for GA-PSO
print(f'GA-PSO Fitness: {fitness_ga_pso}')

                precision    recall  f1-score   support

        Adware       0.99      0.99      0.99      1018
      Backdoor       0.98      0.93      0.95       109
  FileInfector       1.00      0.91      0.95        22
           PUA       0.97      0.97      0.97       155
    Ransomware       0.98      0.99      0.99       311
      Riskware       0.99      0.99      0.99      1368
     Scareware       0.97      0.96      0.97        81
        Trojan       0.99      0.99      0.99       791
 Trojan_Banker       0.91      0.81      0.86        26
Trojan_Dropper       0.95      0.95      0.95       151
    Trojan_SMS       0.96      0.98      0.97       181
    Trojan_Spy       0.98      0.98      0.98       193

      accuracy                           0.98      4406
     macro avg       0.97      0.95      0.96      4406
  weighted avg       0.98      0.98      0.98      4406

GA-PSO Fitness: 0.9843395369950068
