In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive at /content/drive so files stored under MyDrive can be
# read through ordinary filesystem paths.
drive.mount('/content/drive')
# Hard-coded location of the combined dataset CSV on the mounted drive.
file_path = '/content/drive/MyDrive/mlproj/combinedcsv.csv'
# Load the full CSV into `df`; the following cell filters columns of this
# frame in place and derives the features/target from it.
df = pd.read_csv(file_path)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Columns holding one single distinct value carry no information for a
# classifier, so collect their names and remove them from `df` in place.
columns_drop = [
    column for column in df.columns if len(df[column].unique()) == 1
]

df.drop(columns=columns_drop, inplace=True)
print(f"Dropped columns: {len(columns_drop)}", columns_drop)

# Separate the features from the target. 'malicious' is the label; 'md5_hash'
# is excluded from the features too (presumably a per-sample identifier
# rather than a predictive feature — TODO confirm).
X = df.drop(columns=['malicious', 'md5_hash'])
y = df['malicious']

# Split the data into training (60%), validation (20%), and testing (20%) sets.
# Both splits are stratified on the label so that every subset keeps the same
# class ratio as the full dataset; without this, an unlucky draw can skew the
# validation/test class balance and make the reported metrics hard to compare.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)
# Halving the 40% hold-out yields the 20% validation / 20% test partitions.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Standardize the features (important for logistic regression). The scaler
# is fitted on the training split only, and that same fitted transform is
# then applied to the training, validation and test splits alike.
scaler = StandardScaler().fit(X_train)
X_train, X_valid, X_test = (
    scaler.transform(split) for split in (X_train, X_valid, X_test)
)

# GridSearch to find the best C for L1 regularization in Logistic Regression.
# Only C varies; penalty, solver and max_iter are single-valued entries that
# pin the configuration of every candidate model.
param_grid_logistic = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1'],
    'solver': ['liblinear'],  # liblinear is a good choice for small datasets and L1 penalty
    'max_iter': [10000]
}

# liblinear shuffles the data internally, so the estimator is seeded with the
# same random_state used for the data splits and the random forest; otherwise
# the fitted coefficients (and hence the features selected from them) may
# differ from run to run.
# error_score='raise' makes a failing candidate fit abort the search loudly
# instead of being scored as NaN.
grid_search_logistic = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid_logistic,
    cv=5,
    error_score='raise',
    n_jobs=2,
)
grid_search_logistic.fit(X_train, y_train)

# Use the best estimator to identify important features: with an L1 penalty,
# features the model considers irrelevant end up with a coefficient of
# exactly zero.
best_logistic = grid_search_logistic.best_estimator_

# coef_ is 2-D (one row per fitted decision function). A feature is kept when
# its weight is non-zero in ANY row; for a binary target there is a single
# row, so this is the same as inspecting coef_[0].
important_features = np.flatnonzero(np.any(best_logistic.coef_ != 0, axis=0))

# A very strong penalty (small C) can zero out every coefficient. Fail here
# with an explicit message rather than letting the random forest choke on a
# zero-column matrix further down.
if important_features.size == 0:
    raise ValueError(
        "L1-regularized logistic regression assigned a zero coefficient to "
        "every feature; no features are left to train the Random Forest on. "
        "Consider adding larger C values to the grid."
    )

# Select only the important features for the Random Forest model
# (column indices into the scaled feature arrays).
X_train_selected = X_train[:, important_features]
X_valid_selected = X_valid[:, important_features]
X_test_selected = X_test[:, important_features]

# Build a seeded 100-tree Random Forest and fit it, in one expression, on the
# training rows restricted to the selected feature columns.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_selected, y_train
)

# Predict the validation and test splits with the fitted forest.
y_valid_pred_rf = clf_rf.predict(X_valid_selected)
y_test_pred_rf = clf_rf.predict(X_test_selected)

# Score each set of predictions against its true labels.
valid_accuracy_rf = accuracy_score(y_valid, y_valid_pred_rf)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)

# Report validation accuracy first, then test accuracy.
print("Validation Accuracy with Random Forest:", valid_accuracy_rf)
print("Testing Accuracy with Random Forest:", test_accuracy_rf)

# Compute the test-set confusion matrix and the per-class classification
# report from the Random Forest predictions.
confusion_rf = confusion_matrix(y_test, y_test_pred_rf)
classification_rep_rf = classification_report(y_test, y_test_pred_rf)

# Print each result underneath its heading (heading and body on separate lines).
print("Confusion Matrix (Testing) with Random Forest:", confusion_rf, sep="\n")
print()
print("Classification Report (Testing) with Random Forest:", classification_rep_rf, sep="\n")

Dropped columns: 295 ['ACCESS_BLOBS_ACROSS_USERS', 'BIND_CARRIER_MESSAGING_CLIENT_SERVICE', 'BIND_COMPANION_DEVICE_SERVICE', 'BIND_CONTROLS', 'BIND_QUICK_ACCESS_WALLET_SERVICE', 'BIND_TV_INTERACTIVE_APP', 'BIND_VISUAL_VOICEMAIL_SERVICE', 'BODY_SENSORS_BACKGROUND', 'DELIVER_COMPANION_MESSAGES', 'HIDE_OVERLAY_WINDOWS', 'LAUNCH_MULTI_PANE_SETTINGS_DEEP_LINK', 'LOADER_USAGE_STATS', 'MANAGE_MEDIA', 'MANAGE_ONGOING_CALLS', 'MANAGE_WIFI_INTERFACES', 'MANAGE_WIFI_NETWORK_SELECTION', 'NEARBY_WIFI_DEVICES', 'NFC_PREFERRED_PAYMENT_INFO', 'READ_ASSISTANT_APP_SEARCH_DATA', 'READ_BASIC_PHONE_STATE', 'READ_HOME_APP_SEARCH_DATA', 'READ_NEARBY_STREAMING_POLICY', 'READ_VOICEMAIL', 'REQUEST_COMPANION_PROFILE_APP_STREAMING', 'REQUEST_COMPANION_PROFILE_AUTOMOTIVE_PROJECTION', 'REQUEST_COMPANION_PROFILE_COMPUTER', 'REQUEST_COMPANION_PROFILE_WATCH', 'REQUEST_COMPANION_SELF_MANAGED', 'REQUEST_COMPANION_START_FOREGROUND_SERVICES_FROM_BACKGROUND', 'REQUEST_OBSERVE_COMPANION_DEVICE_PRESENCE', 'SMS_FINANCIAL_TRAN