In [2]:
# First, let's import the necessary libraries for data manipulation, machine learning, and model evaluation
import pandas as pd  # For data loading and manipulation
import numpy as np  # For handling numeric data and arrays
from sklearn.model_selection import train_test_split  # For splitting the data
from sklearn.preprocessing import StandardScaler  # For scaling the features
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier for model training
from sklearn.metrics import classification_report, confusion_matrix  # To evaluate the model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
import io

from google.colab import files
import pandas as pd

# File name the dataset is expected to be uploaded under.
DATASET_FILENAME = "Malware dataset.csv"

# Upload the file manually. files.upload() blocks until the browser upload
# finishes and returns a dict mapping file name -> uploaded bytes.
uploaded = files.upload()
if not uploaded:
    # The upload dialog was cancelled or no file was selected.
    raise ValueError("No file was uploaded; please upload the malware dataset CSV.")

# Parse the bytes that were just uploaded rather than re-reading a hard-coded
# path: Colab may store the upload under a different name on disk (see its
# "Saving ... to ..." message), so a fixed path can point at a stale or
# missing file. Fall back to the first uploaded file if the expected name is absent.
uploaded_name = DATASET_FILENAME if DATASET_FILENAME in uploaded else next(iter(uploaded))
data = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))

print("Dataset Summary:")
print(f"Number of rows: {data.shape[0]}")
print(f"Number of columns: {data.shape[1]}")

print("\nColumn Names:")
print(data.columns)

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
data.info()

print("\nSummary Statistics:")
print(data.describe())




Saving Malware dataset.csv to Malware dataset.csv
Dataset Summary:
Number of rows: 100000
Number of columns: 35

Column Names:
Index(['hash', 'millisecond', 'classification', 'state', 'usage_counter',
       'prio', 'static_prio', 'normal_prio', 'policy', 'vm_pgoff',
       'vm_truncate_count', 'task_size', 'cached_hole_size', 'free_area_cache',
       'mm_users', 'map_count', 'hiwater_rss', 'total_vm', 'shared_vm',
       'exec_vm', 'reserved_vm', 'nr_ptes', 'end_data', 'last_interval',
       'nvcsw', 'nivcsw', 'min_flt', 'maj_flt', 'fs_excl_counter', 'lock',
       'utime', 'stime', 'gtime', 'cgtime', 'signal_nvcsw'],
      dtype='object')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 35 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hash               100000 non-null  object
 1   millisecond        100000 non-null  int64 
 2   classification     100000 n

In [4]:
# Report, column by column, how many entries are missing in the dataset
print("\nMissing values in the dataset:")
missing_per_column = data.isna().sum()  # one count per column
print(missing_per_column)



Missing values in the dataset:
hash                 0
millisecond          0
classification       0
state                0
usage_counter        0
prio                 0
static_prio          0
normal_prio          0
policy               0
vm_pgoff             0
vm_truncate_count    0
task_size            0
cached_hole_size     0
free_area_cache      0
mm_users             0
map_count            0
hiwater_rss          0
total_vm             0
shared_vm            0
exec_vm              0
reserved_vm          0
nr_ptes              0
end_data             0
last_interval        0
nvcsw                0
nivcsw               0
min_flt              0
maj_flt              0
fs_excl_counter      0
lock                 0
utime                0
stime                0
gtime                0
cgtime               0
signal_nvcsw         0
dtype: int64


In [5]:
# Step 1: Define features and labels
# 'hash' is dropped because it is an identifier, 'classification' because it is the target.
X = data.drop(['hash', 'classification'], axis=1)

# Encode the target as binary values (0 for benign, 1 for malware).
label_encoding = {'benign': 0, 'malware': 1}
y = data['classification'].map(label_encoding)

# Series.map() leaves NaN for every value that is not a key of the mapping.
# Fail loudly instead of letting NaN targets flow into the split and training.
unmapped_rows = y.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(data.loc[unmapped_rows, 'classification'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'classification': {unexpected_labels}")
y = y.astype('int64')  # no NaN left, so the labels can be stored as integers

# Step 2: Split the data into training and testing sets (80% / 20%, fixed seed)
from sklearn.model_selection import train_test_split  # Ensure we have this import

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of training and test sets to confirm the split
print(f"Training data shape (X_train): {X_train.shape}")
print(f"Testing data shape (X_test): {X_test.shape}")


Training data shape (X_train): (80000, 33)
Testing data shape (X_test): (20000, 33)


In [6]:
# Step 4: Feature Scaling
from sklearn.preprocessing import StandardScaler

# The scaler learns its statistics from the training rows only ...
scaler = StandardScaler()
scaler.fit(X_train)

# ... and the same fitted transformation is then applied to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling applied to the training and testing data.")

Feature scaling applied to the training and testing data.


In [7]:
# Sanity check of the training arrays:
# first line should be (n_samples, n_features), second line (n_samples,)
for training_part in (X_train_scaled, y_train):
    print(training_part.shape)


(80000, 33)
(80000,)


In [9]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree forest with a fixed seed and fit it on the scaled
# training data in one step (fit() returns the estimator itself).
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)

print("Random Forest model trained successfully.")


Random Forest model trained successfully.


In [10]:
# Step 2: Build the feature matrix (X) and the target vector (y)
# - Features: every column except the target ('classification') and the
#   non-informative identifier ('hash').
X = data.drop(['classification', 'hash'], axis=1)
# - Target: 1 where 'classification' equals 'malware' (attack),
#   0 for every other value (benign).
y = (data['classification'] == 'malware').astype('int64')


In [11]:
# Step 3: Standardize the features for better model performance
# - Every numerical feature is rescaled to zero mean and unit variance.
# NOTE(review): the statistics here are computed over all rows of X; if a
# held-out split is evaluated afterwards, fit the scaler on the training rows only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [12]:
# Step 4: Split the dataset into training and testing sets
# - 80% Training data, 20% Testing data
# - Random state ensures reproducibility
# The raw (unscaled) features are split first so that no test row contributes
# to the scaling statistics; splitting an array that was standardized on the
# full dataset leaks test-set information into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits. X_train / X_test are NumPy arrays, as before.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# NOTE(review): this is a row-level random split. If the same 'hash' occurs in
# several rows, rows of one sample may land in both sets and inflate the test
# scores - verify, and consider a group-aware split on 'hash' if so.

In [13]:
# Model 1: Random Forest Classifier

# Step 5: Fit a regularized random forest
# - n_estimators: how many decision trees make up the forest
# - max_depth: caps the depth of each tree to prevent overfitting
# - min_samples_split: minimum number of samples a node needs before it may split
rf_params = dict(n_estimators=100, max_depth=10, min_samples_split=20, random_state=42)
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X_train, y_train)

In [14]:
# Step 6: Predict labels for the test set with the fitted random forest
y_pred_rf = rf_classifier.predict(X_test)

In [15]:
# Step 7: Evaluate the Random Forest Model
# Compute every metric first, then print the report in one place.
rf_accuracy = accuracy_score(y_test, y_pred_rf)            # fraction of correct predictions
rf_report = classification_report(y_test, y_pred_rf)       # precision, recall, f1-score
rf_confusion = confusion_matrix(y_test, y_pred_rf)         # confusion matrix

print("Random Forest Model")
print(f"Test Accuracy: {rf_accuracy * 100:.2f}%")
print("Classification Report (Test Set):")
print(rf_report)
print("Confusion Matrix (Test Set):")
print(rf_confusion)


Random Forest Model
Test Accuracy: 100.00%
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10030
           1       1.00      1.00      1.00      9970

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Confusion Matrix (Test Set):
[[10030     0]
 [    0  9970]]


In [16]:
# Step 8: 5-fold cross-validation (accuracy) of the RandomForestClassifier
# on the training split
cv_scores_rf = cross_val_score(
    estimator=rf_classifier,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
rf_cv_mean = cv_scores_rf.mean()
print(f"\nCross-validation scores (RandomForestClassifier): {cv_scores_rf}")
print(f"Mean cross-validation score (RandomForestClassifier): {rf_cv_mean:.4f}")


Cross-validation scores (RandomForestClassifier): [1.       0.999875 1.       1.       1.      ]
Mean cross-validation score (RandomForestClassifier): 1.0000


In [17]:
# Model 2: Gradient Boosting Classifier

# Step 9: Fit a gradient boosting model
# - n_estimators: how many boosting stages are run
# - max_depth: depth limit of the individual trees
gb_params = dict(n_estimators=100, max_depth=5, random_state=42)
gb_classifier = GradientBoostingClassifier(**gb_params)
gb_classifier.fit(X_train, y_train)


In [18]:
# Step 10: Predict labels for the test set with the fitted gradient boosting model
y_pred_gb = gb_classifier.predict(X_test)

In [19]:
# Step 11: Evaluate the Gradient Boosting Model
# Compute every metric first, then print the report in one place.
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb)
gb_confusion = confusion_matrix(y_test, y_pred_gb)

print("\nGradient Boosting Model")
print(f"Test Accuracy: {gb_accuracy * 100:.2f}%")
print("Classification Report (Test Set):")
print(gb_report)
print("Confusion Matrix (Test Set):")
print(gb_confusion)


Gradient Boosting Model
Test Accuracy: 100.00%
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10030
           1       1.00      1.00      1.00      9970

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Confusion Matrix (Test Set):
[[10030     0]
 [    0  9970]]


In [20]:
# Step 12: 5-fold cross-validation (accuracy) of the GradientBoostingClassifier
# on the training split
cv_scores_gb = cross_val_score(
    estimator=gb_classifier,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
gb_cv_mean = cv_scores_gb.mean()
print(f"\nCross-validation scores (GradientBoostingClassifier): {cv_scores_gb}")
print(f"Mean cross-validation score (GradientBoostingClassifier): {gb_cv_mean:.4f}")


Cross-validation scores (GradientBoostingClassifier): [1. 1. 1. 1. 1.]
Mean cross-validation score (GradientBoostingClassifier): 1.0000


In [21]:
# Model 3: Support Vector Classifier (SVC)

# Step 13: Fit a support vector classifier
# - kernel='linear': the classifier uses a linear kernel
svc_params = dict(kernel='linear', random_state=42)
svc_classifier = SVC(**svc_params)
svc_classifier.fit(X_train, y_train)

In [22]:
# Step 14: Predict labels for the test set with the fitted support vector classifier
y_pred_svc = svc_classifier.predict(X_test)

In [23]:
# Step 15: Evaluate the Support Vector Classifier Model
# Compute every metric first, then print the report in one place.
svc_accuracy = accuracy_score(y_test, y_pred_svc)
svc_report = classification_report(y_test, y_pred_svc)
svc_confusion = confusion_matrix(y_test, y_pred_svc)

print("\nSupport Vector Classifier Model")
print(f"Test Accuracy: {svc_accuracy * 100:.2f}%")
print("Classification Report (Test Set):")
print(svc_report)
print("Confusion Matrix (Test Set):")
print(svc_confusion)


Support Vector Classifier Model
Test Accuracy: 94.78%
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     10030
           1       0.94      0.96      0.95      9970

    accuracy                           0.95     20000
   macro avg       0.95      0.95      0.95     20000
weighted avg       0.95      0.95      0.95     20000

Confusion Matrix (Test Set):
[[9397  633]
 [ 410 9560]]


In [None]:
# Step 16: Perform Cross-Validation for SVC
# - 5 folds, scored by accuracy, on the training split.
# - n_jobs=-1 fits the folds in parallel on all available CPU cores. Each fold
#   is an independent fit, so the scores are unchanged; only wall-clock time
#   drops.
cv_scores_svc = cross_val_score(
    svc_classifier,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
print(f"\nCross-validation scores (SVC): {cv_scores_svc}")
print(f"Mean cross-validation score (SVC): {cv_scores_svc.mean():.4f}")