# Installing PyTorch


In [None]:
!pip install torch torchvision torchaudio




# Loading the Dataset


In [None]:
import pandas as pd

# Path of the CSV file that holds the dataset
file_path = "/content/Obfuscated-MalMem2022.csv"

# Parse the CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top rows (rendered as the cell output)
data.head()

Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,...,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,Class
0,Benign,45,17,10.555556,0,202.844444,1694,38.5,9129,212.302326,...,221,26,24,116,0,121,87,0,8,Benign
1,Benign,47,19,11.531915,0,242.234043,2074,44.12766,11385,242.234043,...,222,26,24,118,0,122,87,0,8,Benign
2,Benign,40,14,14.725,0,288.225,1932,48.3,11529,288.225,...,222,26,27,118,0,120,88,0,8,Benign
3,Benign,32,13,13.5,0,264.28125,1445,45.15625,8457,264.28125,...,222,26,27,118,0,120,88,0,8,Benign
4,Benign,42,16,11.452381,0,281.333333,2067,49.214286,11816,281.333333,...,222,26,24,118,0,124,87,0,8,Benign


Step 1: Data Preprocessing
First, we need to clean and prepare the data. Here's the code to handle missing values and encode the target variable (Class).

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# --- Cleaning -------------------------------------------------------------
# Discard every row that contains at least one missing value.
data = data.dropna()

# --- Target encoding ------------------------------------------------------
# Replace the 'Class' column of `data` with integer codes; the fitted encoder
# keeps the original labels in `label_encoder.classes_`.
label_encoder = LabelEncoder()
data['Class'] = label_encoder.fit_transform(data['Class'])

# --- Feature selection ----------------------------------------------------
# Everything except the target column is a candidate feature.
X = data.drop(columns=['Class'])

print("Column Types Before Conversion:")
print(X.dtypes)

# Keep only float64/int64 columns; columns of any other dtype are dropped here.
X_numeric = X.select_dtypes(include=['float64', 'int64'])

print("Numeric Columns Only:")
print(X_numeric.dtypes)

print("Missing values in numeric columns:")
print(X_numeric.isnull().sum())

# Remove rows that still contain missing values among the numeric features.
X_numeric = X_numeric.dropna()

# Pick the labels by index so that `y` lines up row-for-row with `X_numeric`.
y = data.loc[X_numeric.index, 'Class']

# --- Scaling --------------------------------------------------------------
# Standardize each numeric feature.
# NOTE: the scaler statistics here are computed over every row of X_numeric.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

print(f"Shape of scaled features: {X_scaled.shape}")

Column Types Before Conversion:
Category                                   object
pslist.nproc                                int64
pslist.nppid                                int64
pslist.avg_threads                        float64
pslist.nprocs64bit                          int64
pslist.avg_handlers                       float64
dlllist.ndlls                               int64
dlllist.avg_dlls_per_proc                 float64
handles.nhandles                            int64
handles.avg_handles_per_proc              float64
handles.nport                               int64
handles.nfile                               int64
handles.nevent                              int64
handles.ndesktop                            int64
handles.nkey                                int64
handles.nthread                             int64
handles.ndirectory                          int64
handles.nsemaphore                          int64
handles.ntimer                              int64
handles.nsection  

Step 2: Train-Test Split
Next, we'll split the data into a training and testing set using train_test_split from sklearn.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* numeric features into training and testing sets
# (80% train, 20% test). `X_scaled` from the preprocessing cell was standardized
# with statistics computed over every row, including rows that end up in the
# test set; splitting first and fitting the scaler on the training rows only
# keeps test-set information out of training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_numeric, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training partition only, then apply the same
# transformation to both partitions. Both results are NumPy arrays.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


Step 3: Build the Hybrid CNN + LSTM Model
Now, we’ll define the hybrid model using PyTorch. The idea is to combine CNN for feature extraction and LSTM for sequential learning.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the Hybrid CNN + LSTM Model
class HybridCNNLSTM(nn.Module):
    """Two 1-D convolutions followed by an LSTM and a linear classification head.

    Each input row is treated as a single-channel 1-D signal, so the feature
    axis serves as the length axis for the convolutions and as the sequence
    axis for the LSTM.

    Args:
        input_size: Not referenced by any layer; kept in the signature.
        lstm_hidden_size: Size of the LSTM hidden state and input width of the
            final linear layer.
        num_classes: Number of output logits per sample.
    """

    def __init__(self, input_size, lstm_hidden_size, num_classes):
        super().__init__()

        # Feature extractor: kernel 3 / stride 1 / padding 1 leaves the length unchanged
        self.conv1 = nn.Conv1d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)

        # Recurrent layer consuming the 64 convolutional channels at each position
        self.lstm = nn.LSTM(input_size=64, hidden_size=lstm_hidden_size, batch_first=True)

        # Classification head mapping the LSTM hidden state to class logits
        self.fc = nn.Linear(lstm_hidden_size, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of feature rows.

        Args:
            x: Tensor of shape (batch, length); a channel axis is inserted at
                position 1 before the convolutions.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized scores.
        """
        # (batch, length) -> (batch, 1, length)
        signal = x.unsqueeze(1)

        # Both convolutions, each followed by ReLU: -> (batch, 64, length)
        feature_maps = torch.relu(self.conv2(torch.relu(self.conv1(signal))))

        # Move channels last so the LSTM sees (batch, length, 64)
        sequence = feature_maps.permute(0, 2, 1)

        # Only the final hidden state is needed; per-step outputs are discarded
        _, (final_hidden, _) = self.lstm(sequence)

        # final_hidden[-1] is the last layer's hidden state, shape (batch, hidden)
        return self.fc(final_hidden[-1])

# Instantiate the model
# Build the network: one logit per label known to the encoder, 64 LSTM hidden units
model = HybridCNNLSTM(
    input_size=X_train.shape[1],
    lstm_hidden_size=64,
    num_classes=len(label_encoder.classes_),
)


Step 4: Train the Model
Next, let's define the loss function and optimizer, and then train the model.

In [None]:
# Step 4: Train the Model

from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Convert data to PyTorch tensors (float32 features, int64 class indices)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Create TensorDatasets for train and test
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Mini-batch size; lower it to reduce memory usage
batch_size = 32

# Create DataLoaders to load the data in batches (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Learning rate decay: multiply the learning rate by `decay_factor` after every epoch
decay_factor = 0.95
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=decay_factor)
learning_rates = []  # Learning rate used in each epoch, for the plot below

# Training the model with batching
num_epochs = 20
for epoch in range(num_epochs):
    model.train()

    # Accumulate a sample-weighted loss so the reported value is the mean over
    # the whole epoch rather than the loss of whichever batch happened to be last
    running_loss = 0.0
    samples_seen = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        batch_len = batch_y.size(0)
        running_loss += loss.item() * batch_len
        samples_seen += batch_len

    epoch_loss = running_loss / max(samples_seen, 1)

    # Record the learning rate that was in effect during this epoch
    current_lr = optimizer.param_groups[0]['lr']
    learning_rates.append(current_lr)

    # Decay the learning rate for the next epoch
    scheduler.step()

    # Print epoch, mean training loss, and learning rate every 5 epochs
    if (epoch + 1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Learning Rate: {current_lr:.6f}')

# Plot learning rate over epochs; the x-axis follows the number of recorded values
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(range(1, len(learning_rates) + 1), learning_rates, marker='o', linestyle='-', color='blue', label="Learning Rate")
ax.set_xlabel("Epochs")
ax.set_ylabel("Learning Rate")
ax.set_title("Learning Rate Over Training Epochs")
ax.legend()
ax.grid(True)
plt.show()


KeyboardInterrupt: 

Step 5: Save the Model

In [None]:
# Save the trained model's parameters (the state dict, not the module object).
# The load cells in this notebook read "/content/Model.dec", so the file is
# written to that same path and the message reports the path actually used.
# NOTE(review): confirm "/content/Model.dec" is the intended file name.
model_path = "/content/Model.dec"
torch.save(model.state_dict(), model_path)
print(f"Model saved as '{model_path}'")


Model saved as 'hybrid_cnn_lstm_model.pth'


Step 6: Load The Model

In [None]:
# Build a fresh network with the same constructor arguments as the trained one
model = HybridCNNLSTM(
    input_size=X_train.shape[1],
    lstm_hidden_size=64,
    num_classes=len(label_encoder.classes_),
)

# Read the saved parameters from disk and copy them into the new network
model.load_state_dict(
    torch.load("/content/Model.dec")
)

# Switch to evaluation mode before running inference
model.eval()
print("Model loaded and ready for evaluation.")


Model loaded and ready for evaluation.


Step 7: Reload the Model

In [None]:
# Re-initialize the model (same architecture as before)
model = HybridCNNLSTM(input_size=X_train.shape[1], lstm_hidden_size=64, num_classes=len(label_encoder.classes_))
# Load the model's state dict (trained parameters)
model.load_state_dict(torch.load("/content/Model.dec"))
# A newly constructed module starts in training mode; switch to evaluation mode
# so this cell leaves the model in the same state as the Step 6 load cell
model.eval()
print("Model is Loaded")



Model is Loaded


Step 8: Evaluate the Model
Finally, after training the model, we will evaluate its performance on the test data.

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Rebuild the test tensors and loader so this cell does not depend on the
# training cell having been run in the current session
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
batch_size = 32
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Make sure the model is in evaluation mode regardless of which cell created it
model.eval()

# Evaluate the model's accuracy on the test set
correct = 0
total = 0

# No gradients needed for inference
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        # Forward pass; the predicted class is the index of the largest logit
        outputs = model(batch_X)
        predicted = outputs.argmax(dim=1)

        # Update correct and total counts
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

# Calculate accuracy as a percentage of correctly classified test samples
accuracy = 100 * correct / total
print(f"Accuracy on test set: {accuracy:.2f}%")


Accuracy on test set: 99.94%
