Author: Amitabh Chakravorty

CLOUD CRYPTOJACKING AI DETECTION - VALIDATION EXPERIMENT

Master Notebook for Complete Workflow

Purpose: Empirical validation for SLR paper

In [7]:
# ============================================================================
# SECTION 1: ENVIRONMENT SETUP
# ============================================================================

print("="*70)
print("CRYPTOJACKING VALIDATION EXPERIMENT - MASTER NOTEBOOK")
print("="*70)

print("\n[1/6] Installing required packages")
!pip install -q pandas numpy scikit-learn xgboost lightgbm tensorflow matplotlib seaborn imbalanced-learn kaggle openpyxl

import os
import sys
import time
import glob
import pickle
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report
)

from google.colab import files, drive

warnings.filterwarnings('ignore')
print("Packages installed and imported successfully")


CRYPTOJACKING VALIDATION EXPERIMENT - MASTER NOTEBOOK

[1/6] Installing required packages
Packages installed and imported successfully


In [8]:
# ============================================================================
# SECTION 2: GOOGLE DRIVE INTEGRATION
# ============================================================================

print("\n[2/6] Setting up Google Drive integration")

# Mount Google Drive
drive_mounted = False
try:
    drive.mount('/content/drive', force_remount=True)
    drive_mounted = True
    print("Google Drive mounted successfully")
except Exception as e:
    print(f"Drive mounting failed - will use local Colab storage. Details: {e}")

# Base path (Drive if mounted, else local)
base_path = '/content/drive/MyDrive/cryptojacking_validation' if drive_mounted else '/content/cryptojacking_validation'

# Create project directory structure (including missing 'results/')
dirs = [
    f'{base_path}/data/raw',
    f'{base_path}/data/processed',
    f'{base_path}/models',
    f'{base_path}/results',
    f'{base_path}/results/metrics',
    f'{base_path}/results/figures',
    f'{base_path}/scripts',
]
for d in dirs:
    os.makedirs(d, exist_ok=True)

print(f"Project structure created at: {base_path}")

# Set working directory
os.chdir(base_path)
print(f"Working directory: {os.getcwd()}")



[2/6] Setting up Google Drive integration
Mounted at /content/drive
Google Drive mounted successfully
Project structure created at: /content/drive/MyDrive/cryptojacking_validation
Working directory: /content/drive/MyDrive/cryptojacking_validation


In [9]:
# ============================================================================
# SECTION 3: KAGGLE API SETUP (API KEY INPUT MODE)
# ============================================================================

print("\n[3/6] Setting up Kaggle API using direct credentials")

# Prompt user for Kaggle credentials
kaggle_username = input("Enter your Kaggle username: ").strip()
kaggle_key = input("Enter your Kaggle API key: ").strip()

if not kaggle_username or not kaggle_key:
    raise ValueError("Kaggle username and API key must not be empty.")

# Create Kaggle config directory
os.makedirs('/root/.kaggle', exist_ok=True)

# Write kaggle.json
kaggle_json_path = '/root/.kaggle/kaggle.json'
with open(kaggle_json_path, 'w') as f:
    f.write('{\n')
    f.write(f'  "username": "{kaggle_username}",\n')
    f.write(f'  "key": "{kaggle_key}"\n')
    f.write('}\n')

# Set permissions
os.chmod(kaggle_json_path, 0o600)

print("Kaggle credentials configured successfully")

# Verify Kaggle access
print("\nVerifying Kaggle API access")
!kaggle datasets list | head -5



[3/6] Setting up Kaggle API using direct credentials
Enter your Kaggle username: [Username]
Enter your Kaggle API key: [API Key]
Kaggle credentials configured successfully

Verifying Kaggle API access
ref                                                               title                                                     size  lastUpdated                 downloadCount  voteCount  usabilityRating  
----------------------------------------------------------------  --------------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
neurocipher/heartdisease                                          Heart Disease                                             3491  2025-12-11 15:29:14.327000           2114        246  1.0              
kundanbedmutha/exam-score-prediction-dataset                      Exam Score Prediction Dataset                           325454  2025-11-28 07:29:01.047000           5863        283  1.0     

In [10]:
# ============================================================================
# SECTION 4: DATASET DOWNLOAD
# ============================================================================

print("\n[4/6] Downloading datasets")
print("Note: download time depends on dataset sizes and Colab speed.\n")

datasets = {
    'ds2os': 'libamariyam/ds2os-dataset',
    'nsl_kdd': 'hassan06/nslkdd',
    # 'edge_iiot': 'mohamedamineferrag/edgeiiotset-cyber-security-dataset-'  # large; enable if needed
}

for name, dataset_id in datasets.items():
    print(f"  Downloading {name} ({dataset_id}) ")
    download_path = f'{base_path}/data/raw/{name}'
    os.makedirs(download_path, exist_ok=True)

    # Kaggle CLI will exit nonzero on errors; Colab still continues, so we check files after.
    !kaggle datasets download -d {dataset_id} -p "{download_path}" --unzip

    # Quick sanity check
    n_files = len(list(glob.glob(os.path.join(download_path, "**", "*"), recursive=True)))
    if n_files > 0:
        print(f"  {name} downloaded/unzipped. Items found: {n_files}")
    else:
        print(f"  {name} looks empty. Check Kaggle dataset id or permissions.")

print("\n Dataset download step complete")

# Verify downloads (count + sample)
print("\nDataset file check:")
for name in datasets.keys():
    path = f'{base_path}/data/raw/{name}'
    if os.path.exists(path):
        all_items = [p for p in glob.glob(os.path.join(path, "**", "*"), recursive=True) if os.path.isfile(p)]
        print(f"  {name}: {len(all_items)} files")
        print(f"    Sample: {[os.path.basename(x) for x in all_items[:3]]}")


[4/6] Downloading datasets
Note: download time depends on dataset sizes and Colab speed.

  Downloading ds2os (libamariyam/ds2os-dataset) 
Dataset URL: https://www.kaggle.com/datasets/libamariyam/ds2os-dataset
License(s): unknown
Downloading ds2os-dataset.zip to /content/drive/MyDrive/cryptojacking_validation/data/raw/ds2os
  0% 0.00/4.22M [00:00<?, ?B/s]
100% 4.22M/4.22M [00:00<00:00, 63.7MB/s]
  ds2os downloaded/unzipped. Items found: 1
  Downloading nsl_kdd (hassan06/nslkdd) 
Dataset URL: https://www.kaggle.com/datasets/hassan06/nslkdd
License(s): unknown
Downloading nslkdd.zip to /content/drive/MyDrive/cryptojacking_validation/data/raw/nsl_kdd
  0% 0.00/13.9M [00:00<?, ?B/s]
100% 13.9M/13.9M [00:00<00:00, 170MB/s]
  nsl_kdd downloaded/unzipped. Items found: 23

 Dataset download step complete

Dataset file check:
  ds2os: 1 files
    Sample: ['DS2OS.csv']
  nsl_kdd: 22 files
    Sample: ['KDDTest+.arff', 'KDDTest+.txt', 'KDDTest-21.arff']


In [11]:
# ============================================================================
# SECTION 5: QUICK DATA EXPLORATION
# ============================================================================

print("\n[5/6] Quick data exploration")

def explore_dataset(file_path: str, dataset_name: str, nrows: int = 1000):
    """Quick exploration of a dataset CSV."""
    print("\n")
    print(f"Dataset: {dataset_name}")
    print(f"File: {file_path}")
    print("\n")

    try:
        df = None
        for encoding in ['utf-8', 'latin-1', 'iso-8859-1']:
            try:
                df = pd.read_csv(file_path, encoding=encoding, nrows=nrows)
                break
            except Exception:
                continue

        if df is None:
            raise ValueError("Failed to load CSV with common encodings.")

        print(f"Shape (sampled nrows={nrows}): {df.shape}")
        print(f"\nColumns ({len(df.columns)}):")
        print(df.columns.tolist())

        print("\nFirst 3 rows:")
        display(df.head(3))

        print("\nData types (counts):")
        print(df.dtypes.value_counts())

        print("\nMissing values (in sample):")
        missing = df.isnull().sum()
        if missing.sum() > 0:
            display(missing[missing > 0].sort_values(ascending=False))
        else:
            print("No missing values detected in sample")

        # Identify potential label/target columns
        keywords = ['label', 'class', 'target', 'attack', 'type', 'category', 'outcome']
        potential_targets = [col for col in df.columns if any(k in col.lower() for k in keywords)]
        if potential_targets:
            print(f"\nPotential target columns: {potential_targets}")
            for col in potential_targets[:2]:
                print(f"\n'{col}' distribution (sample):")
                display(df[col].value_counts(dropna=False).head(20))
        else:
            print("\nNo obvious target column detected from column names (sample).")

        return df

    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None


def find_csvs(root: str):
    """Recursively find CSV files under a root directory."""
    return glob.glob(os.path.join(root, "**", "*.csv"), recursive=True)


# Explore DS2OS
ds2os_root = f'{base_path}/data/raw/ds2os'
ds2os_csvs = find_csvs(ds2os_root)
if ds2os_csvs:
    df_ds2os = explore_dataset(ds2os_csvs[0], 'DS2OS')
else:
    print("No CSV found for DS2OS. Inspect the download folder manually.")

# Explore NSL-KDD
nsl_root = f'{base_path}/data/raw/nsl_kdd'
nsl_csvs = find_csvs(nsl_root)
if nsl_csvs:
    train_candidates = [f for f in nsl_csvs if 'train' in os.path.basename(f).lower()]
    target_file = train_candidates[0] if train_candidates else nsl_csvs[0]
    df_nsl = explore_dataset(target_file, 'NSL-KDD')
else:
    print("No CSV found for NSL-KDD. Inspect the download folder manually.")


[5/6] Quick data exploration


Dataset: DS2OS
File: /content/drive/MyDrive/cryptojacking_validation/data/raw/ds2os/DS2OS.csv


Shape (sampled nrows=1000): (1000, 13)

Columns (13):
['sourceID', 'sourceAddress', 'sourceType', 'sourceLocation', 'destinationServiceAddress', 'destinationServiceType', 'destinationLocation', 'accessedNodeAddress', 'accessedNodeType', 'operation', 'value', 'timestamp', 'normality']

First 3 rows:


Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,value,timestamp,normality
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,none,1520031600000,normal
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,none,1520031603269,normal
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,none,1520031603279,normal



Data types (counts):
object    12
int64      1
Name: count, dtype: int64

Missing values (in sample):
No missing values detected in sample

Potential target columns: ['sourceType', 'destinationServiceType', 'accessedNodeType']

'sourceType' distribution (sample):


Unnamed: 0_level_0,count
sourceType,Unnamed: 1_level_1
/lightControler,348
/sensorService,311
/batteryService,155
/washingService,116
/movementSensor,29
/thermostat,21
/doorLockService,20



'destinationServiceType' distribution (sample):


Unnamed: 0_level_0,count
destinationServiceType,Unnamed: 1_level_1
/movementSensor,352
/sensorService,324
/batteryService,268
/lightControler,25
/doorLockService,20
/thermostat,8
/washingService,3


No CSV found for NSL-KDD. Inspect the download folder manually.


In [12]:
# ============================================================================
# SECTION 6: SAVE PROGRESS
# ============================================================================

print("\n[6/6] Saving setup information")

setup_info = {
    'timestamp': str(pd.Timestamp.now()),
    'datasets_downloaded': list(datasets.keys()),
    'base_path': base_path,
    'python_version': sys.version.replace("\n", " "),
}

setup_path = os.path.join(base_path, 'results', 'setup_info.txt')
with open(setup_path, 'w') as f:
    for key, value in setup_info.items():
        f.write(f"{key}: {value}\n")

print(f"Setup information saved to: {setup_path}")

print("\n")
print("SETUP COMPLETE!")
print("\n")
print("\nNext steps:")
print("1. Review the exploration output above")
print("2. Identify target columns for each dataset (set a TARGET_MAP dict)")
print("3. Run your preprocessing notebook (01_Preprocessing.ipynb)")
print("\nAll data saved to:", base_path)


[6/6] Saving setup information
Setup information saved to: /content/drive/MyDrive/cryptojacking_validation/results/setup_info.txt


SETUP COMPLETE!



Next steps:
1. Review the exploration output above
2. Identify target columns for each dataset (set a TARGET_MAP dict)
3. Run your preprocessing notebook (01_Preprocessing.ipynb)

All data saved to: /content/drive/MyDrive/cryptojacking_validation
