# Implementation: Collect Process Data


In [3]:
# 📌 Step 1: Install dependencies (run this first if needed)
# %pip (not !pip) installs into the environment of the running kernel.
# seaborn and matplotlib are needed by the confusion-matrix cell at the end of the notebook.
%pip install psutil pandas scikit-learn joblib seaborn matplotlib




In [4]:
# 📌 Step 2: Import Libraries
import psutil
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
from queue import Queue
import time
import threading
import os


In [5]:
# 📌 Step 3: Collect System Process Data
def get_process_data():
    """Take one snapshot of the running processes as a DataFrame.

    Returns:
        pandas.DataFrame with one row per process and the columns
        ``pid``, ``utime`` (user CPU time), ``stime`` (system CPU time),
        ``priority`` (the value psutil reports as ``nice``) and
        ``mem_usage`` (resident set size). The columns are present even
        when no process could be read, so later cells can always index them.

    Any field psutil could not read is recorded as 0 instead of None/NaN,
    which keeps the feature matrix fully numeric.
    """
    columns = ["pid", "utime", "stime", "priority", "mem_usage"]
    records = []

    for proc in psutil.process_iter(['pid', 'cpu_times', 'memory_info', 'nice']):
        try:
            info = proc.info
            cpu_times = info['cpu_times']
            memory_info = info['memory_info']
            nice = info['nice']
            records.append({
                "pid": info['pid'],
                "utime": cpu_times.user if cpu_times else 0,      # User CPU time
                "stime": cpu_times.system if cpu_times else 0,    # System CPU time
                # Same fallback as the other fields: an unreadable priority
                # arrives as None and would otherwise become NaN.
                "priority": nice if nice is not None else 0,      # Process priority
                "mem_usage": memory_info.rss if memory_info else 0,  # Memory usage
            })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            # The process vanished or is off-limits; skip it (best effort).
            continue

    # Passing `columns` keeps the schema stable for an empty snapshot.
    return pd.DataFrame(records, columns=columns)

# Save Data
# Take one snapshot of the running processes; `df` is reused by the training cell.
df = get_process_data()
# index=False keeps the row index out of the CSV, so the file holds only the data columns.
df.to_csv("windows_process_data.csv", index=False)
# Bare last expression: the notebook renders the first rows as the cell output.
df.head()


Unnamed: 0,pid,utime,stime,priority,mem_usage
0,0,0.0,0.0,0,0
1,1,0.0,0.0,0,0
2,296,0.0,0.0,0,0
3,298,0.0,0.0,0,0
4,300,0.0,0.0,0,0


In [6]:
# 📌 Step 4: Train an AI Model for Process Classification
# The split and the forest are both randomized; one fixed seed makes
# Restart & Run All reproduce the same model and the same score.
RANDOM_STATE = 42

# Label: True = CPU-bound, False = I/O-bound
# NOTE(review): the label is computed directly from two of the features below,
# so the classifier can only re-learn this rule — confirm that is the intent.
df["process_type"] = df["utime"] > df["stime"]

# Unreadable fields are treated as 0, matching the collector's own fallback,
# so a missing value cannot reach the estimator as NaN.
X = df[["utime", "stime", "priority", "mem_usage"]].fillna(0)
y = df["process_type"]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train Model
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Report on the held-out 20 %, which was previously split off but never used.
print(f"Hold-out accuracy: {model.score(X_test, y_test):.3f}")

# Save Model
joblib.dump(model, "process_classifier.pkl")

print("✅ Model trained and saved as process_classifier.pkl")


✅ Model trained and saved as process_classifier.pkl


In [None]:
import threading
import time
import psutil
import pandas as pd
import joblib
from queue import Queue

# Load trained ML model
# NOTE(review): joblib.load is pickle-based per the joblib docs, so only load
# model files produced by this notebook or another trusted source.
model = joblib.load("process_classifier.pkl")

# Define Queues
# One queue per predicted class; classify_and_enqueue() fills them and one
# process_queue() worker drains each.
cpu_queue = Queue()
io_queue = Queue()
# PIDs that have been enqueued and not yet finished: added by
# classify_and_enqueue(), removed by process_queue() after handling.
processed_pids = set()  # Track processed PIDs

# Global Flag to Stop Execution
# Polled by the `while running` loops of the classifier and both workers;
# the main cell sets it to False on KeyboardInterrupt.
running = True  

def classify_and_enqueue():
    """Scan the process table repeatedly and enqueue newly seen PIDs.

    Runs until the module-level ``running`` flag becomes False. On each pass,
    every PID not already in ``processed_pids`` is classified by ``model``
    from its user/system CPU time, priority and resident memory. It is then
    put on ``cpu_queue`` (truthy prediction) or ``io_queue`` (falsy
    prediction) and recorded in ``processed_pids``. Processes that vanish or
    deny access are skipped. The loop sleeps 5 seconds between passes.
    """
    while running:  # Check stop condition
        for proc in psutil.process_iter(['pid']):
            if not running:
                break
            try:
                pid = proc.info['pid']
                if pid in processed_pids:
                    continue

                # Read each metric exactly once. Calling cpu_times() separately
                # for utime and stime doubles the work and can mix two reads.
                # oneshot() lets psutil share one underlying read for all three.
                with proc.oneshot():
                    cpu_times = proc.cpu_times()
                    priority = proc.nice()
                    mem_usage = proc.memory_info().rss

                # Same feature names (and order) the model was trained on
                process_data = pd.DataFrame([{
                    "utime": cpu_times.user,
                    "stime": cpu_times.system,
                    "priority": priority,
                    "mem_usage": mem_usage,
                }])

                # Predict process type
                prediction = model.predict(process_data)[0]

                # Enqueue based on type
                target_queue = cpu_queue if prediction else io_queue
                target_queue.put(pid)

                # Mark as processed only after the PID is actually queued
                processed_pids.add(pid)

            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Process exited mid-read or is off-limits; try again next pass.
                continue
        time.sleep(5)

def process_queue(q, queue_type, execution_time=2):
    """Worker loop: take PIDs off ``q`` and simulate handling them.

    Args:
        q: Queue of PIDs to drain.
        queue_type: Label used in the progress messages (e.g. "CPU-bound").
        execution_time: Seconds to sleep per task to simulate the work.

    Runs until the module-level ``running`` flag becomes False, polling once
    per second while the queue is empty. Every PID taken from the queue is
    removed from ``processed_pids`` and acknowledged with ``q.task_done()``.
    This includes PIDs whose process has already exited, so the queue's
    unfinished-task count stays accurate and a reused PID can be classified
    again.
    """
    while running:
        if q.empty():
            time.sleep(1)
            continue

        pid = q.get()
        try:
            if not psutil.pid_exists(pid):
                print(f"Skipping {queue_type} task (PID {pid} no longer exists)")
                continue

            print(f"Processing {queue_type} task: PID {pid}")
            time.sleep(execution_time)
            print(f"Completed {queue_type} task: PID {pid}")
        finally:
            # Runs on the skip path too (``continue`` inside ``try`` still
            # triggers ``finally``): each get() is matched by one task_done().
            processed_pids.discard(pid)
            q.task_done()

# Start worker threads (daemon=True so they never block interpreter exit)
cpu_worker = threading.Thread(target=process_queue, args=(cpu_queue, "CPU-bound", 2), daemon=True)
io_worker = threading.Thread(target=process_queue, args=(io_queue, "I/O-bound", 1), daemon=True)
cpu_worker.start()
io_worker.start()

# Start classification thread
classifier_thread = threading.Thread(target=classify_and_enqueue, daemon=True)
classifier_thread.start()

# Report queue sizes every 5 seconds until the cell is interrupted
# (Kernel -> Interrupt, or Ctrl+C), then shut the threads down.
try:
    while True:
        print(f"CPU Queue Size: {cpu_queue.qsize()} | IO Queue Size: {io_queue.qsize()}")
        time.sleep(5)
except KeyboardInterrupt:
    print("\nStopping program...")
finally:
    # Stop the threads however the loop ended. In a notebook the kernel
    # outlives the cell, so threads left running would keep printing and
    # would pile up each time the cell is re-run.
    running = False  # Signal threads to stop
    classifier_thread.join()
    cpu_worker.join()
    io_worker.join()
    print("All threads stopped.")


CPU Queue Size: 0 | IO Queue Size: 0
Processing I/O-bound task: PID 529Processing CPU-bound task: PID 367

Completed I/O-bound task: PID 529
Processing I/O-bound task: PID 557
Completed CPU-bound task: PID 367
Processing CPU-bound task: PID 528
Completed I/O-bound task: PID 557
Processing I/O-bound task: PID 574
Completed I/O-bound task: PID 574
Processing I/O-bound task: PID 633
CPU Queue Size: 341 | IO Queue Size: 47
Completed CPU-bound task: PID 528
Processing CPU-bound task: PID 535
Completed I/O-bound task: PID 633
Processing I/O-bound task: PID 644
Completed I/O-bound task: PID 644
Processing I/O-bound task: PID 654


## Evaluate Model Performance


In [None]:
import psutil
import pandas as pd

def get_windows_process_data():
    """Collect a snapshot of the running processes for evaluation.

    Returns:
        pandas.DataFrame with one row per process and the columns
        ``pid``, ``utime`` (user-mode CPU time), ``stime`` (kernel-mode CPU
        time), ``priority`` (the value psutil reports as ``nice``) and
        ``mem_usage`` (resident set size). The columns are present even
        when no process could be read.

    Any field psutil could not read is recorded as 0 instead of None/NaN,
    so the frame can be passed to the classifier without further cleaning.
    """
    columns = ["pid", "utime", "stime", "priority", "mem_usage"]
    records = []

    for proc in psutil.process_iter(attrs=['pid', 'cpu_times', 'memory_info', 'nice']):
        try:
            info = proc.info
            cpu_times = info['cpu_times']
            memory_info = info['memory_info']
            nice = info['nice']
            records.append({
                "pid": info['pid'],
                "utime": cpu_times.user if cpu_times else 0,      # User mode CPU time
                "stime": cpu_times.system if cpu_times else 0,    # Kernel mode CPU time
                # Same fallback as the other fields: an unreadable priority
                # arrives as None and would otherwise become NaN.
                "priority": nice if nice is not None else 0,      # Process priority
                "mem_usage": memory_info.rss if memory_info else 0,  # Memory usage
            })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            # The process vanished or is off-limits; skip it (best effort).
            continue

    # Passing `columns` keeps the schema stable for an empty snapshot.
    return pd.DataFrame(records, columns=columns)

# Get real-time Windows process data
# A fresh snapshot taken now, separate from the earlier one saved for training.
df_test = get_windows_process_data()
# index=False keeps the row index out of the CSV.
df_test.to_csv("windows_test_data2.csv", index=False)  # Save for reuse


In [None]:
import joblib
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load trained classifier
# NOTE(review): joblib.load is pickle-based per the joblib docs — only load trusted files.
model = joblib.load("process_classifier.pkl")

# Load test data: the snapshot written by the evaluation-data cell
# (windows_test_data2.csv), not windows_process_data.csv, which is the
# snapshot the model was trained on.
df_test = pd.read_csv("windows_test_data2.csv")

# Select exactly the features the model was trained on, in training order.
# This drops 'pid' (not a feature) and stays correct if the CSV gains columns.
# Missing values become 0, the collector's own fallback.
X_test = df_test[["utime", "stime", "priority", "mem_usage"]].fillna(0)



In [None]:
# Predict a class for every row of X_test; `y_pred` is reused by the
# report and confusion-matrix cells.
y_pred = model.predict(X_test)
# Keep each prediction next to the raw measurements it was made from.
df_test["predicted_class"] = y_pred
print(df_test.head())  # Show some predictions


In [None]:
# Generate pseudo ground truth (approximate classification):
# 1 when total CPU time (utime + stime) exceeds 0.5, otherwise 0.
# Vectorized comparison: same labels as a row-wise apply, and it also works
# when df_test is empty.
# NOTE(review): the model was trained on the rule `utime > stime`; this
# heuristic is a different rule, so the report below measures agreement
# between the two rules — confirm that is the intended evaluation.
df_test["true_label"] = ((df_test["utime"] + df_test["stime"]) > 0.5).astype(int)

# Compare model vs pseudo ground truth
y_test = df_test["true_label"]

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of pseudo ground truth (rows) against model output (columns)
cm = confusion_matrix(y_test, y_pred)
class_names = ["I/O-Bound", "CPU-Bound"]

# Explicit figure/axes handles instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix (Windows Process Classification)")
plt.show()
