In [2]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# 2. Define Paths
import os

# Base project directory in Drive
BASE_DIR = '/content/drive/MyDrive/spacecraft_anomaly_project'

# Paths
RAW_DIR = os.path.join(BASE_DIR, 'data', 'raw')
PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')

# Create folders if not present
os.makedirs(PROCESSED_DIR, exist_ok=True)

DATASET_PATH = os.path.join(RAW_DIR, 'dataset.csv')
SEGMENTS_PATH = os.path.join(RAW_DIR, 'segments.csv')  # For later use

In [4]:
# 3. Load dataset.csv
import pandas as pd

df = pd.read_csv(DATASET_PATH)
print(f"Loaded dataset.csv with shape: {df.shape}")
df.head()

Loaded dataset.csv with shape: (2123, 23)


Unnamed: 0,segment,anomaly,train,channel,sampling,duration,len,mean,var,std,...,smooth10_n_peaks,smooth20_n_peaks,diff_peaks,diff2_peaks,diff_var,diff2_var,gaps_squared,len_weighted,var_div_duration,var_div_len
0,1,1,1,CADC0872,1,279,280,8.533143e-07,3.494283e-10,1.9e-05,...,3,2,4,6,1.271176e-10,2.960666e-10,309,280,1.252431e-12,1.247958e-12
1,2,1,1,CADC0872,1,476,477,-3.639396e-06,6.476485e-10,2.5e-05,...,1,1,5,8,1.489383e-12,3.004752e-12,644,477,1.360606e-12,1.357754e-12
2,3,1,1,CADC0872,1,594,595,1.170788e-05,5.592877e-10,2.4e-05,...,2,2,2,3,4.11228e-12,1.029918e-11,772,595,9.415618e-13,9.399794e-13
3,4,1,1,CADC0872,1,271,272,8.486808e-07,5.466024e-10,2.3e-05,...,2,2,3,6,2.47576e-11,6.240985e-11,339,272,2.016983e-12,2.009568e-12
4,5,0,0,CADC0872,1,255,257,1.058485e-05,5.279023e-10,2.3e-05,...,1,1,78,87,5.547101e-13,7.035422e-13,357,257,2.070205e-12,2.054094e-12


In [5]:
# 4. Drop non-feature columns
import numpy as np
non_feature_cols = ['timestamp', 'anomaly'] if 'anomaly' in df.columns else ['timestamp']
features = df.drop(columns=non_feature_cols, errors='ignore')
# Keep only numeric columns for features
features = features.select_dtypes(include=[np.number])  # 💡 THIS LINE IS THE FIX

print(f"Using {features.shape[1]} numeric features")

print(f"Feature matrix shape: {features.shape}")

Using 21 numeric features
Feature matrix shape: (2123, 21)


In [12]:
# 5. Normalize the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# Save the scaler (optional)
import joblib
joblib.dump(scaler, os.path.join(PROCESSED_DIR, 'scaler.pkl'))

print("Features normalized.")

Features normalized.


In [7]:
# 6. Extract anomaly labels (if present)
import numpy as np

anomalies = df['anomaly'].values if 'anomaly' in df.columns else None
if anomalies is not None:
    print(f"Anomalies present: {np.sum(anomalies)} ({np.mean(anomalies)*100:.2f}%)")

Anomalies present: 434 (20.44%)


In [8]:
# 7. Create sliding windows
def create_sliding_windows(X, window_size=30, stride=1, labels=None):
    X_windows = []
    label_windows = []

    for i in range(0, len(X) - window_size + 1, stride):
        window = X[i:i+window_size]
        X_windows.append(window)

        if labels is not None:
            label = 1 if np.any(labels[i:i+window_size]) else 0
            label_windows.append(label)

    X_windows = np.array(X_windows)
    label_windows = np.array(label_windows) if labels is not None else None
    return X_windows, label_windows

# Create windows
window_size = 30
stride = 1

X_windows, y_windows = create_sliding_windows(X_scaled, window_size, stride, anomalies)

print(f"Sliding window shape: {X_windows.shape}")
if y_windows is not None:
    print(f"Label shape: {y_windows.shape} | % Anomalous windows: {np.mean(y_windows)*100:.2f}%")

Sliding window shape: (2094, 30, 21)
Label shape: (2094,) | % Anomalous windows: 82.57%


In [9]:
# 8. Save preprocessed data
np.save(os.path.join(PROCESSED_DIR, 'X_windows.npy'), X_windows)
if y_windows is not None:
    np.save(os.path.join(PROCESSED_DIR, 'y_windows.npy'), y_windows)

print("✅ Preprocessed data saved to 'processed/' folder.")

✅ Preprocessed data saved to 'processed/' folder.


In [1]:
# Save the full scaled data (to be used later for anomaly detection)
np.save(os.path.join(PROCESSED_DIR, 'X_full.npy'), X_scaled)
print("✅ X_full.npy saved in processed/")

NameError: name 'np' is not defined