In [5]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import IsolationForest

# Synthetic data settings: 300 minutes sampled at 10 Hz gives
# 300 * 60 * 10 = 180,000 rows.
duration_minutes = 300
sampling_rate_hz = 10  # samples per second
num_samples = duration_minutes * 60 * sampling_rate_hz

# One timestamp per sample, 100 ms apart, starting at 2024-01-01 00:00:00.
timestamps = pd.date_range(start="2024-01-01", periods=num_samples, freq="100ms")

# Baseline readings. CPU temperature is drawn from a normal distribution
# (mean 50, std 5); every other metric is uniform over its (low, high) range.
_uniform_ranges = {
    "cpu_usage": (10, 50),
    "cpu_load": (0.1, 1.0),
    "memory_usage": (30, 70),
    "battery_level": (30, 100),
    "cpu_power": (10, 50),
}
data = {
    "cpu_temperature": np.random.normal(loc=50, scale=5, size=num_samples),
    # Expanded in declaration order, so both the column order and the order of
    # the random draws stay: temperature first, then the uniform metrics.
    **{
        metric: np.random.uniform(low, high, size=num_samples)
        for metric, (low, high) in _uniform_ranges.items()
    },
}

# Inject anomalies. Each sample is selected with probability 0.1. For a
# selected sample, every metric listed below is overwritten independently with
# its own probability, so a selected sample can end up with several anomalous
# metrics or with none at all. "cpu_load" is never modified.
anomaly_specs = (
    # (metric, overwrite probability, low, high)
    ("cpu_usage", 0.5, 90, 100),
    ("cpu_temperature", 0.3, 90, 105),
    ("memory_usage", 0.2, 95, 100),
    ("battery_level", 0.1, 0, 10),
    ("cpu_power", 0.1, 50, 100),
)
for row in range(num_samples):
    if random.random() >= 0.1:
        continue  # sample not selected; leave its baseline readings alone
    for metric, chance, low, high in anomaly_specs:
        if random.random() < chance:
            data[metric][row] = random.uniform(low, high)

# Destination of the full synthetic dataset.
dataset_path = "synthetic_timeseries_assignment.csv"

# Build the table from the metric columns, then place the timestamps in the
# first column.
df = pd.DataFrame(data)
df.insert(loc=0, column="timestamp", value=timestamps)

# Write the dataset to CSV without the row index.
df.to_csv(path_or_buf=dataset_path, index=False)

# Columns fed to the detector (the timestamp column is excluded).
features = [
    "cpu_temperature",
    "cpu_usage",
    "cpu_load",
    "memory_usage",
    "battery_level",
    "cpu_power",
]

# Fit an Isolation Forest on the feature columns and store its per-row
# prediction in a new "anomaly" column. The seed is fixed via random_state.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
df["anomaly"] = iso_forest.fit_predict(df[features])

# Rows predicted as -1 are the ones treated as anomalies; write them to their
# own CSV, again without the row index.
anomaly_path = "anomalies_detected.csv"
is_anomaly = df["anomaly"] == -1
anomalies = df.loc[is_anomaly]
anomalies.to_csv(anomaly_path, index=False)

# Cell output: both file paths, the anomaly count, and a preview of each table.
(
    dataset_path,
    anomaly_path,
    len(anomalies),
    df.head(),
    anomalies.head(),
)

('synthetic_timeseries_assignment.csv',
 'anomalies_detected.csv',
 18000,
                 timestamp  cpu_temperature  cpu_usage  cpu_load  memory_usage  \
 0 2024-01-01 00:00:00.000        42.086591  39.589476  0.407096     52.973650   
 1 2024-01-01 00:00:00.100        55.543377  43.421057  0.349086     32.325370   
 2 2024-01-01 00:00:00.200        57.078497  35.868730  0.265598     54.915142   
 3 2024-01-01 00:00:00.300        52.557258  29.405556  0.710548     55.842329   
 4 2024-01-01 00:00:00.400        55.992820  19.736890  0.341556     68.207572   
 
    battery_level  cpu_power  anomaly  
 0      95.424655  40.085416        1  
 1      39.392436  40.562987        1  
 2      67.662123  23.726054        1  
 3      30.414012  49.487910        1  
 4      34.392663  24.420063        1  ,
                  timestamp  cpu_temperature  cpu_usage  cpu_load  \
 9  2024-01-01 00:00:00.900        47.917220  99.114127  0.293928   
 17 2024-01-01 00:00:01.700        52.128580  97.183