Importing csv file and editing it to ahve another column  - experiment 1 below

In [1]:
import pandas as pd
import numpy as np
import csv

input_file_name = 'DryerPerformance_03.csv'  # Name of your input CSV file
output_file_name = 'DryerPerformance_03-output.csv'  # Name of the output CSV file with the additional column
column_name = 'Dryer1LbsPerHr'  # The column based on whose values you're adding a 1 or 0
new_column_name = 'RunningOptimally'  # Name of the new column to be added

# Read the input CSV file and write to the output CSV file with the additional column
with open(input_file_name, mode='r', newline='') as infile, open(output_file_name, mode='w', newline='') as outfile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames + [new_column_name]  # Add the new column name to the fieldnames
    
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    
    for row in reader:
        # Check the condition for the specific column and set the value of the new column accordingly
        # Adjust the condition below according to your requirements
        if float(row[column_name]) > 30000:
            row[new_column_name] = '1'
        else:
            row[new_column_name] = '0'
        
        writer.writerow(row)



experiment 2 below

In [None]:
import os
import pandas as pd
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

# Load your dataset
file_path = 'DryerPerformance_03-output.csv'  # Update this to the path of your CSV file
data = pd.read_csv(file_path)

# If necessary, convert your date column to datetime and sort by date
data['DateTime'] = pd.to_datetime(data['DateTime'])  # Replace 'Date' with your actual date column name
data.sort_values('DateTime', inplace=True)

# Replace 'Value' with the name of the column you want to analyze for anomalies
column_name = 'Value'  # Update this to the name of your feature column

# Isolation Forest for anomaly detection
model = IsolationForest(n_estimators=100, contamination=0.02, random_state=42)
data['anomaly'] = model.fit_predict(data[[column_name]])

# Visualize the data along with the anomalies
plt.figure(figsize=(10, 6))
plt.plot(data['Date'], data[column_name], color='blue', label='Normal')
plt.scatter(data['Date'][data['anomaly'] == -1], data[column_name][data['anomaly'] == -1], color='red', label='Anomaly')
plt.xlabel('Date')
plt.ylabel(column_name)
plt.title('Time Series Anomaly Detection')
plt.legend()
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

import matplotlib.pyplot as plt

# Load your dataset
file_path = 'DryerPerformance_03-output.csv'  # Update this with your actual file path
data = pd.read_csv(file_path)

# Assuming 'DateTime' is your datetime column and should be sorted
data['DateTime'] = pd.to_datetime(data['DateTime'])
data.sort_values('DateTime', inplace=True)

# Exclude 'RunningOptimally' from features to analyze
features = data.columns.drop(['DateTime', 'RunningOptimally'])

# Dictionary to hold anomaly counts for each feature
anomaly_counts = {}

for feature in features:
    # Isolation Forest model
    model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
    # Fit model on the feature
    anomalies = model.fit_predict(data[[feature]])
    # Convert anomalies to a binary flag (1 for anomaly, 0 for normal)
    anomaly_flags = (anomalies == -1).astype(int)
    
    # Compare detected anomalies with 'RunningOptimally' to see alignment
    comparison = anomaly_flags == data['RunningOptimally']
    # Count how many times anomalies align with 'RunningOptimally'=0
    anomaly_counts[feature] = comparison.value_counts().get(False, 0)

# Sort features based on count of aligned anomalies
sorted_anomaly_counts = sorted(anomaly_counts.items(), key=lambda x: x[1], reverse=True)

# Print features ranked by their alignment with 'RunningOptimally'
for feature, count in sorted_anomaly_counts:
    print(f"Feature: {feature}, Anomaly Alignments: {count}")


experiment 3 below

In [12]:
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.inspection import permutation_importance

# Load your dataset
file_path = 'DryerPerformance_03-output.csv'  # Update this with your actual file path
data = pd.read_csv(file_path)

# Assuming 'DateTime' is your datetime column and should be sorted
data['DateTime'] = pd.to_datetime(data['DateTime'])
data.sort_values('DateTime', inplace=True)

# Prepare features and labels
X = data.drop(['DateTime', 'RunningOptimally'], axis=1)
y = data['RunningOptimally']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the HistGradientBoostingClassifier
model = HistGradientBoostingClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, predictions))
print("Accuracy:", accuracy_score(y_test, predictions))

# Compute permutation importance
perm_importance = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)

# Get sorted feature importances
sorted_idx = perm_importance.importances_mean.argsort()[::-1]

# Print features ranked by their permutation importance
print("Feature ranking by permutation importance:")
for i in sorted_idx:
    print(f"{X.columns[i]}: {perm_importance.importances_mean[i]:.4f}")


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     45474
           1       1.00      1.00      1.00    154526

    accuracy                           1.00    200000
   macro avg       1.00      1.00      1.00    200000
weighted avg       1.00      1.00      1.00    200000

Accuracy: 0.998035
Feature ranking by permutation importance:
Dryer1LbsPerHr: 0.3488
Sec2RetAirTemp: 0.0000
Sec3SupAirTemp: 0.0000
Zone2Sec5SteamSupPress: 0.0000
Zone1BeltSpeed_1hz: 0.0000
Sec6RetAirTemp: 0.0000
Sec6SupplyAirTemp: 0.0000
Zone1RetAirAvgTemp: 0.0000
Zone2SupAirAvgTemp: 0.0000
Sec9DeltaT: 0.0000
Zone3Sec9SteamSupPress: 0.0000
ThruputTotalShift: 0.0000
Zone2RetAirAvgTemp: 0.0000
Sec6SupplyAirSetpoint: 0.0000
Sec5DeltaT: 0.0000
Sec3DeltaT: 0.0000
Sec9RetAirTemp: 0.0000
Sec9SupplyAirTemp: 0.0000
Zone1SupAirAvgTemp: 0.0000
Sec4DeltaT: 0.0000
Sec1DeltaT: 0.0000
OutfeedMoisture: 0.0000
Sec6SteamSupplyContValve: 0.0000
Zone3Sec8SteamSupPress: 0.0000
Sec4Supp

experiment number 4 below

In [16]:
import pandas as pd
pd.options.display.max_rows = 4000

import numpy as np

# Load your dataset
df = pd.read_csv('DryerPerformance_03-output.csv')

# Filter rows where 'RunningOptimally' is 0
df_not_optimal = df[df['RunningOptimally'] == 0]

# Initialize a dictionary to hold the count of anomalies for each feature
anomalies_count = {}

# Define an anomaly threshold (e.g., 2 standard deviations from the mean)
threshold = 2

# Loop through each feature column (excluding 'DateTime' and 'RunningOptimally')
for column in df_not_optimal.columns.drop(['DateTime', 'RunningOptimally', 'Dryer1LbsPerHr']):
    # Calculate the mean and standard deviation for the column
    mean = df_not_optimal[column].mean()
    std = df_not_optimal[column].std()
    
    # Define what is considered an anomaly (outside of threshold standard deviations)
    lower_bound = mean - threshold * std
    upper_bound = mean + threshold * std
    
    # Count how many values fall outside of the bounds (anomalies)
    anomalies = df_not_optimal[(df_not_optimal[column] < lower_bound) | (df_not_optimal[column] > upper_bound)]
    anomalies_count[column] = len(anomalies)

# Sort the dictionary by the count of anomalies in descending order
sorted_anomalies = sorted(anomalies_count.items(), key=lambda x: x[1], reverse=True)

# Convert to DataFrame for better visualization (optional)
sorted_anomalies_df = pd.DataFrame(sorted_anomalies, columns=['Feature', 'AnomaliesCount'])

# Display the sorted list of features by anomalies count
print(sorted_anomalies_df)


                                  Feature  AnomaliesCount
0          Zone_3_Return_Chamber_Pressure           42785
1                       Zone1SteamSupFlow           35857
2                     Sec7SupplyAirTempSP           32409
3                  Sec7SupplyAirMpcTempSP           32409
4    Sec7BoosterCoilSupplyAirTempSetpoint           32403
5                     Zone1SupAirSetpoint           32223
6                   Zone1ConveyorSpeedRef           28925
7                      Zone1RetAirAvgTemp           25396
8                             WetBinLevel           25065
9                          Sec2RetAirTemp           24994
10                         Sec1RetAirTemp           24673
11               Sec9SteamSupplyContValve           23617
12                         Sec3RetAirTemp           23565
13                         MpcConstrValve           23133
14                      Sec6SupplyAirTemp           22771
15                         Sec1SupAirTemp           22753
16            

========================================================================================================================================================================

# Anomaly detection for SaaS metrics using Random Cut Forests (RCF)
***

## Introduction
Amazon SageMaker Random Cut Forest (RCF) is an unsupervised algorithm for detecting anomalous data points within a data set. These are observations which diverge from otherwise well-structured or patterned data. Anomalies can manifest as unexpected spikes in time series data, breaks in periodicity, or unclassifiable data points. They are easy to describe in that, when viewed in a plot, they are often easily distinguishable from the "regular" data. Including these anomalies in a data set can drastically increase the complexity of a machine learning task since the "regular" data can often be described with a simple model.

Find the documentation [here](https://docs.aws.amazon.com/sagemaker/latest/dg/randomcutforest.html).

Deep dive video about RCF https://www.youtube.com/watch?v=9BWHR4JsTNU

This notebook is based on the ["Introduction to RCF" notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/random_cut_forest/random_cut_forest.ipynb). If you want more details on what is done here, feel free to check it out.



## Setup
*This notebook was tested in Amazon SageMaker Studio instance with Python 3 (conda_python3 kernel).*


First, ensure Sagemaker has read access to S3 and Athena.
Then adjust the following values:

In [1]:
# TODO: adjust s3_bucket and athena_db
s3_bucket = # "USERNAME-tenant-metrics" # an Amazon S3 bucket accessible by your account 
s3_prefix = "sagemaker/rcf"
athena_db = "\"tenants-metrics\".\"sample_data\"" #Adjust athena_db if needed 

SyntaxError: invalid syntax (619133805.py, line 2)

Adding import of system required libraries

In [2]:
import sys

# math
import numpy as np
import pandas as pd

# aws
import boto3
import botocore
#!conda install --yes --prefix {sys.prefix} PyAthena
!{sys.executable} -m pip install PyAthena
from pyathena import connect
import sagemaker
from sagemaker import RandomCutForest, Session, get_execution_role

# graphs
import matplotlib.pyplot as plt
%matplotlib inline

Defaulting to user installation because normal site-packages is not writeable
Collecting PyAthena
  Downloading pyathena-3.2.1-py3-none-any.whl.metadata (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading pyathena-3.2.1-py3-none-any.whl (80 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.6/80.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyAthena
Successfully installed PyAthena-3.2.1
[0m

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# initialize session
region = boto3.Session().region_name
sagemaker_session = sagemaker.Session()
execution_role = get_execution_role()

# define paths
s3_query_results = f's3://{s3_bucket}/{s3_prefix}/query-results/'

In [None]:
# check if the bucket exists
try:
    boto3.Session().client('s3').head_bucket(Bucket=s3_bucket)
except botocore.exceptions.ParamValidationError as e:
    print('Hey! You either forgot to specify your S3 bucket'
          ' or you gave your bucket an invalid name!')
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == '403':
        print("Hey! You don't have permission to access the bucket, {}.".format(s3_bucket))
    elif e.response['Error']['Code'] == '404':
        print("Hey! Your bucket, {}, doesn't exist!".format(s3_bucket))
    else:
        raise
else:
    print('Training input/output will be stored in: s3://{}/{}'.format(s3_bucket, s3_prefix))

## Querying S3 using Athena
The query gets the storage consumption per day of a certain tenant sorted by time. 
You can read more about querying S3 using Athena from Sagemaker over [here](https://aws.amazon.com/blogs/machine-learning/run-sql-queries-from-your-sagemaker-notebooks-using-amazon-athena/).

When plotting the data, it becomes clear that there are indeed some anomalies.

In [None]:
conn = connect(s3_staging_dir=s3_query_results, region_name=region)

# queries the storage consumption per day of tenant 3
query = f"""
WITH storage_consumption AS(
    SELECT
        date_trunc('day', from_unixtime(timestamp)) AS timestamp,
        metric.value                                AS storage
    FROM
        { athena_db }
    WHERE
        metric.name = 'Storage'
        AND
            tenant.id = 'tenant-id-3'
)

SELECT
    storage_consumption.timestamp,
    sum(storage)                   AS storage
FROM 
    storage_consumption
GROUP BY
    storage_consumption.timestamp
ORDER BY
    storage_consumption.timestamp ASC
"""
df = pd.read_sql(query, conn, index_col="timestamp", parse_dates="timestamp")
df = df[1:-1] # skip first and last day, because only half the day was monitored
df

In [None]:
df.plot();

## Setup Sagemaker and RCFs
Setup sagemaker to store the training releated data to s3.

In [None]:
session = Session()

# specify general training job information
rcf = RandomCutForest(role=execution_role,
                      instance_count=3,
                      instance_type='ml.m4.xlarge',
                      use_spot_instances=True,  # Use a spot instance 
                      max_run=300,  # Max training time
                      max_wait=600,  # Max training time + spot waiting time
                      data_location='s3://{}/{}/{}/'.format(s3_bucket, s3_prefix, "rcf"),
                      output_path='s3://{}/{}/{}/output'.format(s3_bucket, s3_prefix, "rcf"),
                      num_samples_per_tree=512,
                      num_trees=50)

## RCF Model Training
Train the RCF on our data and publish an endpoint. This will take around 15 minutes.

In [None]:
%%time

# convert the data to a numpy array
y = df.to_numpy()

# automatically upload the training data to S3 and run the training job
rcf.fit(rcf.record_set(y))
print('Training job name: {}'.format(rcf.latest_training_job.job_name))

In [None]:
%%time

print("Progress:")
rcf_inference = rcf.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)
print('Endpoint name: {}'.format(rcf_inference.endpoint_name))

## Model Inference
Run the prediction through the endpoint with our data

In [None]:
%%time

results = rcf_inference.predict(y)
scores = [datum.label["score"].float32_tensor.values[0] for datum in results]

results = df.copy()
results["score"] = scores
results.head()

In [None]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

ax1.plot(results['storage'], color='C0', alpha=0.8)
ax2.plot(results['score'], color='C1')

ax1.grid(which='major', axis='both')

ax1.set_ylabel('Storage Consumption', color='C0')
ax2.set_ylabel('Anomaly Score', color='C1')

ax1.tick_params('y', colors='C0')
ax2.tick_params('y', colors='C1')

ax2.set_ylim(min(scores), 1.4 * max(scores))
fig.set_figwidth(16)

score_mean = results['score'].mean()
score_std = results['score'].std()
score_cutoff = score_mean + 4 * score_std # adjust constant to tweak sensitivity

anomalies = results[results['score'] > score_cutoff]
ax2.plot(anomalies.index, anomalies.score, 'ko');

In [None]:
anomalies

In [None]:
# don't forget to delete the inference endpoint
Session().delete_endpoint(rcf_inference.endpoint_name)