In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

In [92]:
df = pd.read_csv('biogas.csv')

In [93]:
df

Unnamed: 0,timestamp,ph,biogas_production,anomaly,cause
0,2024-01-01 08:00:00,6.87,34.81,1,
1,2024-01-01 10:00:00,7.45,50.21,1,
2,2024-01-01 12:00:00,7.23,35.38,1,
3,2024-01-01 14:00:00,7.10,50.82,1,
4,2024-01-01 16:00:00,6.66,36.89,1,
...,...,...,...,...,...
2995,2024-09-06 22:00:00,7.37,58.44,0,
2996,2024-09-07 00:00:00,6.66,37.76,0,
2997,2024-09-07 02:00:00,8.29,15.35,0,pH tinggi
2998,2024-09-07 04:00:00,6.79,42.07,0,


In [81]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib

df['timestamp'] = pd.to_datetime(df['timestamp'])

df['hour'] = df['timestamp'].dt.hour
df['day'] = df['timestamp'].dt.day
df['month'] = df['timestamp'].dt.month
df['day_of_week'] = df['timestamp'].dt.dayofweek

In [94]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['day'] = df['timestamp'].dt.day
df['month'] = df['timestamp'].dt.month
df['day_of_week'] = df['timestamp'].dt.dayofweek

In [95]:
X = df[['ph', 'biogas_production', 'hour', 'day', 'month', 'day_of_week']]
y = df[['anomaly', 'cause']]

In [96]:
print("\nMissing values in X:", X.isna().sum())
print("Missing values in y:", y.isna().sum())


Missing values in X: ph                   0
biogas_production    0
hour                 0
day                  0
month                0
day_of_week          0
dtype: int64
Missing values in y: anomaly       0
cause      2700
dtype: int64


In [97]:
for column in X.columns:
    X[column] = X[column].fillna(X[column].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = X[column].fillna(X[column].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = X[column].fillna(X[column].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = X[column].fillna(X[column].median())
A value is trying to be set on a copy of a slice from a Da

In [98]:
rows_before = len(y)
non_na_indices = y.dropna().index
X = X.loc[non_na_indices]
y = y.loc[non_na_indices]
rows_after = len(y)
print(f"{rows_before - rows_after}")


2700


In [99]:
cause_encoder = LabelEncoder()
cause_encoded = cause_encoder.fit_transform(y['cause'])

In [100]:
cause_mapping = dict(zip(cause_encoder.classes_, cause_encoder.transform(cause_encoder.classes_)))
print("Cause mapping:", cause_mapping)

Cause mapping: {'Maintenance': 0, 'Produksi naik drastis': 1, 'Produksi turun': 2, 'pH rendah': 3, 'pH tinggi': 4}


In [101]:
y['cause'] = cause_encoded
print(y)

      anomaly  cause
16          1      4
60          1      1
72          1      0
89          1      4
91          1      4
...       ...    ...
2964        0      2
2972        0      3
2985        0      1
2990        0      4
2997        0      4

[300 rows x 2 columns]


In [102]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)

In [104]:
model = LinearRegression()
model.fit(X_train, y_train)

In [105]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training R² score: {train_score:.4f}")
print(f"Testing R² score: {test_score:.4f}")

Training R² score: 0.2508
Testing R² score: 0.1780


In [106]:
y_pred = model.predict(X_test)

In [107]:
threshold = 0.5
y_pred_anomaly = y_pred[:, 0]
y_pred_anomaly_binary = (y_pred_anomaly >= threshold).astype(int)

In [108]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy = accuracy_score(y_test['anomaly'], y_pred_anomaly_binary)
print(f"Anomaly detection accuracy (with 0.5 threshold): {accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test['anomaly'], y_pred_anomaly_binary))
print("Classification Report:")
print(classification_report(y_test['anomaly'], y_pred_anomaly_binary))

Anomaly detection accuracy (with 0.5 threshold): 0.9333
Confusion Matrix:
[[56  0]
 [ 4  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        56
           1       0.00      0.00      0.00         4

    accuracy                           0.93        60
   macro avg       0.47      0.50      0.48        60
weighted avg       0.87      0.93      0.90        60



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [109]:
y_pred_cause = np.clip(np.round(y_pred[:, 1]), 0, len(cause_encoder.classes_) - 1).astype(int)
cause_accuracy = accuracy_score(y_test['cause'], y_pred_cause)
print(f"\nCause prediction accuracy: {cause_accuracy:.4f}")


Cause prediction accuracy: 0.3500


In [110]:
model_package = {
    'model': model,
    'scaler': scaler,
    'cause_encoder': cause_encoder,
    'cause_mapping': cause_mapping,
    'feature_columns': X.columns.tolist()
}

In [111]:
joblib.dump(model_package, 'biogas_anomaly_model.pkl')

['biogas_anomaly_model.pkl']

In [112]:
print("Example predictions:")
for i in range(min(5, len(y_test))):
    true_anomaly = y_test['anomaly'].iloc[i]
    true_cause_id = y_test['cause'].iloc[i]
    true_cause = cause_encoder.inverse_transform([true_cause_id])[0]
    
    pred_anomaly = y_pred_anomaly_binary[i]
    pred_cause_id = y_pred_cause[i]
    pred_cause = cause_encoder.inverse_transform([pred_cause_id])[0]
    
    print(f"Sample {i+1}:")
    print(f"  True: Anomaly = {true_anomaly}, Cause = '{true_cause}'")
    print(f"  Pred: Anomaly = {pred_anomaly}, Cause = '{pred_cause}'")
    print("  --")

Example predictions:
Sample 1:
  True: Anomaly = 0, Cause = 'Produksi turun'
  Pred: Anomaly = 0, Cause = 'Produksi turun'
  --
Sample 2:
  True: Anomaly = 0, Cause = 'Maintenance'
  Pred: Anomaly = 0, Cause = 'Produksi turun'
  --
Sample 3:
  True: Anomaly = 0, Cause = 'pH rendah'
  Pred: Anomaly = 0, Cause = 'Produksi naik drastis'
  --
Sample 4:
  True: Anomaly = 1, Cause = 'Produksi turun'
  Pred: Anomaly = 0, Cause = 'Produksi turun'
  --
Sample 5:
  True: Anomaly = 0, Cause = 'pH tinggi'
  Pred: Anomaly = 0, Cause = 'pH rendah'
  --


In [113]:
# Add a cell for interactive authentication
from azureml.core.authentication import InteractiveLoginAuthentication

auth = InteractiveLoginAuthentication()
ws = Workspace.from_config(auth=auth)
print("Re-authenticated workspace connection")

Re-authenticated workspace connection


In [114]:
from azureml.core import Model

registered_model = Model.register(
    model_path="biogas_anomaly_model.pkl",
    model_name="model_biogas",
    workspace=ws
)
print("Model registered: ", registered_model.name)

Registering model model_biogas
Model registered:  model_biogas


In [115]:
from azure.ai.ml import MLClient
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment
from azure.identity import InteractiveBrowserCredential

credential = InteractiveBrowserCredential()

ml_client = MLClient(
    credential=credential,
    subscription_id="ca50b345-b6d7-4d97-be52-8847c2e0321e",
    resource_group_name="318",  
    workspace_name="bioserde_ml"
)

endpoint = ManagedOnlineEndpoint(
    name="biogas-endpoint",
    description="Biogas anomaly detection endpoint",
    auth_mode="key"
)

ml_client.online_endpoints.begin_create_or_update(endpoint).result()

Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


ManagedOnlineEndpoint({'public_network_access': 'Enabled', 'provisioning_state': 'Succeeded', 'scoring_uri': 'https://biogas-endpoint.southeastasia.inference.ml.azure.com/score', 'openapi_uri': 'https://biogas-endpoint.southeastasia.inference.ml.azure.com/swagger.json', 'name': 'biogas-endpoint', 'description': 'Biogas anomaly detection endpoint', 'tags': {}, 'properties': {'createdBy': 'JASON LEE', 'createdAt': '2025-05-29T13:18:40.329415+0000', 'lastModifiedAt': '2025-05-29T13:18:40.329415+0000', 'azureml.onlineendpointid': '/subscriptions/ca50b345-b6d7-4d97-be52-8847c2e0321e/resourcegroups/318/providers/microsoft.machinelearningservices/workspaces/bioserde_ml/onlineendpoints/biogas-endpoint', 'AzureAsyncOperationUri': 'https://management.azure.com/subscriptions/ca50b345-b6d7-4d97-be52-8847c2e0321e/providers/Microsoft.MachineLearningServices/locations/southeastasia/mfeOperationsStatus/oeidp:e8ae62c2-2570-49a1-b02e-0b782fcd2067:ace5f0cd-0e21-4a34-81d9-ee87a602a732?api-version=2022-02-

In [116]:
from azure.ai.ml import MLClient
from azure.identity import InteractiveBrowserCredential

# Authenticate
credential = InteractiveBrowserCredential()
ml_client = MLClient(
    credential=credential,
    subscription_id="ca50b345-b6d7-4d97-be52-8847c2e0321e",
    resource_group_name="318",
    workspace_name="bioserde_ml"
)

# Get endpoint
endpoint = ml_client.online_endpoints.get("biogas-endpoint")

# Get API keys
keys = ml_client.online_endpoints.get_keys("biogas-endpoint")
print("Primary Key:", keys.primary_key)
print("Secondary Key:", keys.secondary_key)

Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


Primary Key: A9fEmlMO1EKMBhyNQaeKYgJ2WoCm74lV4RqFIijf2231pWBD8JRIJQQJ99BEAAAAAAAAAAAAINFRAZML1c7h
Secondary Key: 2oIPpRD7Jyi0elXjBZvI209fNJCQDoPnQEaAOkPwy2nbvvjdhdtBJQQJ99BEAAAAAAAAAAAAINFRAZML1Vh7
