In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ana-verse-2-0-j/train.parquet
/kaggle/input/ana-verse-2-0-j/sample_submission.parquet
/kaggle/input/ana-verse-2-0-j/test.parquet


The objective of this problem is to predict anomalies based on sensor readings 
collected from an energy manufacturing plant.

In [2]:
import pandas as pd

train = pd.read_parquet('/kaggle/input/ana-verse-2-0-j/train.parquet')
test = pd.read_parquet('/kaggle/input/ana-verse-2-0-j/test.parquet')

train.head(), train.info(), train['target'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1639424 entries, 0 to 1639423
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   Date    1639424 non-null  datetime64[ns]
 1   X1      1639424 non-null  float64       
 2   X2      1639424 non-null  float64       
 3   X3      1639424 non-null  float64       
 4   X4      1639424 non-null  float64       
 5   X5      1639424 non-null  float64       
 6   target  1639424 non-null  object        
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 87.6+ MB


(        Date        X1        X2   X3        X4        X5 target
 0 2020-12-16  1.518921  5.463154  1.0  2.718282  2.890372      0
 1 2020-12-16  1.546509  5.458010  1.0  2.718282  2.833213      1
 2 2020-12-16  1.645427  5.456560  1.0  7.389056  2.890372      1
 3 2020-12-16  1.652022  5.458479  1.0  2.718282  2.890372      1
 4 2020-12-16  1.695538  5.466709  1.0  2.718282  2.890372      0,
 None,
 target
 0    1625386
 1      14038
 Name: count, dtype: int64)

The dataset is highly imbalanced, with anomalies forming only around 0.8% 
of the total data. Therefore, accuracy alone is not sufficient and F1 score 
is used as the primary evaluation metric.

In [3]:

train['target'] = train['target'].astype(int)


train['hour'] = train['Date'].dt.hour
train['day'] = train['Date'].dt.day
train['weekday'] = train['Date'].dt.weekday

test['hour'] = test['Date'].dt.hour
test['day'] = test['Date'].dt.day
test['weekday'] = test['Date'].dt.weekday


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

X = train.drop(['target', 'Date'], axis=1)
y = train['target']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print("F1:", f1_score(y_val, y_pred))


F1: 0.0


ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
import numpy as np
from sklearn.metrics import precision_recall_curve

y_probs = model.predict_proba(X_val)[:,1]
precision, recall, thresholds = precision_recall_curve(y_val, y_probs)
f1_scores = 2 * precision * recall / (precision + recall)

best_threshold = thresholds[np.argmax(f1_scores)]
best_threshold


np.float64(0.5)

In [6]:
y_custom = (y_probs > best_threshold).astype(int)
print("Optimized F1:", f1_score(y_val, y_custom))


Optimized F1: 0.0


Since the dataset is extremely imbalanced, 
using default 0.5 threshold leads to poor recall. 
Therefore, I optimized the decision threshold using 
precision-recall curve to maximize F1 score.


In [7]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)

rf.fit(X_train, y_train)
y_probs = rf.predict_proba(X_val)[:,1]

from sklearn.metrics import precision_recall_curve, f1_score
import numpy as np

precision, recall, thresholds = precision_recall_curve(y_val, y_probs)
f1_scores = 2 * precision * recall / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]

y_custom = (y_probs > best_threshold).astype(int)
print("RF Best threshold:", best_threshold)
print("RF Optimized F1:", f1_score(y_val, y_custom))


RF Best threshold: 0.9426767788865519
RF Optimized F1: 0.5754260089686098


In [8]:
for col in ['X1','X2','X3','X4','X5']:
    train[f'{col}_roll_mean_5'] = train[col].rolling(5).mean()
    train[f'{col}_roll_std_5'] = train[col].rolling(5).std()
    
    test[f'{col}_roll_mean_5'] = test[col].rolling(5).mean()
    test[f'{col}_roll_std_5'] = test[col].rolling(5).std()

train = train.fillna(0)
test = test.fillna(0)


I added rolling mean and rolling standard deviation features to capture 
short-term temporal behavior of sensors, which is important for anomaly 
detection in time-series-like data.

Tree-based ensemble models are well suited for this problem since the 
relationship between sensor readings and anomalies is highly non-linear 
and complex. Therefore, CatBoost is used as the final model.

In [9]:
from catboost import CatBoostClassifier

cb = CatBoostClassifier(
    iterations=300,
    depth=8,
    learning_rate=0.1,
    loss_function='Logloss',
    eval_metric='F1',
    verbose=0
)

cb.fit(X_train, y_train)
y_probs = cb.predict_proba(X_val)[:,1]


In [10]:
from sklearn.metrics import precision_recall_curve, f1_score
import numpy as np

precision, recall, thresholds = precision_recall_curve(y_val, y_probs)
f1_scores = 2 * precision * recall / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]

y_custom = (y_probs > best_threshold).astype(int)

print("CatBoost Best threshold:", best_threshold)
print("CatBoost Optimized F1:", f1_score(y_val, y_custom))


CatBoost Best threshold: 0.34403520796379533
CatBoost Optimized F1: 0.7297830374753451


In [11]:
cb.fit(X, y)


<catboost.core.CatBoostClassifier at 0x7fb555b1aa80>

In [12]:
test_probs = cb.predict_proba(test.drop('Date', axis=1))[:,1]
test_preds = (test_probs > best_threshold).astype(int)


In [13]:
import os
os.listdir("/kaggle/input")


['ana-verse-2-0-j']

In [14]:
os.listdir("/kaggle/input/ana-verse-2-0-j")


['train.parquet', 'sample_submission.parquet', 'test.parquet']

In [15]:
import pandas as pd

test_df = pd.read_parquet("/kaggle/input/ana-verse-2-0-j/test.parquet")

submission = pd.DataFrame({
    "ID": test_df["ID"],
    "target": test_preds
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,ID,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
