In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [8]:
# Load the AnomaData CSV straight from its GitHub raw URL
DATA_URL = "https://raw.githubusercontent.com/Chatterjeesoma16/AnomaData/main/AnomaData.csv"
data = pd.read_csv(DATA_URL)

In [9]:
# Display basic information about the DataFrame: number of rows, column
# names, non-null counts per column and each column's dtype
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18398 entries, 0 to 18397
Data columns (total 62 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   time    18398 non-null  object 
 1   y       18398 non-null  int64  
 2   x1      18398 non-null  float64
 3   x2      18398 non-null  float64
 4   x3      18398 non-null  float64
 5   x4      18398 non-null  float64
 6   x5      18398 non-null  float64
 7   x6      18398 non-null  float64
 8   x7      18398 non-null  float64
 9   x8      18398 non-null  float64
 10  x9      18398 non-null  float64
 11  x10     18398 non-null  float64
 12  x11     18398 non-null  float64
 13  x12     18398 non-null  float64
 14  x13     18398 non-null  float64
 15  x14     18398 non-null  float64
 16  x15     18398 non-null  float64
 17  x16     18398 non-null  float64
 18  x17     18398 non-null  float64
 19  x18     18398 non-null  float64
 20  x19     18398 non-null  float64
 21  x20     18398 non-null  float64
 22

In [10]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns
data.describe()


Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x51,x52,x54,x55,x56,x57,x58,x59,x60,y.1
count,18398.0,18398.0,18398.0,18398.0,18398.0,18398.0,18398.0,18398.0,18398.0,18398.0,...,18398.0,18398.0,18398.0,18398.0,18398.0,18398.0,18398.0,18398.0,18398.0,18398.0
mean,0.00674,0.011824,0.157986,0.5693,-9.958345,0.006518,2.387533,0.001647,-0.004125,-0.003056,...,-3.357339,0.380519,0.173708,2.379154,9.234953,0.233493,-0.001861,-0.061522,0.001258,0.001033
std,0.081822,0.742875,4.939762,5.937178,131.033712,0.634054,37.104012,0.10887,0.07546,0.156047,...,348.256716,6.211598,3.029516,67.940694,81.274103,2.326838,0.048732,10.394085,0.004721,0.03212
min,0.0,-3.787279,-17.31655,-18.198509,-322.78161,-1.623988,-279.40844,-0.429273,-0.451141,-0.120087,...,-3652.989,-187.94344,-8.21037,-230.57403,-269.0395,-12.64037,-0.14979,-100.8105,-0.012229,0.0
25%,0.0,-0.405681,-2.158235,-3.537054,-111.378372,-0.446787,-24.345268,-0.05852,-0.051043,-0.059966,...,29.984624,-3.672684,0.48778,-40.050046,-45.519149,-1.598804,0.00047,0.295023,-0.001805,0.0
50%,0.0,0.128245,-0.075505,-0.190683,-14.881585,-0.120745,10.528435,-0.009338,-0.000993,-0.030057,...,29.984624,0.294846,0.702299,17.471317,1.438806,0.085826,0.012888,0.734591,0.00071,0.0
75%,0.0,0.421222,2.319297,3.421223,92.199134,0.325152,32.172974,0.060515,0.038986,0.00199,...,29.984624,5.109543,2.675751,44.093387,63.209681,2.222118,0.020991,1.266506,0.004087,0.0
max,1.0,3.054156,16.742105,15.900116,334.694098,4.239385,96.060768,1.70559,0.788826,4.060033,...,40.152348,14.180588,6.637265,287.252017,252.147455,6.922008,0.067249,6.98546,0.02051,1.0


In [12]:
# Count the null entries in every column
missing_values = data.isna().sum()
print("Missing values:\n", missing_values)

Missing values:
 time    0
y       0
x1      0
x2      0
x3      0
       ..
x57     0
x58     0
x59     0
x60     0
y.1     0
Length: 62, dtype: int64


In [26]:
# Treat missing values: drop every row that contains at least one null.
# Rebinding (rather than inplace=True) keeps the step explicit and chainable.
data = data.dropna()

In [29]:
# Treat outliers using the Z-score method
def treat_outliers_zscore(data, thresold=3):
    """Remove observations whose absolute z-score is not below ``thresold``.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Values to screen. For a DataFrame, z-scores are computed column by
        column on the numeric columns only; non-numeric columns are carried
        through unchanged and never cause a row to be dropped.
    thresold : float, default 3
        Cut-off applied to ``|z|``. The spelling is kept as-is so existing
        keyword calls keep working.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Same type as ``data``. For a DataFrame, only the rows in which every
        numeric column is within the cut-off are kept (whole rows are removed;
        no cells are blanked to NaN).

    Notes
    -----
    Pass only the columns that should be screened: a rare-event label column
    will itself look like an outlier under a z-score test.
    """
    if isinstance(data, pd.DataFrame):
        numeric = data.select_dtypes(include=np.number)
        z_scores = np.abs((numeric - numeric.mean()) / numeric.std())
        # A NaN z-score comes from a constant column (0 / 0) or a missing
        # value; neither is evidence of an outlier, so do not drop on it.
        within_limit = (z_scores < thresold) | z_scores.isna()
        # Boolean *row* mask -> rows are filtered. Indexing with the boolean
        # frame itself would only replace offending cells with NaN.
        return data[within_limit.all(axis=1)]

    # Series input: a boolean Series mask already filters element-wise.
    z_scores = np.abs((data - data.mean()) / data.std())
    return data[z_scores < thresold]

In [30]:
# Extract features and target variables
features = data.drop(columns=['time', 'y', 'y.1'])
target_y = data['y']
target_y1 = data['y.1']


In [31]:
# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [16]:
# Convert the scaled features back to a DataFrame
scaled_features_df = pd.DataFrame(scaled_features, columns=features.columns)
print(scaled_features_df.head())

         x1        x2        x3        x4        x5        x6        x7  \
0  0.491135 -0.962506 -0.785758  0.179012 -0.197698 -0.621443 -0.008403   
1  0.624478 -0.951588 -0.772721  0.199870 -0.213317 -0.569916 -0.008403   
2  0.473881 -0.979705 -0.829111  0.183823 -0.228936 -0.545082  0.084105   
3  0.390072 -0.995403 -0.773606  0.176447 -0.243928 -0.563435  0.003933   
4  0.341594 -0.993580 -0.825743  0.192518 -0.255240 -0.536168 -0.008403   

         x8        x9       x10  ...       x50       x51       x52       x54  \
0 -0.755236 -0.364704 -0.334939  ...  1.656745  0.095742  1.563441 -1.686831   
1 -0.755236 -0.364704 -0.334939  ...  1.656065  0.095742  1.564109 -1.687077   
2 -0.755236 -0.173032 -0.148717  ...  1.655384  0.095742  1.564816 -1.687323   
3 -0.755236 -0.108492 -0.054165  ...  1.654703  0.095742  1.565524 -1.687569   
4 -0.755236 -0.173032 -0.054165  ...  1.654022  0.095742  1.566231 -1.687815   

        x55       x56       x57       x58       x59       x60  
0 -0

In [32]:
from sklearn.ensemble import IsolationForest

# Fit on the feature columns only. This cell adds an 'anomaly' column to
# scaled_features_df further down; dropping it here (if present) keeps a
# re-run from feeding the previous predictions back in as a feature.
model_inputs = scaled_features_df.drop(columns=['anomaly'], errors='ignore')

# Train the Isolation Forest model
# - contamination: expected share of anomalies; adjust to the data set
# - random_state: fixed so the flagged rows are reproducible between runs
model = IsolationForest(contamination=0.01, random_state=42)
model.fit(model_inputs)

# Predict anomalies (the model returns -1 for outliers and 1 for inliers)
raw_predictions = model.predict(model_inputs)

# Map predictions to the target variable format (1 for anomaly, 0 for normal)
anomalies = [1 if x == -1 else 0 for x in raw_predictions]

# Add the anomalies to the DataFrame
scaled_features_df['anomaly'] = anomalies

# Check the results
print(scaled_features_df['anomaly'].value_counts())


anomaly
0    18214
1      184
Name: count, dtype: int64


In [33]:
from sklearn.metrics import classification_report

# Score the Isolation Forest flags against the `y` column and show the
# per-class precision / recall / F1 table
report = classification_report(target_y, anomalies)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     18274
           1       0.01      0.01      0.01       124

    accuracy                           0.98     18398
   macro avg       0.50      0.50      0.50     18398
weighted avg       0.99      0.98      0.98     18398

