In [1]:
def count_values(arr):
  unique_values, counts = np.unique(arr, return_counts=True)

  # Print the unique values and their frequencies
  for value, count in zip(unique_values, counts):
      print(f"{value}: {count}")

In [2]:
def split_data_balanced(X, y, test_size=0.2, random_state=None):
    # Find unique labels and their counts
    unique_labels, label_counts = np.unique(y, return_counts=True)

    # Find the minimum label count
    min_label_count = np.min(label_counts)

    # Split the data for each label, ensuring balanced classes in the test set
    X_train, X_test, y_train, y_test = [], [], [], []
    for label in unique_labels:
        # Split the data for the current label
        X_label = X[y == label]
        y_label = y[y == label]
        X_label_train, X_label_test, y_label_train, y_label_test = train_test_split(
            X_label, y_label, test_size=test_size, random_state=random_state
        )

        # Add the split data to the overall train and test sets
        X_train.append(X_label_train)
        X_test.append(X_label_test)
        y_train.append(y_label_train)
        y_test.append(y_label_test)

    # Concatenate the data from all labels
    X_train = np.concatenate(X_train)
    X_test = np.concatenate(X_test)
    y_train = np.concatenate(y_train)
    y_test = np.concatenate(y_test)

    return X_train, X_test, y_train, y_test


In [3]:
def segmantation(X,y,window_length=36 ,step_size=1):
      # Define sliding window parameters
      window_length = 36  # Length of each segment
      step_size = 1  # Amount of overlap between segments
      # Segment the time series data with sliding window
      segments = []
      labels = []

      for i in range(0, len(X) - window_length, step_size):
          segment = X[i:i+window_length]
          segments.append(segment)
          
          # Assign label to the segment based on the presence of anomalies
          segment_labels = y[i:i+window_length]
          if np.any(segment_labels == 1):
              label = 1  # Anomaly present
          else:
              label = 0  # No anomaly
          labels.append(label)

      # Convert segments and labels to numpy arrays
      segments = np.array(segments)
      labels = np.array(labels)
      return segments,labels

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

file_path =r"C:\Users\hp\Desktop\M2\PFE\Code\code pfe\Coud source\Code\machine learnig algorithme test\data_kick.xlsx"
df=pd.read_excel(file_path)

In [6]:
X = df[[ 'TVA (m3)', 'SPPA (kPa)', 'MFOP ((m3/s)/(m3/s))', 'GASA (mol/mol)']]
y = df['STATUS']

In [49]:
df[df['STATUS']==1].index

Int64Index([14571, 14572, 14573, 14574, 14575, 14576, 14577, 14578, 14579,
            14580,
            ...
            53240, 53241, 53242, 53243, 53244, 53245, 53246, 53247, 53248,
            53249],
           dtype='int64', length=1238)

In [7]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Select the columns to normalize
# Perform the min-max normalization
X= scaler.fit_transform(X)

In [8]:
window=36
segments,labels= segmantation(X,y,window_length=window ,step_size=1)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = split_data_balanced(segments, labels, test_size=0.2)
print(X_train.shape,X_test.shape)
# Reshape the feature matrices for SVM
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
print(X_train.shape,X_test.shape)

(42570, 36, 4) (10644, 36, 4)
(42570, 144) (10644, 144)


In [10]:
count_values(y_test)
count_values(y_train)

0: 10354
1: 290
0: 41413
1: 1157


In [None]:
pip install tslearn

In [11]:
from tslearn.metrics import dtw
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
knn = KNeighborsClassifier(n_neighbors=3,metric=dtw)
knn.fit(X_train, y_train)


In [35]:
min=0.2
max=0.8
x=0.6
print((x-min)/(max-min))

0.6666666666666665


In [12]:

y_prd=[]

In [22]:
for i in range(len(y_test) - 1,9642 , -1): 
    y_pred = knn.predict(X_test[i-1:i])
    y_prd.append(y_pred[0])
    print(i,i-1)

In [None]:

for i in range(1000):
     y_pred = knn.predict(X_test[i:i+1])
     y_prd.append(y_pred[0])
     print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42


In [90]:
count_values(y_prd)

0: 999
1: 1


In [92]:
count_values(y_test[0:1000])

0: 1001


In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score,roc_auc_score

print(confusion_matrix(y_test[0:1000],y_prd))
print("Accuracy:", accuracy_score(y_test[0:1000],y_prd))
print("Precision:", precision_score(y_test[0:1000],y_prd))
print("Recall:", recall_score(y_test[0:1000],y_prd))
print("F1 Score:",f1_score(y_test[0:1000],y_prd))

In [96]:
y_pred = knn.predict(X_test)

In [98]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score,roc_auc_score

print(confusion_matrix(y_test,y_pred))
print("Accuracy:", accuracy_score(y_test,y_pred))
print("Precision:", precision_score(y_test,y_pred))
print("Recall:", recall_score(y_test,y_pred))
print("F1 Score:",f1_score(y_test,y_pred))


[[10342    12]
 [   95   195]]
Accuracy: 0.9899473881999249
Precision: 0.9420289855072463
Recall: 0.6724137931034483
F1 Score: 0.7847082494969819


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
print(X_train.shape,X_test.shape)
# Reshape the feature matrices for SVM
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
print(X_train.shape,X_test.shape)

(42594, 4) (10649, 4)
(42594, 4) (10649, 4)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from scipy.spatial.distance import cdist

def dtw_distance(ts1, ts2, window):
    """
    Computes the DTW distance between two time series with a given window.
    """
    n1, n2 = len(ts1), len(ts2)
    w = np.max([window, abs(n1 - n2)])
    dtw = np.full((n1 + 1, n2 + 1), np.inf)
    dtw[0, 0] = 0
    for i in range(1, n1 + 1):
        for j in range(np.max([1, i - w]), np.min([n2, i + w]) + 1):
            cost = abs(ts1[i - 1] - ts2[j - 1])
            dtw[i, j] = cost + np.min([dtw[i - 1, j], dtw[i, j - 1], dtw[i - 1, j - 1]])
    return dtw[-1, -1]

# Define the kNN classifier with the custom DTW distance metric
knn = KNeighborsClassifier(n_neighbors=5, metric=lambda x, y: dtw_distance(x, y, window=36))

# Fit the classifier to the training data



In [None]:
knn.fit(X_train, y_train)



(10649,)

In [None]:
# Make predictions on new data
y_pred = knn.predict(X_test)