In [1]:
import dask.dataframe as dd

# Read CSV using Dask
data = dd.read_csv("E:/semester8/Grad2/dataset_after_oversampling/binary_classification.csv")

# Use .compute() to execute operations and get the information
data_info = data.compute().info()
print(data_info)

# Get the number of rows and columns using .compute()
num_rows, num_columns = data.compute().shape
print("Number of rows4:", num_rows)
print("Number of columns4:", num_columns)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21087035 entries, 0 to 381465
Data columns (total 21 columns):
 #   Column           Dtype  
---  ------           -----  
 0   TP2              float64
 1   TP3              float64
 2   H1               float64
 3   DV_pressure      float64
 4   Reservoirs       float64
 5   Oil_temperature  float64
 6   Flowmeter        float64
 7   Motor_current    float64
 8   COMP             int64  
 9   DV_eletric       int64  
 10  Towers           int64  
 11  MPG              int64  
 12  LPS              int64  
 13  Pressure_switch  int64  
 14  Oil_level        int64  
 15  Caudal_impulses  int64  
 16  gpsLong          float64
 17  gpsLat           float64
 18  gpsSpeed         int64  
 19  gpsQuality       int64  
 20  status           int64  
dtypes: float64(10), int64(11)
memory usage: 3.5 GB
None
Number of rows4: 21087035
Number of columns4: 21


In [2]:
pandas_data = data.compute()
print(pandas_data.shape)

(21087035, 21)


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
label_column = 'status'
X = pandas_data.drop(label_column, axis=1)
y = pandas_data[label_column]

print(X)
print(y)

             TP2       TP3        H1  DV_pressure  Reservoirs  \
0      -0.012000  9.758000  9.760000       -0.028    1.576000   
1      -0.012000  9.760000  9.760000       -0.028    1.578000   
2      -0.010000  9.760000  9.760000       -0.028    1.578000   
3      -0.012000  9.756000  9.756000       -0.030    1.576000   
4      -0.012000  9.756000  9.756000       -0.030    1.578000   
...          ...       ...       ...          ...         ...   
381461 -0.009288  8.710410  8.372109       -0.034    1.468000   
381462 -0.008132  8.713342  8.406711       -0.034    1.466132   
381463 -0.009825  8.705223  8.362795       -0.034    1.467825   
381464 -0.009911  8.704800  8.361511       -0.034    1.468000   
381465 -0.008126  8.713372  8.406859       -0.034    1.466126   

        Oil_temperature  Flowmeter  Motor_current  COMP  DV_eletric  Towers  \
0             63.350000  19.049625         3.9550     1           0       1   
1             63.250000  19.049625         4.0275     1      

In [5]:
from sklearn.preprocessing import StandardScaler

# Normalize the features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [6]:

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [7]:

# LSTM expects input data in a specific format (samples, time steps, features)
# Reshape the data for LSTM
X_train_reshaped = np.array(X_train).reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))
print(X_train_reshaped.shape)
print(X_test_reshaped.shape)

(14760924, 1, 20)
(6326111, 1, 20)


In [8]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.models import load_model

from keras.layers import LSTM
from keras.callbacks import EarlyStopping

# Build the LSTM model
model = Sequential()


# First LSTM layer
model.add(LSTM(units=100, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]), return_sequences=True))
model.add(Dropout(0.2))

# Second LSTM layer
model.add(LSTM(units=100, return_sequences=False))  # set return_sequences=False
model.add(Dropout(0.2))


# Output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train_reshaped,
                     y_train, 
                     epochs=100
                     , batch_size=32
                     , validation_data=(X_test_reshaped, y_test),
                     callbacks=[early_stopping],
                       verbose=1)


# Save the model
model.save("LSTM_LSTM_binary.h5")

# Load the model
loaded_model = load_model("LSTM_LSTM_binary.h5")

# Evaluate the loaded model
loss, accuracy = loaded_model.evaluate(X_test_reshaped, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")




Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


  saving_api.save_model(


Test Accuracy: 90.87%


In [9]:
# Print the summary of the model
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 100)            48400     
                                                                 
 dropout (Dropout)           (None, 1, 100)            0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 128901 (503.52 KB)
Trainable params: 128901 (503.52 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
