# Research Topic: Water Quality Monitoring System using Recurrent Neural Network (RNNs) and Internet of Things (IoT)


In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, r2_score


In [5]:
data = pd.read_csv("C:/Users/muham/OneDrive/Desktop/python_folder/Water_collection/brisbane_water_quality/brisbane_water_quality.csv")

In [6]:
data.head()

Unnamed: 0,Timestamp,Record number,Average Water Speed,Average Water Direction,Chlorophyll,Chlorophyll [quality],Temperature,Temperature [quality],Dissolved Oxygen,Dissolved Oxygen [quality],Dissolved Oxygen (%Saturation),Dissolved Oxygen (%Saturation) [quality],pH,pH [quality],Salinity,Salinity [quality],Specific Conductance,Specific Conductance [quality],Turbidity,Turbidity [quality]
0,2023-08-04 23:00:00,1468,4.834,73.484,1.621,,20.018,,7.472,,101.175,,8.176,,35.215,,53.262,,2.068,
1,2023-08-04 23:30:00,1469,2.544,106.424,1.959,,19.986,,7.455,,100.884,,8.175,,35.209,,53.254,,1.994,
2,2023-08-04 23:00:00,1470,1.26,156.755,1.62,,20.001,,7.43,,100.571,,8.171,,35.207,,53.252,,2.03,
3,2023-08-04 23:30:00,1471,0.76,281.754,1.761,,19.983,,7.419,,100.398,,8.171,,35.211,,53.257,,1.973,
4,2023-08-04 23:00:00,1472,3.397,244.637,1.635,,19.986,,7.429,,100.538,,8.171,,35.208,,53.253,,1.944,


In [7]:
# Identify non-numeric columns
non_numeric_columns = data.select_dtypes(exclude=[np.number]).columns
print(non_numeric_columns)

# Fill NaNs only in numeric columns
numeric_columns = data.select_dtypes(include=[np.number]).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())


Index(['Timestamp'], dtype='object')


In [8]:
# Drop rows with NaNs in the remaining columns
data_clean = data.dropna(subset=['Average Water Speed', 'Average Water Direction', 'Chlorophyll', 
                                 'Chlorophyll [quality]', 'Temperature', 'Temperature [quality]',
                                 'Dissolved Oxygen', 'Dissolved Oxygen [quality]', 
                                 'Dissolved Oxygen (%Saturation)', 'pH', 'Salinity', 
                                 'Specific Conductance', 'Turbidity'])

# Select relevant features and target
features = ['Average Water Speed', 'Average Water Direction', 'Chlorophyll', 
            'Chlorophyll [quality]', 'Temperature [quality]',
            'Dissolved Oxygen', 'Dissolved Oxygen [quality]', 
            'Dissolved Oxygen (%Saturation)', 'pH', 'Salinity', 
            'Specific Conductance', 'Turbidity']
target = 'Temperature'

# Prepare the feature and target arrays
X = data_clean[features].values
y = data_clean[target].values


In [9]:
# Normalize data
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()


In [10]:
# Check if normalization worked
print(np.any(np.isnan(X_scaled)))
print(np.any(np.isinf(X_scaled)))
print(np.any(np.isnan(y_scaled)))
print(np.any(np.isinf(y_scaled)))


False
False
False
False


In [11]:
# Create sequences
def create_sequences(X, y, time_steps):
    X_seq, y_seq = [], []
    for i in range(len(X) - time_steps):
        X_seq.append(X[i:i + time_steps])
        y_seq.append(y[i + time_steps])
    return np.array(X_seq), np.array(y_seq)


In [12]:
time_steps = 10
X_seq, y_seq = create_sequences(X_scaled, y_scaled, time_steps)


In [13]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(1))

# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-4, clipvalue=1.0), loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1)


  super().__init__(**kwargs)


Epoch 1/50
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 18ms/step - loss: 0.0723 - val_loss: 0.0294
Epoch 2/50
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 21ms/step - loss: 0.0286 - val_loss: 0.0181
Epoch 3/50
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 24ms/step - loss: 0.0210 - val_loss: 0.0164
Epoch 4/50
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 26ms/step - loss: 0.0194 - val_loss: 0.0162
Epoch 5/50
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 22ms/step - loss: 0.0189 - val_loss: 0.0155
Epoch 6/50
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 21ms/step - loss: 0.0177 - val_loss: 0.0156
Epoch 7/50
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 22ms/step - loss: 0.0176 - val_loss: 0.0146
Epoch 8/50
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 22ms/step - loss: 0.0167 - val_loss: 0.0143
Epoch 9/50
[1m773/773[

In [14]:

# Make predictions
y_pred_scaled = model.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1))

# Calculate metrics
mse = mean_squared_error(y_test_original, y_pred)
r2 = r2_score(y_test_original, y_pred)

print('Mean Squared Error:', mse)
print('R-Squared:', r2)


[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step
Mean Squared Error: 1.5743688965628602
R-Squared: 0.8420559043627323
