<a href="https://www.kaggle.com/code/alihassanshahid/water-quality-monitoring-ml-model?scriptVersionId=197749085" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Water Quality Monitoring using Supervised Machine Learning
# 1.1 Intro

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the water-quality CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/water-quality-monitoring-dataset/brisbane_water_quality.csv",
)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

# 1.2 Handle Missing Values

In [None]:
# Per-column count of missing entries, largest first, keeping only the
# columns that actually contain at least one missing value.
missing_values = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
print(missing_values)

# Rows with any missing value are discarded; df.fillna(value) would be the
# alternative if the gaps should be filled instead.
df = df.dropna()


# 1.3 Detecting Outliers

In [None]:
from scipy import stats

# Z-scores of every numeric column (how many standard deviations each value
# lies from its column mean).
z_scores = stats.zscore(df.select_dtypes(include='number'))

# A value counts as an outlier when it is more than 3 standard deviations
# away from the mean in EITHER direction. The absolute value is essential:
# comparing the raw z-score against 3 would miss unusually low readings
# (z < -3) and only report unusually high ones.
outliers = (np.abs(z_scores) > 3).sum(axis=1)

# Keep every row that has at least one outlying value and report it.
outlier_rows = df[outliers > 0]
print(outlier_rows)

# NOTE: the outliers are only reported here; df itself is left unchanged.
# The boxplots in section 1.4.5 give a visual view of the same outliers.


# 1.4 Data Visualizations

## 1.4.1 Polar plot for water direction and speed

In [None]:
# Scatter of water direction (angle) against water speed (radius) on polar
# axes, restricted to the first 1000 rows to keep the plot readable.
df_sample = df.head(1000)

ax = plt.figure(figsize=(10, 8)).add_subplot(111, projection='polar')

# The direction column is converted from degrees to radians for the polar axes.
speeds = df_sample['Average Water Speed']
angles = np.deg2rad(df_sample['Average Water Direction'])

# Speed drives both the radial position and the colour of each point.
ax.scatter(angles, speeds, c=speeds, cmap='coolwarm', alpha=0.75)
ax.set_title('Polar Plot of Water Direction and Speed', fontsize=16)
plt.show()


## 1.4.2 Violin plot comparing 'Chlorophyll' values based on quality

In [None]:
# Distribution of 'Chlorophyll' for each value of the 'Chlorophyll [quality]'
# column, drawn as violins on a fresh 12x8 figure.
plt.figure(figsize=(12, 8))
sns.violinplot(
    data=df,
    x='Chlorophyll [quality]',
    y='Chlorophyll',
    split=True,
).set_title('Violin Plot of Chlorophyll Distribution by Quality', fontsize=16)
plt.show()


## 1.4.3 Network Graph (In Progress)

In [None]:
from itertools import combinations

import networkx as nx

# Exclude the timestamp, record-number and chlorophyll-quality columns from
# the correlation calculation.
numeric_df = df.drop(columns=['Timestamp', 'Record number', 'Chlorophyll [quality]'])

# Pairwise correlation between the remaining columns.
corr_matrix = numeric_df.corr()

# Build a graph with one node per column of the correlation matrix.
G = nx.Graph()
G.add_nodes_from(corr_matrix.columns)

# Connect each pair of columns whose correlation is strong in either
# direction (|r| > threshold). The signed correlation is stored as the edge
# weight. Each unordered pair is visited once; the matrix is symmetric.
threshold = 0.6
labels = corr_matrix.columns
for i, j in combinations(range(len(labels)), 2):
    weight = corr_matrix.iloc[i, j]
    if abs(weight) > threshold:
        G.add_edge(labels[i], labels[j], weight=weight)

# Draw the graph.
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G)  # Positions the nodes
edges = G.edges(data=True)

# Line width encodes the STRENGTH of the correlation. abs() is required
# because strong negative correlations pass the threshold too, and a negative
# weight would otherwise produce a negative (invalid) line width.
edge_widths = [abs(d['weight']) * 5 for (_, _, d) in edges]

# Nodes, labels and edges are drawn in a single call so that each edge is
# rendered exactly once with its weighted width.
nx.draw(
    G,
    pos,
    with_labels=True,
    node_color='lightblue',
    node_size=3000,
    font_size=10,
    width=edge_widths,
    edge_color='grey',
)
plt.title('Correlation Matrix as a Network Graph', size=16)
plt.show()


## 1.4.4 Histogram for numeric values

In [None]:
# Histograms (20 bins each) for the numeric columns, based on the first
# 500 rows of the data.
df_sample = df.head(500)
df_sample.hist(bins=20, figsize=(16, 12))

# Tighten the spacing between the subplots before showing the figure.
plt.tight_layout()
plt.show()


## 1.4.5 Boxplot to visualize outliers (In Progress)

In [None]:
# Boxplots of every column except 'Timestamp' and 'Record number', used to
# eyeball outliers per variable.
plt.figure(figsize=(16, 10))
boxplot_data = df.drop(columns=['Timestamp', 'Record number'])
sns.boxplot(data=boxplot_data)

# Tilt the x tick labels by 45 degrees.
plt.xticks(rotation=45)
plt.show()

## 1.4.6 Time Series Analysis 

In [None]:
# Convert timestamp to datetime if it's not already.
# This must happen BEFORE the sample is taken: a sample created first would
# keep the old, unconverted 'Timestamp' column, and the x-axis would then be
# plotted from the raw values instead of real datetimes.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df_sample = df.head(500)

# Plot time-series for key variables (first 500 rows), one line per variable
# on a shared axis.
plt.figure(figsize=(14, 8))
for col in ['Temperature', 'Dissolved Oxygen', 'Chlorophyll', 'pH', 'Salinity']:
    plt.plot(df_sample['Timestamp'], df_sample[col], label=col)

plt.legend()
plt.title('Time Series of Key Environmental Variables')
plt.xlabel('Timestamp')
plt.ylabel('Value')
plt.show()


## 1.4.7 3D plot: Temperature vs Salinity vs Time

In [None]:
# Kept although unused by name: importing it registers the '3d' projection
# on older matplotlib releases.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# Use the first 1000 rows to prevent clutter. The explicit copy makes
# df_sample independent of df, so overwriting 'Timestamp' below neither
# modifies df nor triggers pandas' SettingWithCopyWarning.
df_sample = df.head(1000).copy()

# Convert Timestamp to a numerical value: seconds since the Unix epoch.
# Dividing the offset from the epoch by a one-second Timedelta is independent
# of the platform's integer width and of the datetime storage resolution,
# unlike casting the datetimes to int and dividing by 10**9.
timestamps = pd.to_datetime(df_sample['Timestamp'])
epoch = pd.Timestamp(0, tz=timestamps.dt.tz)  # matches naive or tz-aware data
df_sample['Timestamp'] = (timestamps - epoch) / pd.Timedelta(seconds=1)

# Drop rows that repeat the same (Timestamp, Salinity, Temperature) triple.
df_sample = df_sample.drop_duplicates(subset=['Timestamp', 'Salinity', 'Temperature'])

# Create the figure only once the data is ready, then draw the surface with
# time on x, salinity on y and temperature on z.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
ax.plot_trisurf(df_sample['Timestamp'], df_sample['Salinity'], df_sample['Temperature'], cmap='coolwarm', linewidth=0.2)

ax.set_title('3D Surface Plot: Temperature vs. Salinity vs. Time', fontsize=16)
ax.set_xlabel('Timestamp')
ax.set_ylabel('Salinity')
ax.set_zlabel('Temperature')
plt.show()


# 1.5 Supervised ML implementation