In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load the raw water-potability CSV (adjust the relative path if the file lives elsewhere).
data = pd.read_csv('data/raw/water_potability 2.csv')

# First rows, to eyeball the columns and value ranges.
print(data.head())

# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
data.info()

# Summary statistics for the numeric columns.
print(data.describe())


In [None]:
# Count the missing values in each column of the raw frame.
missing_per_column = data.isna().sum()
print(missing_per_column)


In [None]:
# Drop, in place, every row that has at least one missing value.
data.dropna(axis=0, how='any', inplace=True)

# Sanity check: each per-column count printed here should now be zero.
print(data.isna().sum())

In [None]:
# Re-inspect dtypes and non-null counts after the null rows were dropped.
# info() prints its own report and returns None, so it is not wrapped in print()
# (doing so appended a stray "None" line to the output).
data.info()


In [47]:
# Feature engineering: standardize every predictor column, keep the target unscaled.
# NOTE(review): the scaler is fit on all rows, before the train/test split performed
# further down, so test rows contribute to the scaling statistics — confirm this is acceptable.
features = data.drop(columns='Potability')

scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

# Rebuild a DataFrame with the original predictor names, then append the target.
# The target is attached as a plain array so it is matched by position, not by index label.
data_scaled = pd.DataFrame(scaled_features, columns=features.columns).assign(
    Potability=data['Potability'].values
)

In [None]:
# Box plots: one panel per numeric column of `data`, laid out on a fixed-width grid.
numeric_columns = data.select_dtypes(include=['number']).columns
num_plots = len(numeric_columns)

# Grid shape: 5 panels per row and as many rows as needed (ceiling division).
cols = 5
rows = -(-num_plots // cols)

fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 4), constrained_layout=True)
axes = axes.ravel()

# Draw each column's box plot on its own axes.
for idx, col in enumerate(numeric_columns):
    panel = axes[idx]
    sns.boxplot(x=data[col], ax=panel)
    panel.set(title=f'Box Plot for {col}', xlabel=col)

# Delete the panels left over when the grid has more cells than columns to plot.
for spare in axes[idx + 1:]:
    fig.delaxes(spare)

fig.suptitle('Box Plots for Numerical Features', fontsize=16)
plt.show()


In [None]:
# IQR-based outlier filter, applied to the numeric columns one at a time.
numeric_columns = data.select_dtypes(include=['number']).columns

# Filter a copy so that `data` itself keeps every row.
clean_data = data.copy()

for col in numeric_columns:
    # Quartiles are computed on the rows that survived the previous columns' filters.
    column_values = clean_data[col]
    Q1 = column_values.quantile(0.25)
    Q3 = column_values.quantile(0.75)
    IQR = Q3 - Q1

    # Keep values inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; both ends are inclusive.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    clean_data = clean_data[column_values.between(lower_bound, upper_bound)]

# Report how many rows the filter would cost.
print("Original number of rows:", len(data))
print("Number of rows after outlier removal:", len(clean_data))


Too many rows are lost as outliers, so outliers will not be removed.

<h3>Univariate Analysis</h3>

In [None]:
# Histograms (with KDE overlay) of every scaled column except the last one,
# which is the 'Potability' target appended when data_scaled was built.
columns_to_plot = data_scaled.columns[:-1]
num_plots = len(columns_to_plot)

# Grid shape: 5 panels per row and as many rows as needed (ceiling division).
cols = 5
rows = -(-num_plots // cols)

fig, axes = plt.subplots(rows, cols, figsize=(cols*5, rows*4), constrained_layout=True)

# Work with a 1-D sequence of axes so panels can be addressed by plot index.
axes = axes.ravel()

for idx, col in enumerate(columns_to_plot):
    panel = axes[idx]
    sns.histplot(data_scaled[col], kde=True, ax=panel)
    panel.set(title=f'Distribution of {col}', xlabel=col)

# Delete the panels left over when the plot count is not a multiple of `cols`.
for spare in axes[idx + 1:]:
    fig.delaxes(spare)

fig.suptitle('Histograms of Numerical Features', fontsize=16)
plt.show()


<h3>Bivariate Analysis</h3>

In [None]:


# Compare the (standardized) pH distribution between the two potability classes.
fig, ax = plt.subplots()
sns.boxplot(data=data_scaled, x='Potability', y='ph', ax=ax)
ax.set_title("pH vs Water Potability")
plt.show()


<h3>Correlation Matrix</h3>

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of data_scaled
# (the target column is included).
correlations = data_scaled.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


<h2>Model Development</h2>

In [53]:
from sklearn.model_selection import train_test_split

# Predictors are every column except the target; the target is 'Potability'.
X = data_scaled.drop(columns='Potability')
y = data_scaled['Potability']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: a 100-tree random forest with a fixed seed, fit on the training split.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out split.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Rebuild predictors/target from the scaled frame and redo the same seeded 80/20 split.
X = data_scaled.drop(columns='Potability')
y = data_scaled['Potability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter grid: 3 x 4 x 3 = 36 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Seeded random forest used as the base estimator for the search.
rf_classifier = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation,
# using all available cores.
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning configuration and the estimator refit with it.
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_
print("Best Hyperparameters:", best_params)

# Evaluate the tuned forest on the held-out split.
predictions = best_model.predict(X_test)
print("Confusion Matrix:", confusion_matrix(y_test, predictions), sep="\n")
print("\nClassification Report:", classification_report(y_test, predictions), sep="\n")


In [None]:
import pandas as pd
import xgboost as xgb  # was missing: the cell used `xgb.XGBClassifier` and raised NameError
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

# Predictors / target from the preprocessed frame ('Potability' is the target column).
X = data_scaled.drop('Potability', axis=1)
y = data_scaled['Potability']

# Show the original distribution of the target variable
print("Original class distribution:")
print(y.value_counts())

# Split BEFORE resampling so the held-out set keeps the original class distribution;
# under-sampling the full data set first would also rebalance the test rows and
# make the reported metrics unrepresentative of real data.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

# Initialize the RandomUnderSampler with a fixed random_state for reproducibility
undersampler = RandomUnderSampler(random_state=42)

# Under-sample the training portion only.
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

# Check the new (training) class distribution after under-sampling
print("\nClass distribution after under-sampling:")
print(pd.Series(y_resampled).value_counts())

# Define a parameter grid for XGBoost: 3 x 3 x 3 x 2 = 54 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

# Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', 
                                   eval_metric='logloss', 
                                   use_label_encoder=False, 
                                   random_state=42)

# Set up GridSearchCV with 5-fold cross-validation, scored by accuracy, on all cores.
grid_search = GridSearchCV(estimator=xgb_classifier,
                           param_grid=param_grid,
                           cv=5,
                           scoring='accuracy',
                           n_jobs=-1, verbose=1)

# Fit the grid search model on the (under-sampled) training data
grid_search.fit(X_train, y_train)

# Extract the best hyperparameters and the estimator refit with them
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Hyperparameters:", best_params)

# Evaluate the best model on the untouched test split
predictions = best_model.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))
print("\nClassification Report:")
print(classification_report(y_test, predictions))


In [None]:
import joblib
from pathlib import Path
from sklearn.pipeline import make_pipeline

MODEL_PATH = 'models/final_water_quality_model.pkl'

# The target directory may not exist on a fresh checkout; create it so the dump cannot
# fail with FileNotFoundError.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# `model` was trained on StandardScaler-transformed features, so it cannot be applied to
# raw measurements by itself. Bundle the already-fitted scaler with the already-fitted
# model: the saved object still exposes .predict(), but now standardizes its input first.
# make_pipeline does not refit or clone the steps here; they are stored as they are.
# NOTE(review): `model` is the baseline random forest; the tuned estimators from the grid
# searches are bound to `best_model` — confirm which one is meant to be the "final" model.
joblib.dump(make_pipeline(scaler, model), MODEL_PATH)


In [64]:
import streamlit as st
import joblib
import pandas as pd

# Load the pre-trained model from the specified path.
# NOTE(review): Streamlit code is normally launched with `streamlit run <script>` rather
# than executed as a notebook cell — confirm how this cell is meant to be run.
# NOTE(review): the notebook trains on StandardScaler-transformed features while the
# widgets below collect raw measurements — confirm the saved artifact applies that scaling.
model = joblib.load('models/final_water_quality_model.pkl')

# Set the title of the dashboard
st.title('Water Potability Predictor')
st.write("Enter the water quality measurements below:")

# Create widgets for user input. Modify the default values as per your dataset's scale.
pH = st.slider('pH', min_value=0.0, max_value=14.0, value=7.0, step=0.1)
hardness = st.number_input('Hardness', value=200, step=1)
solids = st.number_input('Solids (ppm)', value=10000, step=100)
chloramines = st.number_input('Chloramines (ppm)', value=7.0, step=0.1)
sulfate = st.number_input('Sulfate (ppm)', value=300, step=1)
conductivity = st.number_input('Conductivity (µS/cm)', value=500, step=1)
organic_carbon = st.number_input('Organic Carbon (ppm)', value=10.0, step=0.1)
trihalomethanes = st.number_input('Trihalomethanes (ppm)', value=50.0, step=0.1)
turbidity = st.number_input('Turbidity (NTU)', value=4.0, step=0.1)

# Build a one-row DataFrame whose column names match the training columns.
# The pH column in the training data is spelled 'ph' (lower case); the previous 'pH' key
# made scikit-learn reject the frame because the feature names differed from those seen at fit.
input_data = pd.DataFrame({
    'ph': [pH],
    'Hardness': [hardness],
    'Solids': [solids],
    'Chloramines': [chloramines],
    'Sulfate': [sulfate],
    'Conductivity': [conductivity],
    'Organic_carbon': [organic_carbon],
    'Trihalomethanes': [trihalomethanes],
    'Turbidity': [turbidity]
})

# When the user clicks the Predict button, generate predictions using the model.
if st.button('Predict Potability'):
    prediction = model.predict(input_data)
    # Class 1 is reported as potable, anything else as non-potable.
    result = 'Potable' if prediction[0] == 1 else 'Non-Potable'
    st.success(f'The water is predicted to be: {result}')


