# Water Quality Prediction - Week 1
## Project Setup and Data Analysis
### 1. Importing Required Libraries

In [9]:
# Data Handling
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Settings
%matplotlib inline
plt.style.use('ggplot')
pd.set_option('display.max_columns', None)

print("All required libraries are imported!")

All required libraries are imported!


### 2. Data Loading and Initial Exploration

In [10]:
# Load the dataset
# Note: Please upload your dataset to the 'data' folder and update the filename below
file_path = 'C:/Users/shali/Desktop/aicte internship folder/afa2e701598d20110228.csv' # Update this with your actual file name
try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully!")
    display(df.head())
    print("\nDataset Info:")
    df.info()
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please make sure the file exists.")
    print("Typical column names in water quality datasets might include:")
    print("- pH, Hardness, Solids, Chloramines, Sulfate, Conductivity, Organic_carbon, "
          "Trihalomethanes, Turbidity, Potability")

Dataset loaded successfully!


Unnamed: 0,id;date;NH4;BSK5;Suspended;O2;NO3;NO2;SO4;PO4;CL
0,1;17.02.2000;0.33;2.77;12;12.3;9.5;0.057;154;0...
1,1;11.05.2000;0.044;3;51.6;14.61;17.75;0.034;35...
2,1;11.09.2000;0.032;2.1;24.5;9.87;13.8;0.173;41...
3,1;13.12.2000;0.17;2.23;35.6;12.4;17.13;0.099;2...
4,1;02.03.2001;0;3.03;48.8;14.69;10;0.065;281.6;...



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 1 columns):
 #   Column                                            Non-Null Count  Dtype 
---  ------                                            --------------  ----- 
 0   id;date;NH4;BSK5;Suspended;O2;NO3;NO2;SO4;PO4;CL  2861 non-null   object
dtypes: object(1)
memory usage: 22.5+ KB


### 3. Data Preprocessing

In [None]:
def preprocess_data(df):
    # Make a copy of the dataframe
    df_clean = df.copy()
    
    # 1. Convert date column to datetime if it exists
    if 'date' in df_clean.columns:
        df_clean['date'] = pd.to_datetime(df_clean['date'], format='%d.%m.%Y')
    
    # 2. Convert all columns to numeric, coercing errors to NaN
    for col in df_clean.select_dtypes(include=['object']).columns:
        if col != 'date':  # Skip date column
            df_clean[col] = pd.to_numeric(df_clean[col].str.replace(',', '.'), errors='coerce')
    
    # 3. Handle missing values
    print("\nMissing values before imputation:")
    print(df_clean.isnull().sum())
    
    # 4. Impute missing values with mean for numeric columns
    numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns
    if len(numeric_cols) > 0:
        imputer = SimpleImputer(strategy='mean')
        df_clean[numeric_cols] = imputer.fit_transform(df_clean[numeric_cols])
    else:
        print("No numeric columns found for imputation")
    
    # 5. Check for duplicates
    print(f"\nNumber of duplicate rows: {df_clean.duplicated().sum()}")
    
    # 6. Basic statistics
    print("\nBasic Statistics:")
    display(df_clean.describe())
    
    return df_clean

# First, let's load the data correctly
file_path = r'C:\Users\shali\Desktop\aicte internship folder\afa2e701598d20110228.csv'
try:
    # Load with semicolon delimiter and handle decimal commas
    df = pd.read_csv(file_path, sep=';', decimal=',')
    print("Dataset loaded successfully!")
    print("\nFirst few rows of the dataset:")
    display(df.head())
    print("\nDataset Info:")
    display(df.info())
    
    # Now apply preprocessing
    df_clean = preprocess_data(df)
    
except Exception as e:
    print(f"Error: {str(e)}")
    print("Please check the file path and format.")


Missing values before imputation:
id;date;NH4;BSK5;Suspended;O2;NO3;NO2;SO4;PO4;CL    0
dtype: int64


ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: '1;17.02.2000;0.33;2.77;12;12.3;9.5;0.057;154;0.454;289.5'

### 4. Exploratory Data Analysis (EDA)

In [None]:
def perform_eda(df):
    # 1. Check target variable distribution
    if 'Potability' in df.columns:
        plt.figure(figsize=(8, 6))
        sns.countplot(x='Potability', data=df)
        plt.title('Distribution of Water Potability')
        plt.show()
    
    # 2. Correlation matrix
    plt.figure(figsize=(12, 10))
    corr = df.corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix')
    plt.show()
    
    # 3. Distribution of numerical features
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numerical_cols:
        plt.figure(figsize=(8, 6))
        sns.histplot(data=df, x=col, kde=True)
        plt.title(f'Distribution of {col}')
        plt.show()

# Perform EDA if dataset is loaded and preprocessed
if 'df_clean' in locals():
    perform_eda(df_clean)

### 5. Data Preparation for Modeling

In [None]:
def prepare_data(df, target_col='Potability', test_size=0.2, random_state=42):
    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

# Prepare data if dataset is loaded and preprocessed
if 'df_clean' in locals() and 'Potability' in df_clean.columns:
    X_train, X_test, y_train, y_test, scaler = prepare_data(df_clean)
    print("Data preparation complete!")
    print(f"Training set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")

### Next Steps
1. Upload your water quality dataset to the 'data' folder
2. Update the file path in the 'Data Loading' section
3. Run each cell sequentially to perform the analysis
4. For Week 2, we'll focus on model building and evaluation