In [1]:
# Mount Google Drive into the Colab filesystem; the dataset CSVs read and
# written later in this notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the pre-split train and test CSVs from Drive, in that order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/water_potability_prediction/dataset/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/water_potability_prediction/dataset/test.csv',
    )
)

In [4]:
# Separate the training target ('Potability') from the feature columns.
y_train = df_train['Potability']
X_train = df_train.drop(columns=['Potability'])

In [5]:
# Separate the test target ('Potability') from the feature columns.
y_test = df_test['Potability']
X_test = df_test.drop(columns=['Potability'])

## Using KNN imputer to impute missing values in train and test set

In [6]:
from sklearn.impute import KNNImputer

In [7]:
# Fill missing feature values from the 3 nearest neighbours, weighted by distance.
# The imputer is fitted on the training features only and then reused for the
# test features, so the test set never influences the imputation model.
# NOTE(review): features are not scaled at this point, so neighbour distances
# are presumably dominated by the largest-magnitude columns — confirm this is intended.
imputer = KNNImputer(n_neighbors=3, weights="distance")
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [8]:
# Wrap the imputed values back into DataFrames that carry the original
# feature column names, then preview the first rows of the training frame.
X_train_imputed = pd.DataFrame(data=X_train_imputed, columns=X_train.columns)
X_test_imputed = pd.DataFrame(data=X_test_imputed, columns=X_test.columns)
X_train_imputed.head(5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,10.933111,162.424918,18846.634913,7.085261,349.717899,593.725764,14.977233,60.69058,3.894989
1,6.028101,262.808478,17150.10055,9.37839,380.30135,514.545672,20.637599,76.529077,4.080786
2,5.575921,223.374007,14553.132308,9.689175,234.609808,252.968328,12.249489,59.523003,4.294448
3,6.320428,210.310043,11682.229317,6.968423,311.777822,375.964558,11.973422,62.960463,4.553381
4,7.851926,235.338312,14763.580113,7.741975,338.603322,505.665639,11.86337,49.598381,5.204044


In [9]:
# Sanity check: per-column count of values still missing in the imputed test features.
X_test_imputed.isna().sum()

Unnamed: 0,0
ph,0
Hardness,0
Solids,0
Chloramines,0
Sulfate,0
Conductivity,0
Organic_carbon,0
Trihalomethanes,0
Turbidity,0


## Oversampling the training data to remove the class imbalance

In [10]:
!pip install imblearn



In [11]:
# Balance the training classes with SMOTE (fixed seed for reproducibility).
# Only the training split is resampled; the test split is left untouched.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
resampled_features, resampled_target = smote.fit_resample(X_train_imputed, y_train)
X_train_imputed, y_train = resampled_features, resampled_target

## Save the data in CSV format

In [12]:
# Put each split's target column back next to its (unscaled) imputed features.
train_parts = [X_train_imputed, y_train]
test_parts = [X_test_imputed, y_test]
train_imputed = pd.concat(train_parts, axis='columns')
test_imputed = pd.concat(test_parts, axis='columns')

In [13]:
# (rows, columns) of the combined train and test frames, in that order.
tuple(frame.shape for frame in (train_imputed, test_imputed))

((3216, 10), (655, 10))

In [14]:
# Preview the first five rows of the combined training frame.
train_imputed.head(5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,10.933111,162.424918,18846.634913,7.085261,349.717899,593.725764,14.977233,60.69058,3.894989,0
1,6.028101,262.808478,17150.10055,9.37839,380.30135,514.545672,20.637599,76.529077,4.080786,1
2,5.575921,223.374007,14553.132308,9.689175,234.609808,252.968328,12.249489,59.523003,4.294448,0
3,6.320428,210.310043,11682.229317,6.968423,311.777822,375.964558,11.973422,62.960463,4.553381,1
4,7.851926,235.338312,14763.580113,7.741975,338.603322,505.665639,11.86337,49.598381,5.204044,1


In [20]:
# Class balance of the training target after oversampling.
potability_labels = train_imputed['Potability']
potability_labels.value_counts()

Unnamed: 0_level_0,count
Potability,Unnamed: 1_level_1
0,1608
1,1608


In [15]:
# Persist the unscaled, imputed splits to Drive (row index is not written).
for frame, csv_path in (
    (train_imputed, '/content/drive/MyDrive/Colab Notebooks/water_potability_prediction/dataset/train_imputed.csv'),
    (test_imputed, '/content/drive/MyDrive/Colab Notebooks/water_potability_prediction/dataset/test_imputed.csv'),
):
    frame.to_csv(csv_path, index=False)

### Tree-based algorithms such as XGBoost and DecisionTree are insensitive to feature scaling, so the data above was saved without scaling for them. Algorithms such as logistic regression and SVM require scaled features, so we scale the data again and save a separate copy for them.

In [16]:
from sklearn.preprocessing import StandardScaler
X_train_imputed_scaled = StandardScaler().fit_transform(X_train_imputed)
X_test_imputed_scaled = StandardScaler().fit_transform(X_test_imputed)


In [17]:
# Rebuild labelled DataFrames from the scaled arrays, then append each split's target column.
scaled_train_features = pd.DataFrame(X_train_imputed_scaled, columns=X_train.columns)
scaled_test_features = pd.DataFrame(X_test_imputed_scaled, columns=X_test.columns)

train_imputed_scaled = pd.concat([scaled_train_features, y_train], axis='columns')
test_imputed_scaled = pd.concat([scaled_test_features, y_test], axis='columns')

In [18]:
# Preview the first five rows of the scaled training frame.
train_imputed_scaled.head(5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,2.678349,-1.031922,-0.374221,-0.024,0.420112,2.112333,0.207182,-0.401433,-0.107704,0
1,-0.699865,2.094295,-0.56433,1.451112,1.21889,1.11412,1.966315,0.63697,0.138823,1
2,-1.011294,0.866199,-0.85534,1.651032,-2.586279,-2.183553,-0.640549,-0.477982,0.422321,0
3,-0.498532,0.459351,-1.177046,-0.099159,-0.570806,-0.632955,-0.726345,-0.252615,0.765888,1
4,0.556253,1.2388,-0.831758,0.398447,0.129822,1.00217,-0.760547,-1.128659,1.629224,1


In [19]:
# Persist the scaled splits to Drive alongside the unscaled ones (row index is not written).
for scaled_frame, scaled_csv_path in (
    (train_imputed_scaled, '/content/drive/MyDrive/Colab Notebooks/water_potability_prediction/dataset/train_imputed_scaled.csv'),
    (test_imputed_scaled, '/content/drive/MyDrive/Colab Notebooks/water_potability_prediction/dataset/test_imputed_scaled.csv'),
):
    scaled_frame.to_csv(scaled_csv_path, index=False)

In [21]:
# Class balance of the target column in the scaled training frame.
scaled_potability_labels = train_imputed_scaled['Potability']
scaled_potability_labels.value_counts()

Unnamed: 0_level_0,count
Potability,Unnamed: 1_level_1
0,1608
1,1608
