In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from warnings import filterwarnings
filterwarnings('ignore')

In [4]:
data=pd.read_csv("../input/water-potability/water_potability.csv")

In [5]:
data.shape

In [6]:
data.head(10)

In [7]:
data.info()

# Missing Values

In [8]:
df=data.copy()

In [9]:
df.isnull().sum()

In [10]:
for i in range(len(df)):
    if pd.isnull(df['ph'][i])==True:
        if ((df['Hardness'][i]<=150) & (df['Potability'][i]==0)):
            df['ph'][i]=df[(df['Potability']==0) & (df['Hardness']<=150)]['ph'].mean()
            
        if ((df['Hardness'][i]>150) & (df['Potability'][i]==0)):
            df['ph'][i]=df[(df['Potability']==0) & (df['Hardness']>150)]['ph'].mean()
            
        if ((df['Hardness'][i]<=150) & (df['Potability'][i]==1)):
            df['ph'][i]=df[(df['Potability']==1) & (df['Hardness']<=150)]['ph'].mean()
            
        if ((df['Hardness'][i]>150) & (df['Potability'][i]==1)):
            df['ph'][i]=df[(df['Potability']==1) & (df['Hardness']>150)]['ph'].mean()

In [11]:
for i in range (0,len(df)):
    if (pd.isnull(df['Sulfate'][i]) == True):
        if (df['Potability'][i]==0):
            df['Sulfate'][i]=df[(df['Potability']==0)][['Sulfate']].mean()
        else:
             df['Sulfate'][i]=df[(df['Potability']==1)][['Sulfate']].mean()

In [12]:
df['Trihalomethanes'].fillna(value = df['Trihalomethanes'].mean(), inplace = True)

In [13]:
df.isnull().sum()

In [14]:
df.head(10)

# EDA

In [15]:
wp=df.copy()

In [16]:
wp.Potability = wp.Potability.replace([0, 1], ['No', 'Yes'])

In [17]:
plt.figure(figsize=(5,4))
sns.histplot(x='Potability', data=wp)
plt.title('Potabiliy distribution')
plt.show()

In [18]:
fig, ax = plt.subplots(3, 3, figsize=(25,15))
row, col = 0, 0

for i in range(len(wp.columns[:-1])):
    sns.histplot(wp, x=wp.columns[i], hue="Potability", element="poly", stat="count",  
                  palette='magma', ax=ax[row][col])
    col+=1
    if col==3:
        row+=1
        col=0

In [19]:
plt.scatter(df.ph, df.Hardness, c=df.Potability, cmap='prism')

# Model

In [20]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [21]:
X = df.drop(['Potability'], axis = 1)
Y = df['Potability']

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=65)

In [23]:
models=[LogisticRegression(), SVC(), LinearSVC(), RandomForestClassifier(), KNeighborsClassifier(), 
        GaussianNB(), SGDClassifier(), DecisionTreeClassifier(), AdaBoostClassifier(), GradientBoostingClassifier()]

In [24]:
results={}

In [25]:
for model in models:
    m=model.fit(X_train, Y_train)
    Y_pred=m.predict(X_test)
    results[type(model).__name__]=accuracy_score(Y_test, Y_pred)

In [26]:
results=pd.Series(results)

In [27]:
results.sort_values(ascending=False)

# Model Tuning

In [28]:
def tuning(model, params, X_train=X_train, X_test=X_test, Y_train=Y_train, Y_test=Y_test):
    clf=GridSearchCV(model, params, cv=5, return_train_score=True)
    clf.fit(X_train, Y_train)
    Y_test_pred=clf.predict(X_test)
    acc_score=accuracy_score(Y_test, Y_test_pred)
    print(type(model).__name__)
    print(f'Accuracy Score: {np.round(acc_score*100,2)}%')
    print(f'Best params: {clf.best_params_}')

In [29]:
params = {'n_estimators': [50, 60, 70, 80, 90, 95, 98, 99, 100, 102], 'criterion': ['entropy'], 'min_samples_leaf':[50, 70, 90, 110, 130, 150, 170, 190]}
tuning(RandomForestClassifier(), params)

In [30]:
params = {'learning_rate': [0.1, 0.2, 0.3, 0.01], 'n_estimators': list(range(95, 102)), 'criterion': ['friedman_mse', 'mse', 'mae']}
tuning(GradientBoostingClassifier(), params)

In [31]:
params = {'n_estimators': list(range(40,60)), 'algorithm': ['SAMME', 'SAMME.R']}
tuning(AdaBoostClassifier(), params)

In [32]:
params = {'criterion': ['gini','entropy'], 'splitter': ['best', 'random']}
tuning(DecisionTreeClassifier(), params)

In [33]:
l=[[1,2,3,4,5],
   [6,7,8,9,10],
   [11,12,13,14,15],
   [16,17,18,19,20],
   [21,22,23,24,25]]


In [67]:
row = 5
col = 5
for i in range(0, row*col):
    row = i//5
    col = i%5
    #print(row, col)
    print(l[row][col-1], end=" ")

5 5 5 5 5 10 10 10 10 10 15 15 15 15 15 20 20 20 20 20 25 25 25 25 25 