In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

In [284]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,6.150553,212.819377,15017.747281,5.544436,319.494952,279.447066,12.850506,75.028793,4.810537,0
1,,180.680598,8692.538152,8.671793,396.528754,534.339086,10.299787,87.652015,3.913602,0
2,5.344117,198.379144,13492.840835,6.559265,328.648845,591.363369,14.084388,61.392983,4.104699,1
3,7.647872,160.774353,29000.589712,7.217409,,438.800961,13.182501,67.099969,3.078673,1
4,8.129586,177.898967,27834.217696,9.006008,334.438539,397.021428,15.936262,81.390758,3.362537,0


In [285]:
print(df.shape)

for col in df.columns:
    df[col].astype(np.float64)
    print(col, "--", df[col].isnull().sum())

(2620, 10)
ph -- 386
Hardness -- 0
Solids -- 0
Chloramines -- 0
Sulfate -- 621
Conductivity -- 0
Organic_carbon -- 0
Trihalomethanes -- 125
Turbidity -- 0
Potability -- 0


In [286]:
df.corr()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
ph,1.0,0.108426,-0.104319,-0.031725,-0.005919,0.025069,0.045838,0.00446,-0.020666,-0.018887
Hardness,0.108426,1.0,-0.068997,-0.029451,-0.11941,-0.015325,0.0115,-0.025106,-0.006441,-0.008904
Solids,-0.104319,-0.068997,1.0,-0.072097,-0.165942,0.01445,0.007705,-0.004758,0.012661,0.03632
Chloramines,-0.031725,-0.029451,-0.072097,1.0,0.03242,-0.018341,-0.017692,0.015644,-0.00682,0.016667
Sulfate,-0.005919,-0.11941,-0.165942,0.03242,1.0,-0.020949,0.060243,-0.024933,-0.005274,-0.037177
Conductivity,0.025069,-0.015325,0.01445,-0.018341,-0.020949,1.0,0.040572,9.6e-05,0.004164,-0.008071
Organic_carbon,0.045838,0.0115,0.007705,-0.017692,0.060243,0.040572,1.0,-0.006994,-0.008384,-0.024479
Trihalomethanes,0.00446,-0.025106,-0.004758,0.015644,-0.024933,9.6e-05,-0.006994,1.0,-0.034581,0.009666
Turbidity,-0.020666,-0.006441,0.012661,-0.00682,-0.005274,0.004164,-0.008384,-0.034581,1.0,0.003172
Potability,-0.018887,-0.008904,0.03632,0.016667,-0.037177,-0.008071,-0.024479,0.009666,0.003172,1.0


In [441]:
X_cols = ["ph", "Solids", "Chloramines", "Sulfate", "Conductivity", "Organic_carbon", "Trihalomethanes", "Turbidity"]
Y_col = df.columns[-1]

In [442]:
from sklearn.impute import SimpleImputer

med_imputer = SimpleImputer(strategy="median", add_indicator=True)

In [443]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(df[X_cols], df[Y_col], random_state = 45, test_size = 0.1)

In [444]:
from sklearn.preprocessing import normalize

X_train_impute = med_imputer.fit_transform(X_train)
X_test_impute = med_imputer.transform(X_test)

# X_train_impute = normalize(med_imputer.fit_transform(X_train))
# X_test_impute = normalize(med_imputer.transform(X_test))

In [445]:
from sklearn.metrics import accuracy_score

def cal_acc(model):
    Y_pred = model.predict(X_test_impute)
    print(accuracy_score(Y_test, Y_pred))

In [446]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svm = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma="auto", degree=3, C=100))
svm.fit(X_train_impute, Y_train)
cal_acc(svm)

0.6221374045801527


In [447]:
log_res = make_pipeline(StandardScaler(), LogisticRegression())
log_res.fit(X_train_impute, Y_train)
cal_acc(log_res)

0.5763358778625954


In [448]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB, ComplementNB

x = make_pipeline(StandardScaler(), GaussianNB())
x.fit(X_train_impute, Y_train)
cal_acc(x)

0.6106870229007634


In [449]:
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

dtr = make_pipeline(StandardScaler(), DecisionTreeClassifier(criterion="gini", random_state= 48, splitter="random", min_samples_split=10, max_depth=12))
dtr.fit(X_train_impute, Y_train)
cal_acc(dtr)

0.5801526717557252


In [450]:
etr = make_pipeline(StandardScaler(), ExtraTreeClassifier(criterion="gini", random_state= 48, splitter="random", min_samples_split=10, max_depth=8))
etr.fit(X_train_impute, Y_train)
cal_acc(etr)

0.5916030534351145


In [451]:
from sklearn.ensemble import RandomForestClassifier

rfr = make_pipeline(StandardScaler(), RandomForestClassifier(min_samples_split=10))
rfr.fit(X_train_impute, Y_train)
cal_acc(rfr)

0.5916030534351145


In [452]:
import tensorflow as tf

model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(1, activation='sigmoid')
    # tf.keras.layers.SoftMax()
])
model.compile(optimizer=tf.keras.optimizers.SGD(),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])
model.fit(X_train_impute, Y_train)



<keras.src.callbacks.History at 0x235ef45bd30>

In [453]:
model.evaluate(X_test_impute, Y_test)



[nan, 0.572519063949585]

In [454]:
count = 0
for i in model.predict(X_test_impute):
    print(i)
    if i > 0.4175:
        count += 1

print(count)

[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan

In [455]:
test = pd.read_csv("test.csv")

test_impute = med_imputer.transform(test[X_cols])

def make_sol(model, file_name) :
    test["Potability"] = pd.DataFrame(model.predict(test_impute))
    test[["id", "Potability"]].to_csv(f"{file_name}.csv", index=False)

make_sol(svm, "svm_sub")

In [456]:
make_sol(x, "nb_sub")
make_sol(rfr, "rfr_sub")

In [457]:
with open("nn_sub.csv", "w") as f:
    k = 1
    f.write("id,Potability\n")
    for i in model.predict(test_impute):
        # print(i)
        f.write(str(k) + "," + ("1" if i > 0.4175 else "0") + "\n")
        k += 1



In [464]:
from sklearn.neural_network import MLPClassifier

mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(256, ), activation="relu", solver="adam", learning_rate="invscaling", power_t=0.5, max_iter=400))
mlp.fit(X_train_impute, Y_train)

cal_acc(mlp)

0.6030534351145038




In [463]:
make_sol(mlp, "mlp_sub")