In [1]:
import numpy as np
import pandas as pd
import scipy as sp

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
# Remote CSV locations for the water-potability exercise: labelled training
# rows, unlabelled rows to predict, and the submission template.
trainData = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/waters/train.csv'
testData = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/waters/test.csv'
subData = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/waters/submission.csv'

In [3]:
# Download the three CSV files into DataFrames in one pass
# (order matters: training data, data to predict, submission template).
train, test, sub = (pd.read_csv(url) for url in (trainData, testData, subData))

In [4]:
# Impute every missing value with 0 in both the training and the test frame.
# Alternative that was tried and left disabled: per-column median imputation,
# i.e. frame.fillna(frame.median()).
train, test = (frame.fillna(0) for frame in (train, test))

In [5]:
# 'Potability' is the target; every remaining column is used as a feature.
y = train['Potability']
X = train.drop(columns=['Potability'])

In [6]:
# Repeated stratified 70/30 hold-out evaluation.
# For each of the 5 shuffles: grid-search a scaler + random-forest pipeline on
# the training part (accuracy, GridSearchCV's default inner CV), then report
# the best CV score/params and the F1 / accuracy of the selected model on the
# held-out part.

# The pipeline and the grid do not depend on the split, so build them once;
# GridSearchCV clones the estimator for every fit.
pipe = Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier())])
param = {
    'rf__criterion': ['gini', 'entropy'],
    'rf__max_depth': [1, 10, 100, 1000],
    # NOTE(review): searching over random_state picks a lucky seed rather than
    # a better model; consider fixing the seed and tuning real hyperparameters.
    'rf__random_state': [1, 10, 100, 1000],
}

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
for train_index, test_index in sss.split(X, y):
    # split() yields positional row numbers, so select by position (.iloc).
    # .loc treats them as index labels and only gives the same rows while the
    # frame still has its default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    grid = GridSearchCV(pipe, param, scoring='accuracy', verbose=1, n_jobs=2).fit(X_train, y_train)
    print(grid.best_score_, grid.best_params_)

    # Predict once and reuse the result for both metrics.
    fold_pred = grid.predict(X_test)
    print(f1_score(y_test, fold_pred), accuracy_score(y_test, fold_pred))
    print("\n")

Fitting 5 folds for each of 32 candidates, totalling 160 fits
0.6586709548696417 {'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__random_state': 1000}
0.2944162436548224 0.6463104325699746


Fitting 5 folds for each of 32 candidates, totalling 160 fits
0.6401185211655573 {'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__random_state': 1}
0.40350877192982454 0.6539440203562341


Fitting 5 folds for each of 32 candidates, totalling 160 fits
0.6477539047959382 {'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__random_state': 100}
0.36319612590799033 0.6653944020356234


Fitting 5 folds for each of 32 candidates, totalling 160 fits
0.6477702833489674 {'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__random_state': 10}
0.3282828282828283 0.6615776081424937


Fitting 5 folds for each of 32 candidates, totalling 160 fits
0.6624856687660994 {'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__random_state': 1000}
0.4089068825910931 0.628498727735369




In [7]:
# Standardise the features with ONE scaler: learn mean/std from the labelled
# features and apply that same transformation to the rows to be predicted.
# Fitting a separate scaler on `test` would standardise the two sets with
# different statistics, so the model would see shifted inputs at prediction time.
# Assumes `test` has the same feature columns (names and order) as `X`.
# NOTE(review): the scaler is fitted on all labelled rows before the hold-out
# split, so validation rows influence the scaling statistics.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
test = scaler.transform(test)

In [8]:
# Single 70/30 hold-out split (seeded) used by the baseline models below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [9]:
# Baseline: random forest trained on the hold-out training part;
# prints F1 followed by accuracy on the validation part.
rf = RandomForestClassifier(max_depth=100, random_state=2022)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(*(metric(y_test, y_pred) for metric in (f1_score, accuracy_score)))

0.40085287846481876 0.6424936386768448


In [10]:
# Baseline: single decision tree on the same split;
# prints F1 followed by accuracy on the validation part.
dt = DecisionTreeClassifier(max_depth=100, random_state=2022)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print(*(metric(y_test, y_pred) for metric in (f1_score, accuracy_score)))

0.4788273615635179 0.5928753180661578


In [11]:
# Baseline: logistic regression (liblinear solver) on the same split;
# prints F1 followed by accuracy on the validation part.
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(*(metric(y_test, y_pred) for metric in (f1_score, accuracy_score)))

0.006600660066006601 0.6170483460559797


In [12]:
# Predict the submission rows with the random-forest baseline and export them.
test_pred = rf.predict(test)
# NOTE(review): this writes to a column labelled with the integer 0. If the
# template read from CSV already has a prediction column (CSV headers load as
# strings), this adds a second column instead of filling it — confirm the
# expected submission layout.
sub[0] = test_pred
# Written to the current working directory, without the DataFrame index.
sub.to_csv('17033.csv', index=False)