In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
from sklearn import tree
import warnings
warnings.simplefilter("ignore")
from sklearn.metrics import accuracy_score

In [7]:
df = pd.read_csv("G:\Projects_data\sample_project1/water_potability.csv")
df.dropna(inplace =True)
df.reset_index(inplace = True, drop = True)
X = df.loc[:, df.columns != 'Potability']
y = df.Potability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
random_classifier = RandomForestClassifier(n_estimators =50)
random_classifier.fit(X_train,y_train)
y_pred = random_classifier.predict(X_test)
predictions_and_actual = pd.DataFrame(y_pred,y_test)
predictions_and_actual.reset_index(inplace = True)
predictions_and_actual.columns = ['predictions','Original']

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [9]:
forest_params = [{'max_depth': list(range(1, 5)), 'max_features': list(range(0,14))}]
rfc = RandomForestClassifier()
clf = GridSearchCV(rfc, forest_params, cv = 10, scoring='accuracy')

clf.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid=[{'max_depth': [1, 2, 3, 4],
                          'max_features': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                           12, 13]}],
             scoring='accuracy')

In [10]:
clf.best_params_

{'max_depth': 4, 'max_features': 6}

In [11]:
clf.best_score_

0.6436490683229814

In [12]:
preds = clf.best_estimator_.predict(X_test)
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
print (f'Train Accuracy - : {clf.score(X_train,y_train):.3f}')
print (f'Test Accuracy - : {clf.score(X_test,y_test):.3f}')

Train Accuracy - : 0.677
Test Accuracy - : 0.638


In [15]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 5 , stop = 15, num = 10)] # returns 10 numbers 

max_features = ['auto', 'log2']

max_depth = [int(x) for x in np.linspace(5, 10, num = 2)] 

max_depth.append(None)

bootstrap = [True, False]

r_grid = {'n_estimators': n_estimators,

               'max_features': max_features,

               'max_depth': max_depth,

               'bootstrap': bootstrap}

print(r_grid)


{'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 'max_features': ['auto', 'log2'], 'max_depth': [5, 10, None], 'bootstrap': [True, False]}


In [16]:
rfr_random = RandomizedSearchCV(estimator=rfc, param_distributions=r_grid, n_iter = 20,
                                scoring='accuracy', cv = 3, verbose=2, random_state=42, n_jobs=-1, return_train_score=True)

rfr_random.fit(X_train, y_train);

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [17]:
df.corr()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
ph,1.0,0.108948,-0.087615,-0.024768,0.010524,0.014128,0.028375,0.018278,-0.035849,0.01453
Hardness,0.108948,1.0,-0.053269,-0.022685,-0.108521,0.011731,0.013224,-0.0154,-0.034831,-0.001505
Solids,-0.087615,-0.053269,1.0,-0.051789,-0.162769,-0.005198,-0.005484,-0.015668,0.019409,0.040674
Chloramines,-0.024768,-0.022685,-0.051789,1.0,0.006254,-0.028277,-0.023808,0.01499,0.013137,0.020784
Sulfate,0.010524,-0.108521,-0.162769,0.006254,1.0,-0.016192,0.026776,-0.023347,-0.009934,-0.015303
Conductivity,0.014128,0.011731,-0.005198,-0.028277,-0.016192,1.0,0.015647,0.004888,0.012495,-0.015496
Organic_carbon,0.028375,0.013224,-0.005484,-0.023808,0.026776,0.015647,1.0,-0.005667,-0.015428,-0.015567
Trihalomethanes,0.018278,-0.0154,-0.015668,0.01499,-0.023347,0.004888,-0.005667,1.0,-0.020497,0.009244
Turbidity,-0.035849,-0.034831,0.019409,0.013137,-0.009934,0.012495,-0.015428,-0.020497,1.0,0.022682
Potability,0.01453,-0.001505,0.040674,0.020784,-0.015303,-0.015496,-0.015567,0.009244,0.022682,1.0


In [18]:
print(rfr_random.best_params_)

{'n_estimators': 15, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': False}


In [19]:
print (f'Train Accuracy - : {rfr_random.score(X_train,y_train):.3f}')
print (f'Test Accuracy - : {rfr_random.score(X_test,y_test):.3f}')

Train Accuracy - : 0.838
Test Accuracy - : 0.672
