In [22]:
# For this practical example we will need the following libraries and modules
import numpy as np
import pandas as pd

### Import df
Load the NearMiss-undersampled dataset. It has not been preprocessed (scaled) yet.

In [23]:
# Load the dataset from df1.csv (the path is relative to the current working directory).
df1 = pd.read_csv("df1.csv")

In [24]:
# Preview the first five rows to eyeball the columns and value ranges.
df1.head(n=5)

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,0.93,20.77,0.02,3.23,0.004,4.0,0.71,1.56,0.63,0.0,...,0.142,6.29,1.62,0.007,46.27,1.77,0.06,0.03,0.01,0
1,0.1,24.03,0.07,0.45,0.05,0.15,0.02,0.57,0.61,0.13,...,0.117,6.74,0.3,0.003,1.74,0.8,0.02,0.07,0.02,0
2,0.3,14.51,0.05,2.01,0.004,0.41,0.36,0.26,1.41,0.34,...,0.05,7.85,1.16,0.003,1.01,0.63,0.03,0.19,0.04,0
3,0.04,6.2,0.06,0.28,0.04,0.59,0.02,0.33,0.82,0.29,...,0.093,1.51,1.0,0.004,1.4,0.03,0.03,0.03,0.05,0
4,0.08,14.36,0.05,1.31,0.05,0.08,0.04,0.26,1.41,0.34,...,0.05,7.85,1.16,0.003,0.33,0.11,0.03,0.06,0.04,0


#### Split into features and labels and check shape

In [25]:
# Labels: the is_safe target column.
y = df1["is_safe"]
# Features: every remaining column once the target is removed.
X = df1.drop(columns=["is_safe"])

#### Split into train and test sets.

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=22,
)

#### Preprocess data (standardize the features)

In [28]:
# Import the scaling module
from sklearn.preprocessing import StandardScaler

# Create the scaler object; it is fitted on the training split in the next cell
# and then reused, unchanged, to transform the test split.
scaler = StandardScaler()

In [29]:
# Fit the scaler on the training features and transform them in one step.
# The scaled values are wrapped back into a DataFrame so the column names
# and the original row index are preserved.
X_train_scaled = pd.DataFrame(
    data=scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X.columns,
)

In [30]:
# Report the shapes of the scaled training features and the training targets;
# the row counts should match.
print(
    f"Shape of X_train_scaled a shape of {X_train_scaled.shape}\n",
    f"Shape of 'y_train' targets a shape of {y_train.shape}",
    sep="\n",
)

Shape of X_train_scaled a shape of (1459, 20)

Shape of 'y_train' targets a shape of (1459,)


In [31]:
# Class balance of the full dataset (y is the is_safe column of df1).
y.value_counts()

0    912
1    912
Name: is_safe, dtype: int64

In [32]:
# Apply the scaler fitted on the training split to the test features (no refit),
# keeping the column names and the test rows' original index.
X_test_scaled = pd.DataFrame(
    data=scaler.transform(X_test),
    index=X_test.index,
    columns=X.columns,
)

## Build Models

In [33]:
# Models
# import torch
# import torch.nn as nn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Metrics
from sklearn.metrics import accuracy_score, classification_report, roc_curve

# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Confirmation that every import in this cell succeeded.
print("Packages imported...")

Packages imported...


In [45]:
# Build a linear-kernel support vector classifier, then train it on the
# standardized training split.
clf = SVC(kernel="linear", C=1, random_state=42)
clf.fit(X_train_scaled, y_train)

# Predict labels for the held-out test rows.
y_pred = clf.predict(X_test_scaled)

# Report the fraction of test rows that were classified correctly.
print(
    "The test accuracy score of SVM is ",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

The test accuracy score of SVM is  0.8958904109589041


In [35]:

# # instantiating the object
# svm = SVC()

# # setting a grid - not so extensive
# parameters = {"C":np.arange(1,10,1),'gamma':[0.00001,0.00005, 0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5,1,5]}

# # instantiating the GridSearchCV object
# searcher = GridSearchCV(svm, parameters)

# # fitting the object
# searcher.fit(X_train_scaled, y_train)

# # the scores
# print("The best params are :", searcher.best_params_)
# print("The best score is   :", searcher.best_score_)


In [36]:

# # predicting the values
# y_pred = searcher.predict(X_test_scaled)

# # printing the test accuracy
# print("The test accuracy score of SVM after hyper-parameter tuning is ", accuracy_score(y_test, y_pred))

### Train the Logistic Regression model

In [37]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
reg = LogisticRegression()

In [38]:
# Train on the standardized training split.
reg.fit(X=X_train_scaled, y=y_train)

LogisticRegression()

In [39]:
# Mean accuracy on the data the model was trained on.
reg.score(X=X_train_scaled, y=y_train)

0.8752570253598355

### Test the Logistic Regression model

In [40]:
# Mean accuracy on the held-out test split.
reg.score(X=X_test_scaled, y=y_test)

0.8876712328767123

### Train the Gradient Boosting Classifier model

In [43]:
# Gradient boosting with 300 depth-1 trees. Each tree is fitted on a random
# 80% subsample of the training rows and considers a 20% fraction of the
# features per split; the seed fixes that randomness.
gbt = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=1,
    subsample=0.8,
    max_features=0.2,
    random_state=22,
)

# Train on the standardized training split.
gbt.fit(X_train_scaled, y_train)


GradientBoostingClassifier(max_depth=1, max_features=0.2, n_estimators=300,
                           random_state=22, subsample=0.8)

In [44]:
# Predict labels for the held-out test rows and report the accuracy.
y_pred = gbt.predict(X_test_scaled)
print(
    "The test accuracy score of Gradient Boosting Classifier is ",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

The test accuracy score of Gradient Boosting Classifier is  0.9095890410958904
