In [1]:
# For this practical example we will need the following libraries and modules
import numpy as np
import pandas as pd

### Import df
Load the NearMiss-undersampled dataset (not yet preprocessed).

In [2]:
# Read the undersampled training data from the working directory.
df3 = pd.read_csv(filepath_or_buffer="df3.csv")

In [3]:
# Preview the first five rows to sanity-check the columns that were read.
df3.head(n=5)

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,0.93,20.77,0.02,3.23,0.004,4.0,0.71,1.56,0.63,0.0,...,0.142,6.29,1.62,0.007,46.27,1.77,0.06,0.03,0.01,0
1,0.08,14.36,0.05,1.31,0.05,0.08,0.04,0.26,1.41,0.34,...,0.05,7.85,1.16,0.003,0.33,0.11,0.03,0.06,0.04,0
2,0.03,10.53,0.06,1.79,0.01,0.08,0.01,0.07,1.06,0.0,...,0.011,8.32,0.29,0.007,1.85,0.64,0.02,0.1,0.08,0
3,0.1,24.03,0.07,0.45,0.05,0.15,0.02,0.57,0.61,0.13,...,0.117,6.74,0.3,0.003,1.74,0.8,0.02,0.07,0.02,0
4,0.01,13.48,0.04,0.17,0.04,0.02,0.1,0.26,1.41,0.34,...,0.05,8.18,0.74,0.003,0.3,0.25,0.03,0.06,0.04,0


#### Split into features and labels and check shape

In [4]:
# Target vector: the binary `is_safe` label.
y = df3["is_safe"]

# Feature matrix: every remaining column of df3.
# NOTE(review): the df3.head() preview lists an `Unnamed: 0` column and only
# `is_safe` is dropped here, so that column appears to stay in X as a
# feature -- confirm this is intended (and that test_df.csv matches).
X = df3.drop(columns=["is_safe"])

#### Preprocess data. 

In [5]:
# Import the scaling module
from sklearn.preprocessing import StandardScaler

# Create a scaler object with default settings; it is fitted in a later cell
# and the same fitted instance is reused to transform the test features.
scaler = StandardScaler()

In [6]:
# Fit the scaler on X and transform X in one call, then wrap the result back
# into a DataFrame that keeps the original column names and row index.
# NOTE(review): the scaler is fitted on every row of X, before the
# train/validation split, so validation rows contribute to the scaling
# statistics.
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    index=X.index,
    columns=X.columns,
)

In [7]:
# Report the feature-matrix and target shapes. The first message ends with a
# newline and `sep` adds another, which leaves a blank line between the two.
print(
    f"Shape of X_scaled a shape of {X_scaled.shape}\n",
    f"Shape of 'y' targets a shape of {y.shape}",
    sep="\n",
)

Shape of X_scaled a shape of (1408, 20)

Shape of 'y' targets a shape of (1408,)


In [8]:
# Count the rows per class to check the class balance after undersampling.
df3.loc[:, "is_safe"].value_counts()

0    704
1    704
Name: is_safe, dtype: int64

#### Split into train and test sets.

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the scaled rows as a validation set; the fixed seed keeps
# the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    train_size=.8,
    random_state=22,
)

#### Bring in the test dataset

In [11]:
# Read the separate test dataset from the working directory.
test_df = pd.read_csv(filepath_or_buffer="test_df.csv")

In [12]:
# Test labels: the `is_safe` column of the test dataset.
y_test = test_df["is_safe"]

# Test features: every other column of the test dataset.
X_test = test_df.drop(columns=["is_safe"])

In [13]:
# Transform (do NOT refit) the test features with the scaler that was fitted
# earlier, so they are scaled with the same statistics as the training data.
# The result is wrapped in a DataFrame that keeps X_test's columns and index.
X_test_scaled = pd.DataFrame(
    data=scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

## Build Models

In [14]:
# Models
# import torch
# import torch.nn as nn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Metrics
from sklearn.metrics import accuracy_score, classification_report, roc_curve

# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

print('Packages imported...')

Packages imported...


In [15]:
# Instantiate a linear-kernel SVM (C=1) and fit it on the training split.
clf = SVC(kernel='linear', C=1, random_state=22).fit(X_train,y_train)

# Predict labels for the validation split produced by train_test_split.
y_pred = clf.predict(X_val)

# This score is computed on X_val/y_val, so it is reported as validation
# accuracy to keep it distinct from the score on the separate test dataset.
print("The validation accuracy score of SVM is ", accuracy_score(y_val, y_pred))

The test accuracy score of SVM is  0.8936170212765957


In [26]:
# Predict labels for the scaled test dataset with the fitted SVM.
y_test_pred_svm = clf.predict(X_test_scaled)

# Report the accuracy of those predictions against the test labels.
print(
    "The test accuracy score of SVM is ",
    accuracy_score(y_true=y_test, y_pred=y_test_pred_svm),
)

The test accuracy score of SVM is  0.62


In [16]:

# # instantiating the object
# svm = SVC()

# # setting a grid - not so extensive
# parameters = {"C":np.arange(1,10,1),'gamma':[0.005,0.01,0.05,0.1]}

# # instantiating the GridSearchCV object
# searcher = GridSearchCV(svm, parameters)

# # fitting the object
# searcher.fit(X_train, y_train)

# # the scores
# print("The best params are :", searcher.best_params_)
# print("The best score is   :", searcher.best_score_)


In [17]:

# # predicting the values
# y_pred = searcher.predict(X_test_scaled)

# # printing the test accuracy
# print("The test accuracy score of SVM after hyper-parameter tuning is ", accuracy_score(y_test, y_pred))

### Train the Logistic Regression model

In [18]:
# Logistic regression classifier; no constructor arguments are overridden,
# so scikit-learn's defaults apply.
reg = LogisticRegression()

In [19]:
# Fit the logistic regression on the training split; fit() returns the
# estimator, which the notebook displays as the cell output.
reg.fit(X=X_train, y=y_train)

LogisticRegression()

In [20]:
# Accuracy on the data the model was trained on (baseline for overfitting).
reg.score(X=X_train, y=y_train)

0.8854351687388987

### Testing the model

In [21]:
# Accuracy on the held-out validation split.
reg.score(X=X_val, y=y_val)

0.8687943262411347

In [22]:
# Accuracy on the separate, scaled test dataset.
reg.score(X=X_test_scaled, y=y_test)

0.645625

### Train the Gradient Boosting Classifier model

In [23]:
# Configure the gradient boosting classifier: 300 depth-1 trees, each fitted
# on an 80% row subsample and considering 20% of the features per split.
gbt = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=1,
    subsample=0.8,
    max_features=0.2,
    random_state=42,
)

# Fit on the training split; the returned estimator is shown as cell output.
gbt.fit(X=X_train, y=y_train)


GradientBoostingClassifier(max_depth=1, max_features=0.2, n_estimators=300,
                           random_state=42, subsample=0.8)

In [24]:
# Predict labels for the validation split with the fitted gradient boosting
# model. The score is computed on X_val/y_val, so it is reported as validation
# accuracy to keep it distinct from the score on the separate test dataset.
y_pred = gbt.predict(X_val)
print("The validation accuracy score of Gradient Boosting Classifier is ", accuracy_score(y_val, y_pred))

The test accuracy score of Gradient Boosting Classifier is  0.9113475177304965


In [25]:
# Predict labels for the scaled test dataset and report their accuracy
# against the test labels.
y_test_pred = gbt.predict(X_test_scaled)
print(
    "The test accuracy score of Gradient Boosting Classifier is ",
    accuracy_score(y_true=y_test, y_pred=y_test_pred),
)

The test accuracy score of Gradient Boosting Classifier is  0.67375
