In [1]:
# For this practical example we will need the following libraries and modules
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()

### Import df
Near Miss Undersampled unpreprocessed data.

In [2]:
df4 = pd.read_csv("df4.csv")

In [3]:
df4.head()

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,0.08,0.06,0.01,1.39,0.08,0.32,0.0,1.15,0.53,0.66,...,0.088,8.69,1.86,0.001,0.04,3.44,0.0,0.06,0.02,0
1,0.06,29.11,0.09,1.56,0.08,0.1,0.08,1.19,0.38,0.84,...,0.064,7.23,1.0,0.005,2.84,2.4,0.07,0.04,0.06,0
2,0.1,23.9,0.02,2.77,0.003,1.07,0.54,1.35,1.03,0.04,...,0.067,13.92,1.31,0.0,12.23,2.42,0.05,0.23,0.0,0
3,4.46,28.08,0.02,3.93,0.004,5.1,0.38,0.52,0.22,0.69,...,0.064,12.59,1.37,0.004,47.56,6.63,0.03,0.14,0.07,0
4,4.41,26.35,0.95,4.14,0.02,8.01,0.55,1.62,1.48,0.99,...,0.162,1.81,1.41,0.003,23.1,7.87,0.02,0.31,0.04,0


#### Split into featues and labels and check shape

In [4]:
X = df4.drop(['is_safe'],axis=1)
y = df4["is_safe"]

#### Preprocess data. 

In [5]:
# Import the scaling module
from sklearn.preprocessing import StandardScaler

# Create a scaler object
scaler = StandardScaler()

In [6]:
# fit and transform on the input data at a single time and converts the data points. 
# Scale the features and store them in a new variable (the actual scaling procedure)
X_scaled = pd.DataFrame(scaler.fit_transform(X),columns=X.columns,index=X.index)

In [7]:
print(f"Shape of X_scaled a shape of {X_scaled.shape}\n")
print(f"Shape of 'y' targets a shape of {y.shape}")

Shape of X_scaled a shape of (11384, 20)

Shape of 'y' targets a shape of (11384,)


In [8]:
df4["is_safe"].value_counts()

0    5692
1    5692
Name: is_safe, dtype: int64

#### Split into train and test sets.

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train,X_val,y_train,y_val = train_test_split(X_scaled,y,random_state=22,train_size=.8)

#### Bring in the test dataset

In [11]:
test_df = pd.read_csv("test_df.csv")

In [12]:
X_test = test_df.drop(['is_safe'],axis=1)
y_test = test_df["is_safe"]

In [13]:
# fit and transform on the input data at a single time and converts the data points. 
# Scale the features and store them in a new variable (the actual scaling procedure)
X_test_scaled = pd.DataFrame(scaler.transform(X_test),columns=X_test.columns,index=X_test.index)

## Build Models

In [14]:
# Models
# import torch
# import torch.nn as nn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Metrics
from sklearn.metrics import accuracy_score, classification_report, roc_curve

# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

print('Packages imported...')

Packages imported...


In [15]:
# instantiating the object and fitting
clf = SVC(kernel='linear', C=1, random_state=22).fit(X_train,y_train)

# predicting the values
y_pred = clf.predict(X_val)

# printing the test accuracy
print("The test accuracy score of SVM is ", accuracy_score(y_val, y_pred))

The test accuracy score of SVM is  0.8001756697408872


In [16]:

# # instantiating the object
# svm = SVC()

# # setting a grid - not so extensive
# parameters = {"C":np.arange(1,10,1),'gamma':[0.005,0.01,0.05,0.1]}

# # instantiating the GridSearchCV object
# searcher = GridSearchCV(svm, parameters)

# # fitting the object
# searcher.fit(X_train_scaled, y_train)

# # the scores
# print("The best params are :", searcher.best_params_)
# print("The best score is   :", searcher.best_score_)


In [17]:

# # predicting the values
# y_pred = searcher.predict(X_test_scaled)

# # printing the test accuracy
# print("The test accuracy score of SVM after hyper-parameter tuning is ", accuracy_score(y_test, y_pred))

### Train the Logistic Regression model

In [18]:
reg = LogisticRegression()

In [19]:
reg.fit(X_train,y_train)

LogisticRegression()

In [20]:
reg.score(X_train,y_train)

0.8003733391896344

### Testing the model

In [21]:
#validation testing
reg.score(X_val,y_val)

0.7944664031620553

In [22]:
reg.score(X_test_scaled,y_test)

0.79875

### Train the Gradient Boosting Classifier model

In [23]:
# instantiate the classifier
gbt = GradientBoostingClassifier(n_estimators = 300,max_depth=1,subsample=0.8,max_features=0.2,random_state=42)

# fitting the model
gbt.fit(X_train,y_train)


GradientBoostingClassifier(max_depth=1, max_features=0.2, n_estimators=300,
                           random_state=42, subsample=0.8)

In [24]:
# predicting values with validation set
y_val_pred = gbt.predict(X_val)
print("The validation accuracy score of Gradient Boosting Classifier is ", accuracy_score(y_val, y_val_pred))

The test accuracy score of Gradient Boosting Classifier is  0.8840579710144928


In [25]:
# predicting values with test set
y_test_pred = gbt.predict(X_test_scaled)
print("The test accuracy score of Gradient Boosting Classifier is ", accuracy_score(y_test, y_test_pred))

The test accuracy score of Gradient Boosting Classifier is  0.888125
