In [1]:
# For this practical example we will need the following libraries and modules
import numpy as np
import pandas as pd

### Import df
Load the Near Miss undersampled, unpreprocessed data.

In [2]:
# Load df4.csv (path is relative to the working directory) into the DataFrame
# that the feature/label split below is built from.
df4 = pd.read_csv("df4.csv")

In [3]:
# Preview the first rows to sanity-check the columns that were loaded.
df4.head()

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,-0.463465,-1.601443,-0.599632,-0.146304,1.03189,-0.723628,-0.913738,0.526444,-0.555002,1.032811,...,-0.196525,-0.203776,0.924846,-1.413097,-0.928625,0.223835,-1.726831,-0.611672,-0.91704,0
1,-0.479272,1.670552,-0.282946,-0.006519,1.03189,-0.80933,-0.618149,0.587648,-0.899516,1.579133,...,-0.609145,-0.467236,-0.575412,-0.064914,-0.770323,-0.223927,0.706129,-0.750986,0.56971,0
2,-0.447657,1.083733,-0.560046,0.988423,-1.104207,-0.431464,1.081485,0.832463,0.593377,-0.848965,...,-0.557568,0.73999,-0.034621,-1.750143,-0.239447,-0.215316,0.010997,0.572505,-1.660415,0
3,2.998318,1.55454,-0.560046,1.942252,-1.076465,1.138432,0.490308,-0.437516,-1.266997,1.123865,...,-0.609145,0.499988,0.070048,-0.40196,1.757984,1.597261,-0.684134,-0.054412,0.941398,0
4,2.9588,1.359684,3.121421,2.114928,-0.632601,2.272029,1.118433,1.245589,1.626919,2.034401,...,1.075721,-1.445288,0.139827,-0.739005,0.375104,2.131132,-1.0317,1.129764,-0.173665,0


#### Split into features and labels and check shape

In [4]:
# Labels: the "is_safe" column.
y = df4["is_safe"]

# Features: every column of df4 except the label.
X = df4.drop(columns=["is_safe"])

#### Preprocess data. 

In [5]:
# Import the scaling module
from sklearn.preprocessing import StandardScaler

# Create the scaler object with its default settings; it is fitted in the next cell.
scaler = StandardScaler()

In [6]:
# Fit the scaler on X and transform X in a single step, then wrap the resulting
# array back into a DataFrame that keeps X's column names and row index.
# NOTE(review): the train/validation split further down is called on X, not on
# X_scaled, so this scaled frame is only used by the shape printout - confirm
# which of the two is intended.
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    index=X.index,
    columns=X.columns,
)

In [7]:
# Report the dimensions of the scaled feature matrix and of the target vector.
# A single print with sep="\n" emits exactly the same text as two separate calls.
print(
    f"Shape of X_scaled a shape of {X_scaled.shape}\n",
    f"Shape of 'y' targets a shape of {y.shape}",
    sep="\n",
)

Shape of X_scaled a shape of (11384, 20)

Shape of 'y' targets a shape of (11384,)


In [8]:
# Class balance check: number of rows per "is_safe" label.
df4["is_safe"].value_counts()

0    5692
1    5692
Name: is_safe, dtype: int64

#### Split into train and validation sets.

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the rows as a validation split; the seed is fixed so the
# split is reproducible.
# NOTE(review): this splits X, not the X_scaled frame built earlier - confirm
# that the unscaled features are the intended model input.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=22,
)

#### Bring in the test dataset

In [11]:
# Load the separate test set from test_df.csv (path relative to the working directory).
test_df = pd.read_csv("test_df.csv")

In [12]:
# Preview the first rows of the test set to check it has the same columns as df4.
test_df.head()

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,-0.51879,-1.394198,-0.560046,-0.138081,1.309305,-0.824912,-0.655098,0.388735,0.708215,-0.97037,...,1.539919,-0.481672,-1.081313,0.609178,-0.697957,-1.158201,1.748826,-0.890301,-1.660415,0
1,2.373933,0.849456,-0.203775,-0.499878,-0.632601,-0.022434,-0.913738,1.016075,1.075697,1.548782,...,-1.26246,-1.5698,1.884312,1.283269,1.582721,1.605872,1.748826,0.224217,1.684773,0
2,-0.51879,-0.490879,-0.480875,-0.804117,0.754475,-0.801539,-0.765943,-1.14136,-0.463132,-0.97037,...,-1.314038,-1.237768,-2.058224,0.609178,-0.906576,-1.072093,-0.684134,-0.890301,0.56971,0
3,-0.502983,-0.087652,-0.441289,-0.639664,-0.632601,-0.844389,-0.83984,-1.171962,1.23647,-0.97037,...,-0.196525,-0.059414,-1.674438,1.620315,-0.435628,-0.779325,1.748826,-0.472357,0.56971,0
4,-0.502983,-0.375994,-0.243361,-0.976793,-0.077771,-0.77427,-0.655098,-1.202564,0.731183,-0.97037,...,0.886603,0.969164,-0.732415,0.272132,-0.612587,-0.985984,0.706129,-0.472357,-1.288728,0


In [13]:
# Labels of the test set: the "is_safe" column.
y_test = test_df["is_safe"]

# Features of the test set: every column except the label.
X_test = test_df.drop(columns=["is_safe"])

## Build Models

In [14]:
# Models
# import torch
# import torch.nn as nn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Metrics
from sklearn.metrics import accuracy_score, classification_report, roc_curve

# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

print('Packages imported...')

Packages imported...


In [15]:
# Instantiate a linear-kernel support vector classifier and fit it on the
# training split.
clf = SVC(kernel='linear', C=1, random_state=22).fit(X_train,y_train)

# Predict labels for the held-out validation split.
y_pred_val = clf.predict(X_val)

# The score below is computed against y_val, so it is reported as validation
# accuracy (the test-set accuracy is printed in the next cell).
print("The validation accuracy score of SVM is ", accuracy_score(y_val, y_pred_val))

The test accuracy score of SVM is  0.7992973210364515


In [16]:
# Use the already-fitted SVM to predict labels for the separate test set.
y_test_pred_svm = clf.predict(X_test)

# Report the fraction of test rows that were classified correctly.
print(
    "The test accuracy score of SVM is ",
    accuracy_score(y_test, y_test_pred_svm),
)

The test accuracy score of SVM is  0.79625


In [17]:

# # instantiating the object
# svm = SVC()

# # setting a grid - not so extensive
# parameters = {"C":np.arange(1,10,1),'gamma':[0.005,0.01,0.05,0.1]}

# # instantiating the GridSearchCV object
# searcher = GridSearchCV(svm, parameters)

# # fitting the object
# searcher.fit(X_train_scaled, y_train)

# # the scores
# print("The best params are :", searcher.best_params_)
# print("The best score is   :", searcher.best_score_)


In [18]:

# # predicting the values
# y_pred = searcher.predict(X_test_scaled)

# # printing the test accuracy
# print("The test accuracy score of SVM after hyper-parameter tuning is ", accuracy_score(y_test, y_pred))

### Train the Logistic Regression model

In [19]:
# Logistic regression classifier with scikit-learn's default settings.
reg = LogisticRegression()

In [20]:
# Fit on the training split; as the cell's last expression, the fitted estimator is displayed.
reg.fit(X_train,y_train)

LogisticRegression()

In [21]:
# Accuracy on the same rows the model was trained on (baseline for spotting overfitting).
reg.score(X_train,y_train)

0.7965301416492808

### Testing the model

In [22]:
# Accuracy on the held-out validation split.
reg.score(X_val,y_val)

0.7834870443566095

In [23]:
# Accuracy on the separate test set loaded from test_df.csv.
reg.score(X_test,y_test)

0.798125

### Train the Gradient Boosting Classifier model

In [24]:
# Gradient boosting with 300 stages; max_depth=1 limits every tree to a single
# split, and subsample / max_features below 1.0 make each stage use only part
# of the rows / features. The seed is fixed for reproducibility.
gbt = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=1,
    subsample=0.8,
    max_features=0.2,
    random_state=42,
)

# Train on the training split; as the cell's last expression, the fitted
# estimator is displayed.
gbt.fit(X_train, y_train)


GradientBoostingClassifier(max_depth=1, max_features=0.2, n_estimators=300,
                           random_state=42, subsample=0.8)

In [25]:
# Predict labels for the held-out validation split.
y_pred_gbt = gbt.predict(X_val)

# The score below is computed against y_val, so it is reported as validation
# accuracy (the test-set accuracy is printed in the next cell).
print("The validation accuracy score of Gradient Boosting Classifier is ", accuracy_score(y_val, y_pred_gbt))

The test accuracy score of Gradient Boosting Classifier is  0.8884497145366711


In [26]:
# Use the already-fitted gradient boosting model to predict labels for the test set.
y_test_pred = gbt.predict(X_test)

# Report the fraction of test rows that were classified correctly.
print(
    "The test accuracy score of Gradient Boosting Classifier is ",
    accuracy_score(y_test, y_test_pred),
)

The test accuracy score of Gradient Boosting Classifier is  0.888125


In [None]:
# from sklearn.model_selection import train_test_split

In [None]:
# X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,random_state=22,train_size=.8)

In [4]:
def funcc():
    """Return the constant 5."""
    return 5

funcc()

5