# SKLearn library

In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
# Loading dataset

# Data Preprocessing

In [4]:
# Binarization
# Preprocessing technique that converts numerical values into boolean (0/1) values
# by comparing each value against a threshold.
arr = np.array([
    [2.1, -1.9, 5.5],
    [-1.5, 2.4, 3.5],
    [0.5, -7.9, 5.6],
    [5.9, 2.3, -5.8],
])
# Values strictly greater than 0.5 become 1; everything else (0.5 itself included) becomes 0.
binarizer = preprocessing.Binarizer(threshold=0.5)
data_binarized = binarizer.transform(arr)
data_binarized

array([[1., 0., 1.],
       [0., 1., 1.],
       [0., 0., 1.],
       [1., 1., 0.]])

In [6]:
# Mean removal (standardization)
# Remove the mean from every feature (column) so that each one is centred on zero,
# and rescale it to unit standard deviation.
data_scaled = preprocessing.scale(arr)
column_means = data_scaled.mean(axis=0)
column_stds = data_scaled.std(axis=0)
print("Mean_removed =", column_means)
print("Stddeviation_removed =", column_stds)

Mean_removed = [1.11022302e-16 0.00000000e+00 0.00000000e+00]
Stddeviation_removed = [1. 1. 1.]


In [7]:
# Scaling
# Feature values should not be very large or very small in magnitude relative to each other.
# MinMaxScaler maps each feature (column) linearly onto the given feature_range,
# so the shape of each feature's distribution is preserved.
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaler_minmax.fit(arr)  # learn each column's minimum and maximum
data_scaled_minmax = data_scaler_minmax.transform(arr)
print("Min max scaled data:\n", data_scaled_minmax)


Min max scaled data:
 [[0.48648649 0.58252427 0.99122807]
 [0.         1.         0.81578947]
 [0.27027027 0.         1.        ]
 [1.         0.99029126 0.        ]]


In [9]:
# Normalization
# Rescales each feature vector (row) so that vectors can be compared on a common scale.
# Two kinds of normalization are shown in this notebook: L1 and L2.
# L1 normalization (associated with "least absolute deviations"):
# each row is divided by the sum of its absolute values, so the absolute values
# in every row add up to 1.
data_normalized_l1 = preprocessing.normalize(arr, norm="l1", axis=1)
print("L1 normalized data:\n", data_normalized_l1)

L1 normalized data:
 [[ 0.22105263 -0.2         0.57894737]
 [-0.2027027   0.32432432  0.47297297]
 [ 0.03571429 -0.56428571  0.4       ]
 [ 0.42142857  0.16428571 -0.41428571]]


In [10]:
# L2 normalization (associated with "least squares"):
# each row is divided by its Euclidean length, so the squared values
# in every row add up to 1.
data_normalized_l2 = preprocessing.normalize(arr, norm="l2", axis=1)
print("L2 normalized data:\n", data_normalized_l2)

L2 normalized data:
 [[ 0.33946114 -0.30713151  0.88906489]
 [-0.33325106  0.53320169  0.7775858 ]
 [ 0.05156558 -0.81473612  0.57753446]
 [ 0.68706914  0.26784051 -0.6754239 ]]


# Import dataset

In [11]:
import seaborn as sns

In [12]:
# Load the iris example dataset: four measurement columns plus the `species` label.
# NOTE(review): seaborn's example datasets presumably need network access on first load — confirm.
dataset = sns.load_dataset('iris')
dataset

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [13]:
# Features: every column except the label. Target: the `species` column.
x = dataset.drop(columns=['species'])
y = dataset['species']

In [15]:
# Train/test split
# Hold out 30% of the rows for testing; random_state=0 makes the shuffle reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [16]:
# Sanity check: training features should hold 70% of the rows and all 4 feature columns.
x_train.shape

(105, 4)

In [17]:
# Sanity check: test features should hold the remaining 30% of the rows.
x_test.shape

(45, 4)

In [18]:
# Sanity check: one training label per training row.
y_train.shape

(105,)

In [20]:
# Sanity check: one test label per test row.
y_test.shape

(45,)

# Models

In [21]:
# SVM
# Used for classification and regression. The task is to find the best decision boundary; the best decision boundary is called the hyperplane,
# and the extreme data points closest to it are called support vectors.
# Used for image classification and text classification.
# 2 types: 1. linear SVM: the data can be separated using one straight line. 2. non-linear SVM: the data cannot be separated using one straight line.


In [23]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same transformation to both splits.
st_x = StandardScaler()
st_x.fit(x_train)
x_train = st_x.transform(x_train)
x_test = st_x.transform(x_test)

In [24]:
from sklearn.svm import SVC  # Support Vector Classifier

# Linear-kernel SVM. fit() returns the estimator itself, so construction and training are chained.
classifier = SVC(kernel='linear', random_state=0).fit(x_train, y_train)
classifier  # display the fitted estimator

SVC(kernel='linear', random_state=0)

In [25]:
# Predict species labels for the held-out test features.
y_pred= classifier.predict(x_test) 

In [26]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [27]:
# Display the confusion matrix; off-diagonal entries are misclassified test samples.
cm

array([[16,  0,  0],
       [ 0, 17,  1],
       [ 0,  0, 11]], dtype=int64)

In [28]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9777777777777777

In [29]:
# NB
# Supervised learning algorithm. It is a probabilistic classifier.
# 3 common types of NB model: GaussianNB, BernoulliNB, MultinomialNB.
# Steps to implement:
# Data Pre-processing step
# Fitting Naive Bayes to the Training set
# Predicting the test result
# Test accuracy of the result(Creation of Confusion matrix)
# Visualizing the test set result.

In [30]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

# NOTE(review): x_train / x_test are overwritten in place. If the SVM section's scaling cell
# has already run, they are already standardized and this re-fit should leave them
# essentially unchanged — confirm whether the repeat is intended.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [31]:
# Fit Gaussian Naive Bayes on the training set
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
classifier = GaussianNB().fit(x_train, y_train)
classifier  # display the fitted estimator

GaussianNB()

In [32]:
# Predict species labels for the test set with the Naive Bayes model fitted above
y_pred = classifier.predict(x_test)  

In [33]:
# Accuracy: fraction of test samples predicted correctly
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [34]:
# Decision Tree
# Attribute selection measures: Gini index and information gain.
# Steps:
#   1. Data pre-processing
#   2. Fit a decision-tree model to the training set
#   3. Predict the test result
#   4. Test the accuracy of the result (creation of a confusion matrix)
#   5. Visualize the test-set result
# Feature scaling
from sklearn.preprocessing import StandardScaler

# NOTE(review): x_train / x_test are overwritten in place. If an earlier scaling cell has
# already run, they are already standardized and this re-fit should leave them
# essentially unchanged — confirm whether the repeat is intended.
st_x = StandardScaler()
st_x.fit(x_train)
x_train = st_x.transform(x_train)
x_test = st_x.transform(x_test)

In [36]:
# Fit a decision-tree classifier to the training set
from sklearn.tree import DecisionTreeClassifier

# criterion='entropy' selects splits by information gain; random_state=0 keeps the tree reproducible.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(x_train, y_train)
classifier  # display the fitted estimator

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [37]:
# Predict species labels for the test set with the decision tree fitted above
y_pred= classifier.predict(x_test)  

In [38]:
# Accuracy: fraction of test samples predicted correctly
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred)

0.9777777777777777

In [41]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
classifier = LogisticRegression(random_state=0).fit(x_train, y_train)
classifier  # display the fitted estimator

LogisticRegression(random_state=0)

In [42]:
# Predict species labels for the test set with the logistic-regression model fitted above
y_pred = classifier.predict(x_test)

In [43]:
# Accuracy: fraction of test samples predicted correctly
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred)

0.9777777777777777

In [47]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Create a random-forest classifier with 100 trees.
# random_state is fixed (matching the other models and the train/test split in this
# notebook) so that the bootstrap samples and feature subsets, and therefore the
# reported accuracy, are reproducible when the notebook is re-run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

In [48]:
# Train the model on the training dataset.
# fit() takes the training features and the training labels as its arguments.
clf.fit(x_train, y_train)

RandomForestClassifier()

In [49]:
# Predict species labels for the test dataset with the fitted random forest
y_pred = clf.predict(x_test)

In [50]:
# Accuracy of the random forest on the test set (accuracy_score was imported in an earlier cell).
accuracy_score(y_test,y_pred)

0.9777777777777777