# SKLEARN workshop

## Loading the dataset and declaring the variables

In [23]:
#import the iris dataset loader from scikit-learn
from sklearn.datasets import load_iris

In [24]:
#Load the iris dataset for this example, then separate it into
#the feature matrix (X) and the target labels (y)
iris = load_iris()
X, y = iris.data, iris.target

In [25]:
#first ten data points (one row per sample, one column per feature)
X[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [26]:
#first ten target values (one integer class label per sample)
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [27]:
#What do these numbers mean? Let's start with the feature names:
#each column of X holds one of these measurements
features = iris.feature_names
features

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [28]:
#The target names: each integer label in y is an index into this array
targets = iris.target_names
targets

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

## Splitting the Data into Training and Testing Sets

In [29]:
#import the package
from sklearn.model_selection import train_test_split

In [30]:
#Split the features and labels into four variables: a training part and a testing part of each
#test_size - the fraction of the data (between 0 and 1) held out for testing; 0.3 keeps 30% for the test set
#random_state - seed for the random shuffle; any fixed integer makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [31]:
#For example, let's take a look at the X_test data and its shape
print(X_test)
print(X_test.shape)

[[5.8 4.  1.2 0.2]
 [5.1 2.5 3.  1.1]
 [6.6 3.  4.4 1.4]
 [5.4 3.9 1.3 0.4]
 [7.9 3.8 6.4 2. ]
 [6.3 3.3 4.7 1.6]
 [6.9 3.1 5.1 2.3]
 [5.1 3.8 1.9 0.4]
 [4.7 3.2 1.6 0.2]
 [6.9 3.2 5.7 2.3]
 [5.6 2.7 4.2 1.3]
 [5.4 3.9 1.7 0.4]
 [7.1 3.  5.9 2.1]
 [6.4 3.2 4.5 1.5]
 [6.  2.9 4.5 1.5]
 [4.4 3.2 1.3 0.2]
 [5.8 2.6 4.  1.2]
 [5.6 3.  4.5 1.5]
 [5.4 3.4 1.5 0.4]
 [5.  3.2 1.2 0.2]
 [5.5 2.6 4.4 1.2]
 [5.4 3.  4.5 1.5]
 [6.7 3.  5.  1.7]
 [5.  3.5 1.3 0.3]
 [7.2 3.2 6.  1.8]
 [5.7 2.8 4.1 1.3]
 [5.5 4.2 1.4 0.2]
 [5.1 3.8 1.5 0.3]
 [6.1 2.8 4.7 1.2]
 [6.3 2.5 5.  1.9]
 [6.1 3.  4.6 1.4]
 [7.7 3.  6.1 2.3]
 [5.6 2.5 3.9 1.1]
 [6.4 2.8 5.6 2.1]
 [5.8 2.8 5.1 2.4]
 [5.3 3.7 1.5 0.2]
 [5.5 2.3 4.  1.3]
 [5.2 3.4 1.4 0.2]
 [6.5 2.8 4.6 1.5]
 [6.7 2.5 5.8 1.8]
 [6.8 3.  5.5 2.1]
 [5.1 3.5 1.4 0.3]
 [6.  2.2 5.  1.5]
 [6.3 2.9 5.6 1.8]
 [6.6 2.9 4.6 1.3]]
(45, 4)


In [32]:
#For comparison, here is the shape of the entire dataset
X.shape

(150, 4)

## Training the Model

In [33]:
#The model for this workshop is a k-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

#Create the classifier and keep it in a variable; n_neighbors sets how many neighbours it uses (3 here)
knn_classifier = KNeighborsClassifier(n_neighbors=3)

In [34]:
#Fit the model using the training data (features X_train with their known labels y_train)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [35]:
#Using the fitted model, predict a class label for every row of the testing data
y_predictions = knn_classifier.predict(X_test)
y_predictions

array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 1, 2,
       1])

In [36]:
#The true labels (y_test) for comparison with the predictions above
y_test

array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 2, 2,
       1])

## Measuring Accuracy

In [37]:
#import the metrics package
from sklearn import metrics

In [38]:
#Accuracy score: compares the true test labels with the model's predictions
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_predictions)
accuracy

0.9777777777777777

## Testing the model

In [39]:
#We will create a sample matrix of 5 made-up data points, one per row,
#each with 4 numerical values (one value per feature)
sample = [
    [1, 2, 3, 4],
    [5, 6, 5, 4],
    [3, 2, 1, 2],
    [3, 4, 5, 6],
    [5, 4, 3, 2],
]

#Generate a predicted class label for every row of the sample
predictions = knn_classifier.predict(sample)
predictions

array([1, 2, 0, 2, 1])

In [40]:
#Let's make this array more readable: look up the species name for each predicted label.
#A list comprehension builds the list in one step and does not leave a loop variable behind.
species = [targets[label] for label in predictions]

species

['versicolor', 'virginica', 'setosa', 'virginica', 'versicolor']

## Saving a model for future use

In [44]:
#import the joblib package
import joblib

In [45]:
#Dumps the fitted model into a new file named 'iris_classifier_knn.joblib'
joblib.dump(knn_classifier, 'iris_classifier_knn.joblib')

['iris_classifier_knn.joblib']

In [46]:
#Let's get the model back: load it from the file and keep it in a variable so it can be reused
#NOTE: joblib files are pickle-based - only load files that come from a source you trust
loaded_classifier = joblib.load('iris_classifier_knn.joblib')
loaded_classifier

KNeighborsClassifier(n_neighbors=3)