# UFO Exam Assignment - Code Appendix

- Allan Simonsen
- Jean-Poul Leth-Møller
- Nina Lisakowski

This notebook is a part of an experiment for a research paper for the UFO exam.  
The point of the notebook is to work with the iris dataset by training 3 different supervised and 3 different unsupervised algorithms.

In [347]:
#The following libraries and modules are needed.

# import pandas for structuring the data
import pandas as pd

#numpy for splitting the data
import numpy as np

#Iris data set import
from sklearn.datasets import load_iris

#Split method import
from sklearn.model_selection import train_test_split

#Import Naive Bayes
from sklearn.naive_bayes import GaussianNB

#Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

#Import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

#Import for meanshift and bandwidth calculation
from sklearn.cluster import  MeanShift, estimate_bandwidth

#Import for DBSCAN
from sklearn.cluster import DBSCAN

#import for KMeans
from sklearn.cluster import KMeans

#Metrics and processing
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import decomposition
from sklearn import preprocessing
from typing import List

## Iris data set import
Import the iris dataset with labels. Consist of 4 different features and 3 labels

In [348]:
csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = pd.read_csv(csv_url, header = None)
col_names = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width','Species']
iris =  pd.read_csv(csv_url, names = col_names)
iris.head(20)

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [349]:
iris.shape

(150, 5)

In [350]:
iris['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

# Preparing the dataset
Splitting the dataset into training and test dataset, also encodes the labels into numbers so we can score the cluster models.

In [309]:
y = iris.filter(['Species'], axis=1).values
X = iris.filter(['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width'], axis=1).values

In [310]:
test_set_size = 0.2
seed = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=seed, stratify=y)


In [311]:
X_test.shape

(30, 4)

In [312]:
le = preprocessing.LabelEncoder()
le.fit(['Iris-virginica', 'Iris-versicolor', 'Iris-setosa' ])
y_train_transformed = le.transform(y_train.ravel())
y_test_transformed = le.transform(y_test.ravel())

# Naive Bayes

In [313]:
naive_model = GaussianNB()
naive_model.fit(X_train, y_train.ravel())

GaussianNB()

In [314]:
naive_model.score(X_train, y_train)

0.9583333333333334

In [315]:
prediction = naive_model.predict(X_train)
print(confusion_matrix(y_train, prediction))

[[40  0  0]
 [ 0 37  3]
 [ 0  2 38]]


In [344]:
prediction = naive_model.predict(X_test)
print(accuracy_score(y_test, prediction))

0.9666666666666667


In [345]:
print(confusion_matrix(y_test, prediction))

[[10  0  0]
 [ 0 10  0]
 [ 0  1  9]]


In [346]:
print(classification_report(y_test, prediction))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.91      1.00      0.95        10
 Iris-virginica       1.00      0.90      0.95        10

       accuracy                           0.97        30
      macro avg       0.97      0.97      0.97        30
   weighted avg       0.97      0.97      0.97        30



# Decision Tree

In [318]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train.ravel())

DecisionTreeClassifier()

In [319]:
tree_model.score(X_train, y_train)

1.0

In [320]:
prediction = tree_model.predict(X_train)
print(confusion_matrix(y_train, prediction))

[[40  0  0]
 [ 0 40  0]
 [ 0  0 40]]


In [340]:
prediction = tree_model.predict(X_test)
print(accuracy_score(y_test, prediction))

0.9666666666666667


In [341]:
print(confusion_matrix(y_test, prediction_tree))

[[10  0  0]
 [ 0 10  0]
 [ 0  1  9]]


In [342]:
print(classification_report(y_test, prediction))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.91      1.00      0.95        10
 Iris-virginica       1.00      0.90      0.95        10

       accuracy                           0.97        30
      macro avg       0.97      0.97      0.97        30
   weighted avg       0.97      0.97      0.97        30



# RandomForest

In [323]:
forest_model = RandomForestClassifier()
forest_model.fit(X_train, y_train.ravel())

RandomForestClassifier()

In [254]:
forest_model.score(X_train, y_train)

1.0

In [255]:
prediction = forest_model.predict(X_train)
print(confusion_matrix(y_train, prediction))

[[40  0  0]
 [ 0 40  0]
 [ 0  0 40]]


In [337]:
prediction = forest_model.predict(X_test)
print(accuracy_score(y_test, prediction))

0.9666666666666667


In [338]:
print(confusion_matrix(y_test, prediction))

[[10  0  0]
 [ 0 10  0]
 [ 0  1  9]]


In [339]:
print(classification_report(y_test, prediction))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.91      1.00      0.95        10
 Iris-virginica       1.00      0.90      0.95        10

       accuracy                           0.97        30
      macro avg       0.97      0.97      0.97        30
   weighted avg       0.97      0.97      0.97        30



# Meanshift

In [382]:
bandwidth = estimate_bandwidth(X_train, quantile=0.2, n_samples=300)
bandwidth

0.8989162948462586

In [383]:
msmodel = MeanShift(bandwidth=bandwidth, bin_seeding=True)
msmodel.fit(X_train)
labels = msmodel.labels_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
n_clusters_

3

Good sign that it found 3 clusters

In [384]:
prediction = msmodel.predict(X_train)
print(accuracy_score(y_train_transformed, prediction))

0.13333333333333333


In [385]:
print(confusion_matrix(y_train_transformed, prediction))

[[ 0 40  0]
 [40  0  0]
 [24  0 16]]


In [386]:
prediction = msmodel.predict(X_test)
print(accuracy_score(y_test_transformed, prediction))

0.13333333333333333


In [387]:
print(confusion_matrix(y_test_transformed, prediction))

[[ 0 10  0]
 [ 8  2  0]
 [ 8  0  2]]


In [336]:
print(classification_report(y_test_transformed, prediction))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.17      0.20      0.18        10
           2       1.00      0.20      0.33        10

    accuracy                           0.13        30
   macro avg       0.39      0.13      0.17        30
weighted avg       0.39      0.13      0.17        30



# DBScan

In [264]:
db = DBSCAN(eps=0.3, min_samples=5).fit(X_train)
labels = db.labels_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
n_clusters_

3

In [265]:
#https://stackoverflow.com/a/51516334/17384594
def dbscan_predict(model, X):

    nr_samples = X.shape[0]

    y_new = np.ones(shape=nr_samples, dtype=int) * -1

    for i in range(nr_samples):
        diff = model.components_ - X[i, :]  # NumPy broadcasting

        dist = np.linalg.norm(diff, axis=1)  # Euclidean distance

        shortest_dist_idx = np.argmin(dist)

        if dist[shortest_dist_idx] < model.eps:
            y_new[i] = model.labels_[model.core_sample_indices_[shortest_dist_idx]]

    return y_new

In [266]:
prediction = dbscan_predict(db, X_train)
print(accuracy_score(y_train_transformed, prediction))

0.3416666666666667


In [267]:
print(confusion_matrix(y_train_transformed, prediction))

[[ 0  0  0  0]
 [11 29  0  0]
 [28  0 12  0]
 [40  0  0  0]]


In [331]:
prediction = dbscan_predict(db, X_test)
print(accuracy_score(y_test_transformed, prediction))

0.23333333333333334


In [332]:
print(confusion_matrix(y_test_transformed, prediction))

[[ 0  0  0  0]
 [ 3  7  0  0]
 [10  0  0  0]
 [10  0  0  0]]


In [333]:
print(classification_report(y_test_transformed, prediction))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       1.00      0.70      0.82        10
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00        10

    accuracy                           0.23        30
   macro avg       0.25      0.17      0.21        30
weighted avg       0.33      0.23      0.27        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# K-Means

In [298]:
kmeans = KMeans().fit(X_train)
labels = kmeans.labels_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
n_clusters_

8

Bad sign that it found 8 clusters. We force it to only use 3 instead.

In [299]:
kmeans = KMeans(n_clusters=3).fit(X_train)
labels = kmeans.labels_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
n_clusters_

3

In [300]:
prediction = kmeans.predict(X_train)
print(accuracy_score(y_train_transformed, prediction))

0.43333333333333335


In [301]:
print(confusion_matrix(y_train_transformed, prediction))

[[40  0  0]
 [ 0  2 38]
 [ 0 30 10]]


In [328]:
prediction = kmeans.predict(X_test)
print(accuracy_score(y_test_transformed, prediction))

0.4666666666666667


In [329]:
print(confusion_matrix(y_test_transformed, prediction))

[[10  0  0]
 [ 0  0 10]
 [ 0  6  4]]


In [330]:
print(classification_report(y_test_transformed, prediction))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.00      0.00      0.00        10
           2       0.29      0.40      0.33        10

    accuracy                           0.47        30
   macro avg       0.43      0.47      0.44        30
weighted avg       0.43      0.47      0.44        30



# Result summary

### Naive Bayes: 97%
### Decision Tree: 97%
### Random Forest: 97%
### Mean Shift: 13%
### DBSCAN: 23%
### K-Means: 47%