In [1]:
# importing libraries
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the Iris dataset and discard the "Id" column in a single chained step,
# leaving the four measurement columns plus Species.
df = pd.read_csv('Iris.csv').drop(columns=["Id"])
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
T = True  # optional task if T == False

# Compulsory task (T is True): Iris-setosa -> 0, both other species -> 1.
# Optional task: every species keeps its own integer code (0, 1, 2).
species_codes = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 1 if T == True else 2,
}
df['Species'] = df['Species'].map(species_codes)

In [4]:
# Display the frame again to check that Species now holds integer codes.
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,1
146,6.3,2.5,5.0,1.9,1
147,6.5,3.0,5.2,2.0,1
148,6.2,3.4,5.4,2.3,1


In [5]:
# Display name -> integer code for each class; the insertion order follows the codes.
target = (
    {'Iris-setosa': 0, 'Non-Iris-setosa': 1}
    if T == True
    # optional task
    else {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
)


In [6]:
# Separate the feature matrix from the encoded labels.
X = df.drop(columns="Species").values
y = df['Species'].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

# Min-max scale using statistics learned from the training rows only.
# The earlier whole-dataset preprocessing.scale(X) call was removed: it computed
# its mean/std on train and test rows together, and because it is a per-feature
# affine rescale it is cancelled out by the min-max scaling below, so the values
# fed to the model are unchanged. The no-op reshape(-1, 4) is gone as well.
sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)  # reuse the training min/max; never refit on test data


In [7]:
# Fit a logistic-regression classifier on the scaled training data.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predict on the held-out data. The result is kept 1-D so that it has the same
# shape as y_test: a (n, 1) column vector compared element-wise with a (n,)
# array would silently broadcast to an (n, n) matrix.
y_pred = log_reg.predict(X_test)

In [8]:
# Mean accuracy on the held-out split, taken from the estimator's own score method.
score = log_reg.score(X_test, y_test)

print(f'Accuracy: {score}')


Accuracy: 1.0


In [9]:
# Class display names, in the same order as their integer codes.
classes = list(target)

# Pin the label set explicitly so the matrix always has one row and one column
# per class, in the same order as `classes`, even when a class happens to be
# absent from this test split (otherwise the DataFrame construction would fail).
conf_mat = confusion_matrix(y_test, y_pred, labels=list(target.values()))

# Rows are the true class, columns the predicted class.
cm_df = pd.DataFrame(conf_mat, index=classes, columns=classes)
cm = cm_df.to_numpy()
cm_df

Unnamed: 0,Iris-setosa,Non-Iris-setosa
Iris-setosa,10,0
Non-Iris-setosa,0,20


## The confusion matrix for the compulsory task

| True Positives (TP) | False Negatives (FN) |
|---------------------|----------------------|
| 10                  | 0                    |

| False Positives (FP) | True Negatives (TN) |
|----------------------|---------------------|
| 0                    | 20                  |

- **TP:** correctly predicted Iris-setosa.
- **FN:** incorrectly predicted not-Iris-setosa when it was Iris-setosa.
- **FP:** incorrectly predicted Iris-setosa when it was not-Iris-setosa.
- **TN:** correctly predicted not-Iris-setosa.

To calculate the metrics, the following equations will be used:
 $$\text{Accuracy} = \frac{TP + TN}{TP + FN + TN + FP}$$
 $$\text{Precision} = \frac{TP}{TP + FP}$$
 $$\text{Recall} = \frac{TP}{TP + FN}$$


In [10]:
# Read the four counts out of the confusion matrix, treating Iris-setosa
# (row/column 0) as the positive class. Rows are true labels, columns predictions.
TP = cm[0][0]  # setosa predicted as setosa
FN = cm[0][1]  # setosa predicted as not-setosa
FP = cm[1][0]  # not-setosa predicted as setosa
TN = cm[1][1]  # not-setosa predicted as not-setosa

In [11]:
# Hand-computed metrics; only meaningful (and only printed) for the compulsory
# binary task, i.e. when T is True.
if T == True:
    n_correct = TP + TN
    n_total = TP + FN + TN + FP

    accuracy = n_correct / n_total
    precision = TP / (TP + FP)  # share of setosa predictions that were right
    recall = TP / (TP + FN)     # share of actual setosa samples that were found

    print(f'Accuracy: {accuracy} \nPrecision: {precision}\nRecall: {recall}')


Accuracy: 1.0 
Precision: 1.0
Recall: 1.0


These metrics indicate that, on the held-out test set, the model distinguishes Iris-setosa from the other species without any misclassifications.

In [12]:
# Micro-averaged F1 over all test predictions. For single-label classification
# this equals plain accuracy, which is presumably why it is printed under that label.
av_f1 = f1_score(y_test, y_pred, average='micro')
print(f'Accuracy: {av_f1}')

# Per-class F1 scores; the class with the smallest score is the hardest to predict.
f = f1_score(y_test, y_pred, average=None)
lowest_score = f.min()
hardest_class = classes[f.argmin()]  # first class attaining the minimum
#print('Hardest class:', hardest_class) # there is no hardest class since the model is perfectly fit.

Accuracy: 1.0


In [13]:
# One-vs-rest precision and recall for Iris-setosa: convert the true and
# predicted labels into boolean "is setosa" masks and score those masks.
setosa = classes.index('Iris-setosa')
is_setosa_true = y_test == setosa
is_setosa_pred = y_pred == setosa

prec = precision_score(is_setosa_true, is_setosa_pred)
rec = recall_score(is_setosa_true, is_setosa_pred)

print('Precision:', prec)
print('Recall:', rec)

Precision: 1.0
Recall: 1.0
