In [1]:
import numpy as np
from math import dist
from scipy.spatial import distance

# Euclidean Distance

In [2]:
x = [12, 9, 14]
y = [15, 13, 3]

In [6]:
print(distance.euclidean(x, y))
print(distance.euclidean(y, x))

12.083045973594572
12.083045973594572


In [33]:
a = np.array(x)
b = np.array(y)

print(np.linalg.norm(a-b))
print(np.linalg.norm(b-a))
print("")

print(np.sqrt(np.sum(np.square(a-b))))
print(np.sqrt(np.sum(np.square(b-a))))
print("")

temp = a-b
print(np.sqrt(np.dot(temp, temp)))

12.083045973594572
12.083045973594572

12.083045973594572
12.083045973594572

12.083045973594572


In [37]:
print(dist(x, y))
print(dist(y, x))

12.08
12.08304597359457


# Manhattan Distance

In [38]:
i = [3, 9, 2]
j = [6, 5, 13]

In [39]:
print(distance.cityblock(i, j))
print(distance.cityblock(j, i))

18
18


In [41]:
k = np.array(i)
l = np.array(j)

print(sum(abs(k-l)))

18


# Error Functions

In [47]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from math import sqrt

In [42]:
actual = [3.88, 4.55, 5.22, 5.88, 6.56, 7.22, 7.89, 8.56]
predicted = [5.72, 6.37, 7.01, 7.66, 8.30, 8.95, 9.60, 10.25]

In [52]:
mae_val = mae(actual, predicted)

print("Mean Absolute Error: ", mae_val)
print(round(mae_val, 2))

Mean Absolute Error:  1.7625000000000002
1.76


In [53]:
mse_val = mse(actual, predicted)

print("Meas Square Error: ", mse_val)
print(round(mse_val, 2))

Meas Square Error:  3.1089
3.11


In [54]:
rmse_val = sqrt(mse(actual, predicted))

print("Root Mean Squared Error: ", rmse_val)
print(round(rmse_val, 2))

Root Mean Squared Error:  1.7632073048850496
1.76


# Normalisation

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("ML_assignment02_dataset_2021.csv")

In [4]:
print(df)
print(df.count())

    Colour  Radius (cm)  Weight (grams) Fruit (class)
0    Green          4.3             122          Pear
1    Green          4.6             152          Pear
2    Green          3.1              85         Apple
3    Green          3.6             173          Pear
4    Green          2.5              65         Lemon
5    Green          2.5              70         Apple
6    Green          2.7              73         Apple
7    Green          4.5             110          Pear
8    Green          2.5              86         Apple
9    Green          2.4              68         Lemon
10   Green          4.2             126          Pear
11     Red          3.7             101         Apple
12     Red          3.7             100         Apple
13     Red          3.4              80         Apple
14  Yellow          3.2              79         Apple
15  Yellow          3.1              69         Lemon
16  Yellow          2.1              64         Apple
17  Yellow          2.6     

In [5]:
# Delete duplicate samples.
def del_duplicate(dataframe):
    dataframe = dataframe.drop_duplicates()
    return dataframe

In [6]:
# Fill missing values (fields that have 0) with the mean of the corresponding class of the attribute
def fill_values(dataframe):
    dataframe.iloc[:, 1:3] = dataframe.groupby(dataframe.iloc[:,3]).transform(lambda x: x.replace(0, x.mean()))
    return dataframe

In [7]:
# Transform nominal attributes (not the classes!) into numerical values. 
def transform_nominal(dataframe): 
    dataframe = pd.concat([dataframe, pd.get_dummies(dataframe.iloc[:,0])], axis=1)
    dataframe = dataframe.drop(df.columns[0], axis=1)
    return dataframe

In [8]:
# Normalise the values of each numerical attribute with min-max method
def normalise_minmax(df):
    df[df.columns[:2]] = (df[df.columns[:2]] - df[df.columns[:2]].min()) / (df[df.columns[:2]].max() - df[df.columns[:2]].min())
    return df

In [9]:
df = del_duplicate(df)

df.to_csv("duplicates_removed_dataset.csv", index=False)

print(df)
print(df.count())

    Colour  Radius (cm)  Weight (grams) Fruit (class)
0    Green          4.3             122          Pear
1    Green          4.6             152          Pear
2    Green          3.1              85         Apple
3    Green          3.6             173          Pear
4    Green          2.5              65         Lemon
5    Green          2.5              70         Apple
6    Green          2.7              73         Apple
7    Green          4.5             110          Pear
8    Green          2.5              86         Apple
9    Green          2.4              68         Lemon
10   Green          4.2             126          Pear
11     Red          3.7             101         Apple
12     Red          3.7             100         Apple
13     Red          3.4              80         Apple
14  Yellow          3.2              79         Apple
15  Yellow          3.1              69         Lemon
16  Yellow          2.1              64         Apple
17  Yellow          2.6     

In [10]:
df = fill_values(df)

df.to_csv("missing_values_filled_dataset.csv", index=False)

print(df)
print(df.count())

    Colour  Radius (cm)  Weight (grams) Fruit (class)
0    Green     4.300000      122.000000          Pear
1    Green     4.600000      152.000000          Pear
2    Green     3.100000       85.000000         Apple
3    Green     3.600000      173.000000          Pear
4    Green     2.500000       65.000000         Lemon
5    Green     2.500000       70.000000         Apple
6    Green     2.700000       73.000000         Apple
7    Green     4.500000      110.000000          Pear
8    Green     2.500000       86.000000         Apple
9    Green     2.400000       68.000000         Lemon
10   Green     4.200000      126.000000          Pear
11     Red     3.700000      101.000000         Apple
12     Red     3.700000      100.000000         Apple
13     Red     3.400000       80.000000         Apple
14  Yellow     3.200000       79.000000         Apple
15  Yellow     3.100000       69.000000         Lemon
16  Yellow     2.100000       64.000000         Apple
17  Yellow     2.600000     

  dataframe.iloc[:, 1:3] = dataframe.groupby(dataframe.iloc[:,3]).transform(lambda x: x.replace(0, x.mean()))


In [11]:
df = transform_nominal(df)

df.to_csv("transformed_dataset.csv", index=False)

print(df)
print(df.count())

    Radius (cm)  Weight (grams) Fruit (class)  Green  Red  Yellow
0      4.300000      122.000000          Pear      1    0       0
1      4.600000      152.000000          Pear      1    0       0
2      3.100000       85.000000         Apple      1    0       0
3      3.600000      173.000000          Pear      1    0       0
4      2.500000       65.000000         Lemon      1    0       0
5      2.500000       70.000000         Apple      1    0       0
6      2.700000       73.000000         Apple      1    0       0
7      4.500000      110.000000          Pear      1    0       0
8      2.500000       86.000000         Apple      1    0       0
9      2.400000       68.000000         Lemon      1    0       0
10     4.200000      126.000000          Pear      1    0       0
11     3.700000      101.000000         Apple      0    1       0
12     3.700000      100.000000         Apple      0    1       0
13     3.400000       80.000000         Apple      0    1       0
14     3.2

In [12]:
df = normalise_minmax(df)

df.to_csv("normalized_dataset.csv", index=False)

print(df)
print(df.count())

    Radius (cm)  Weight (grams) Fruit (class)  Green  Red  Yellow
0      0.709677        0.616541          Pear      1    0       0
1      0.806452        0.842105          Pear      1    0       0
2      0.322581        0.338346         Apple      1    0       0
3      0.483871        1.000000          Pear      1    0       0
4      0.129032        0.187970         Lemon      1    0       0
5      0.129032        0.225564         Apple      1    0       0
6      0.193548        0.248120         Apple      1    0       0
7      0.774194        0.526316          Pear      1    0       0
8      0.129032        0.345865         Apple      1    0       0
9      0.096774        0.210526         Lemon      1    0       0
10     0.677419        0.646617          Pear      1    0       0
11     0.516129        0.458647         Apple      0    1       0
12     0.516129        0.451128         Apple      0    1       0
13     0.419355        0.300752         Apple      0    1       0
14     0.3