In [38]:
# Importing necessary libraries
 
import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

#Reading dataset and preprocessing
Displaying the first few values and split the dataset into dependent and independent variables.

Converts labels(male/female) into numerical form

y values is the age. it is old(1) if age greater than 9, else young(0)

In [39]:
dataset = pd.read_csv('/content/abalone.csv')
print(dataset.head())
print(dataset.describe())

le = LabelEncoder()
dataset['sex'] = le.fit_transform(dataset['sex'])

X = dataset.iloc[:,:8].values
y = dataset.iloc[:,8].values
y = [1 if i>=9 else 0 for i in y]

  sex  length  diameter  ...  viscera weight  shell weight  rings
0   M   0.455     0.365  ...          0.1010         0.150     15
1   M   0.350     0.265  ...          0.0485         0.070      7
2   F   0.530     0.420  ...          0.1415         0.210      9
3   M   0.440     0.365  ...          0.1140         0.155     10
4   I   0.330     0.255  ...          0.0395         0.055      7

[5 rows x 9 columns]
            length     diameter  ...  shell weight        rings
count  4177.000000  4177.000000  ...   4177.000000  4177.000000
mean      0.523992     0.407881  ...      0.238831     9.933684
std       0.120093     0.099240  ...      0.139203     3.224169
min       0.075000     0.055000  ...      0.001500     1.000000
25%       0.450000     0.350000  ...      0.130000     8.000000
50%       0.545000     0.425000  ...      0.234000     9.000000
75%       0.615000     0.480000  ...      0.329000    11.000000
max       0.815000     0.650000  ...      1.005000    29.000000

[8 ro

Split the variables into training and test sets.

X values are independent variables and y values are the values to be predicted.

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

#Preprocessing Step

Normalising X values to be centered at 0.

In [41]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#Training and predicting
Training the dataset using a K Nearest Neighbours model with 5 neighbours and using Euclidean distance as measure.

Run model on the test set

In [42]:
knn_mod = KNeighborsClassifier(n_neighbors = 61, metric = 'minkowski', p = 2)
knn_mod.fit(X_train, y_train)

y_pred = knn_mod.predict(X_test)

#Evaluation Metrics
**Confusion matrix**

||Predicted 0|Predicted 1|
|---|---|---|
|Actual 0|TN|FP|
|Actual 1|FN|TP|

**Precision, Recall and f1-score**

$Precision = \frac{TP}{TP+FP}$

$Recall = \frac{TN}{TN+FP}$

$f1-score = \frac{2*precision*recall}{precision+recall}$

In [43]:
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

print('\nPrecision')
print(precision_score(y_test, y_pred))
print('\nRecall')
print(recall_score(y_test, y_pred))
print('\nf1-score')
print(f1_score(y_test, y_pred))

Confusion Matrix
[[260 116]
 [ 48 621]]

Precision
0.8426051560379919

Recall
0.9282511210762332

f1-score
0.8833570412517782
