# **Data manipulation libraries:**

In [225]:
import numpy as np
import pandas as pd

# **Visualization libraries:**


In [226]:
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.image as mpimg
%matplotlib inline
import tabulate


# **ML libraries:**


In [227]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split        #data splitting


# **Load Dataset:**

In [228]:
# Read the header-less 'magic04.data' file into a DataFrame named df,
# supplying the column names explicitly (ten features plus the class label).
column_names = [
    'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
    'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
    'class',
]
df = pd.read_csv('magic04.data', names=column_names)

In [229]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(19020, 11)

In [230]:
# Preview the first five rows of the dataset
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [231]:
# Summarise every column: index position, name, non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19020 entries, 0 to 19019
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   fLength   19020 non-null  float64
 1   fWidth    19020 non-null  float64
 2   fSize     19020 non-null  float64
 3   fConc     19020 non-null  float64
 4   fConc1    19020 non-null  float64
 5   fAsym     19020 non-null  float64
 6   fM3Long   19020 non-null  float64
 7   fM3Trans  19020 non-null  float64
 8   fAlpha    19020 non-null  float64
 9   fDist     19020 non-null  float64
 10  class     19020 non-null  object 
dtypes: float64(10), object(1)
memory usage: 1.6+ MB


In [232]:
 # Total number of missing values across all columns (0 means nothing is missing).
# The value is printed explicitly: a bare expression is only displayed when it
# is the last line of a cell, so the count used to be computed and then discarded.
null_count = df.isnull().sum().sum()
print("No. of missing values = ", null_count)

# Count fully duplicated rows, report the count, then drop them from df.
duplicates_no = df.duplicated().sum()
print("No. of duplicated rows = ", duplicates_no)
df = df.drop_duplicates()




No. of duplicated rows =  115


In [233]:
# Frame dimensions after the duplicated rows were dropped
print(df.shape)


(18905, 11)


In [234]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist
count,18905.0,18905.0,18905.0,18905.0,18905.0,18905.0,18905.0,18905.0,18905.0,18905.0
mean,53.161416,22.145872,2.824643,0.380247,0.21456,-4.177867,10.618826,0.259364,27.551644,193.712554
std,42.259789,18.300664,0.472377,0.182709,0.110384,59.010059,50.900687,20.775268,26.083055,74.685712
min,4.2835,0.0,1.9413,0.0131,0.0003,-457.9161,-331.78,-205.8947,0.0,1.2826
25%,24.3597,11.8742,2.4771,0.2358,0.1285,-20.4791,-12.7693,-10.8358,5.5164,142.269
50%,37.1295,17.1438,2.74,0.354,0.1964,4.0629,15.338,0.75,17.533,191.832
75%,69.9754,24.7124,3.1011,0.5035,0.285,24.1335,35.8694,10.9489,45.704,240.409
max,334.177,256.382,5.3233,0.893,0.6752,575.2407,238.321,179.851,90.0,495.561


# **1-Data Balancing**

In [235]:
# Check whether the two classes are balanced by counting the rows of each.
# Result: class "g" has more rows than class "h", so the data needs to be
# resampled down to the size of the smaller class.
g_class = df.loc[df['class'].eq('g')]
h_class = df.loc[df['class'].eq('h')]

g_count = g_class['class'].count()
h_count = h_class['class'].count()
print(g_count, h_count, sep='\n')


12332
6573


In [236]:
# Load the balanced dataset if it has already been generated; otherwise build it
# by randomly down-sampling the 'g' rows to match the number of 'h' rows.
# A fixed random_state makes the sample reproducible, and the result is saved to
# 'balanced magic.csv' (without an index column) so later runs reuse the same rows
# instead of failing on a missing file.
try:
    ds = pd.read_csv('balanced magic.csv')
except FileNotFoundError:
    g_class_under = g_class.sample(h_count, random_state=42)
    # ignore_index gives the same 0..n-1 index the CSV round-trip would produce
    ds = pd.concat([g_class_under, h_class], axis=0, ignore_index=True)
    ds.to_csv('balanced magic.csv', index=False)

In [237]:
# Re-count both classes in the resampled frame to confirm they are now balanced
g_class = ds.loc[ds['class'].eq('g')]
h_class = ds.loc[ds['class'].eq('h')]
g_count = g_class['class'].count()
h_count = h_class['class'].count()
print(g_count, h_count, sep='\n')



6573
6573


# **2-Split dataset**

In [238]:
# Separate the target labels from the feature columns (everything except 'class')
y = ds['class']
x = ds.drop('class', axis=1)


In [239]:
# First split: 70% of the rows for training, the remaining 30% held out
X_train_temp, X_test_temp, Y_train_temp, Y_test_temp = train_test_split(
    x, y, train_size=0.7, random_state=42
)
# Second split: halve the held-out part into validation and test sets
# (each 15% of the full dataset)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_temp, Y_test_temp, test_size=0.5, random_state=42
)

# **3-Apply K-NN Classifier**

In [240]:
# Candidate neighbour counts to try: the odd numbers from 1 to 19
k_values = list(range(1, 20, 2))

# Best k found so far together with the scores it achieved
best_k = None
best_accuracy = best_precision = best_recall = best_f1 = 0

# Score history, one entry appended per evaluated k
accuracy_values, precision_values, recall_values, f1_values = [], [], [], []
confusion_matrices = []

# **4-Apply different k values to get the best results.**

In [241]:
# Evaluate every candidate k on the validation split and keep the most accurate one.
# The accumulators are (re)initialised here so that re-running this cell does not
# append a second copy of every score or compare against a stale best_accuracy.
best_k = None
best_accuracy = best_precision = best_recall = best_f1 = 0
accuracy_values, precision_values, recall_values, f1_values = [], [], [], []
confusion_matrices = []

for k in k_values:
    # Create a KNN classifier with the current k value
    knn = KNeighborsClassifier(n_neighbors=k)

    # Fit the classifier to the training data
    knn.fit(X_train_temp, Y_train_temp)

    # Make predictions on the validation data (the test split is kept for the final check)
    y_pred = knn.predict(X_val)

    # Calculate the evaluation metrics (class-weighted averages)
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    f1 = f1_score(y_val, y_pred, average='weighted')
    confusion_matrix_result = confusion_matrix(y_val, y_pred)

    # Append the metrics to the respective lists
    accuracy_values.append(accuracy)
    precision_values.append(precision)
    recall_values.append(recall)
    f1_values.append(f1)
    confusion_matrices.append(confusion_matrix_result)

    # Strict '>' means that on ties the smallest k reaching the best accuracy wins
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k
        best_precision = precision
        best_recall = recall
        best_f1 = f1

print(f"The best k value is {best_k} with the following metrics")
print(f"Accuracy: {best_accuracy:.9f}")
print(f"Precision: {best_precision:.9f}")
print(f"Recall: {best_recall:.9f}")
print(f"F1 Score: {best_f1:.9f}")

The best k value is 7 with the following metrics
Accuracy: 0.782961460
Precision: 0.791742044
Recall: 0.782961460
F1 Score: 0.781546828


# **5-Report all of your trained model accuracy, precision, recall and f-score as well as confusion matrix.**


In [242]:
# Build a table with the validation metrics and confusion matrix of every candidate k.
evaluation_data = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit the classifier to the training data
    knn.fit(X_train_temp, Y_train_temp)

    # Make predictions on the validation data
    y_pred = knn.predict(X_val)

    # Calculate the evaluation metrics (class-weighted averages)
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    f1 = f1_score(y_val, y_pred, average='weighted')

    # Calculate the confusion matrix
    confusion_matrix_result = confusion_matrix(y_val, y_pred)

    # One row per k: the scores followed by the confusion matrix
    evaluation_data.append([k, accuracy, precision, recall, f1, confusion_matrix_result])

# Stored under its own name so the raw dataset held in `df` is not overwritten
# (overwriting it would break re-running the earlier data-exploration cells).
metrics_df = pd.DataFrame(
    evaluation_data,
    columns=["K Value", "Accuracy", "Precision", "Recall", "F1-Score", "Confusion Matrix"],
)

# Print the metrics table
print(tabulate.tabulate(metrics_df, headers="keys", tablefmt="grid"))


+----+-----------+------------+-------------+----------+------------+--------------------+
|    |   K Value |   Accuracy |   Precision |   Recall |   F1-Score | Confusion Matrix   |
|  0 |         1 |   0.743915 |    0.746526 | 0.743915 |   0.743409 | [[773 204]         |
|    |           |            |             |          |            |  [301 694]]        |
+----+-----------+------------+-------------+----------+------------+--------------------+
|  1 |         3 |   0.768256 |    0.773985 | 0.768256 |   0.767249 | [[818 159]         |
|    |           |            |             |          |            |  [298 697]]        |
+----+-----------+------------+-------------+----------+------------+--------------------+
|  2 |         5 |   0.773834 |    0.782154 | 0.773834 |   0.772397 | [[837 140]         |
|    |           |            |             |          |            |  [306 689]]        |
+----+-----------+------------+-------------+----------+------------+--------------------+


### Model Comparison
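- On the validation set, the performance of the K-NN classifier improves as 'k' increases from 1 to 7; none of the larger 'k' values tried achieves a higher accuracy, indicating that considering more neighbors in the decision-making process benefits the model only up to a point.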
- K-NN with 'k=7' achieves the highest accuracy and F1-Score, making it the preferred choice for this problem.
- Precision and recall remain consistent across different 'k' values, demonstrating balanced performance.
- The confusion matrices reveal a clear understanding of true positives (TP), false positives (FP), false negatives (FN), and true negatives (TN), which can guide further analysis and decision-making.



In [243]:
# With the best k found during validation, evaluate once on the held-out test split
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_temp, Y_train_temp)

# Predict labels for the test split
y_pred = knn.predict(X_test)

# Score the predictions (precision, recall and F1 are class-weighted averages)
accuracy1 = accuracy_score(y_test, y_pred)
precision1, recall1, f11 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
confusion_matrix_result1 = confusion_matrix(y_test, y_pred)

print(
    f"The best k value is {best_k} with the following metrics",
    f"Accuracy: {accuracy1:.9f}",
    f"Precision: {precision1:.9f}",
    f"Recall: {recall1:.9f}",
    f"F1 Score: {f11:.9f}",
    f"Confusion Matrix: {confusion_matrix_result1}",
    sep="\n",
)


The best k value is 7 with the following metrics
Accuracy: 0.765720081
Precision: 0.770981128
Recall: 0.765720081
F1 Score: 0.763281543
Confusion Matrix: [[881 152]
 [310 629]]
