In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,classification_report,accuracy_score

# Loading Data

In [None]:
# Column names are passed explicitly via `names`, so pandas treats every row
# of the file as data: ten feature columns (x1..x10) followed by the label.
column_names = [f"x{i}" for i in range(1, 11)] + ["label"]
df = pd.read_csv("/content/magic04.data", names=column_names)

## Data Preprocessing

In [None]:
# Tally each class once and read both counts ('g' and 'h') from that result
label_counts = df['label'].value_counts()
g_count = label_counts['g']
h_count = label_counts['h']
print(f'Sum of gammas= {g_count} / Sum of hadrons= {h_count}')

Sum of gammas= 12332 / Sum of hadrons= 6688


In [None]:
# Balance the classes by undersampling: keep every hadron ('h') row and draw
# an equally sized random subset of gamma ('g') rows without replacement.
hadron_data = df[df['label'] == 'h']
gamma_data = df[df['label'] == 'g'].sample(
    n=len(hadron_data),   # sized from the hadron frame itself, not a variable from an earlier cell
    replace=False,
    random_state=42,      # fixed seed so Restart & Run All selects the same rows
)

# Re-count both classes to confirm they are now the same size
g_count = len(gamma_data)
h_count = len(hadron_data)
print(f'Sum of new gammas= {g_count} / Sum of hadrons= {h_count}')

Sum of new gammas= 6688 / Sum of hadrons= 6688


In [None]:
# Stack the undersampled gammas on top of the hadrons to form one balanced frame
data = [gamma_data, hadron_data]
telescope_data = pd.concat(data, axis=0)

# Report the shape, then preview the first five rows
print(f"Shape: {telescope_data.shape}")
telescope_data.head(5)

Shape: (13376, 11)


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,label
48,63.2886,26.427,3.4434,0.2342,0.1273,-27.3566,54.6506,19.7834,0.9753,302.897,g
3589,20.4669,12.8241,2.3971,0.5451,0.2826,-11.1968,-5.9458,7.7056,26.0698,194.077,g
5436,30.1806,12.909,2.4771,0.3833,0.195,11.203,20.5384,-5.1373,11.423,86.461,g
11145,32.7548,20.0059,2.8558,0.3331,0.1735,4.8915,-31.9813,-11.4819,2.2496,221.213,g
12212,19.7327,7.409,2.1021,0.7194,0.4229,0.4781,-5.253,-5.0183,89.3153,98.6568,g


In [None]:
# Shuffle every row (fixed seed for reproducibility) and renumber the index
# from 0 so the pre-shuffle row labels are discarded
telescope_data = (
    telescope_data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
telescope_data.shape

(13376, 11)

In [None]:
# Separate the target column from the predictors (every column except 'label')
label = telescope_data['label']
features = telescope_data.drop(columns=['label'])
features.shape, label.shape

((13376, 10), (13376,))

## Splitting the data 70-15-15

In [None]:
# Hold out 30% of the rows; the remaining 70% is the training set.
# Stratifying on the label keeps the g/h ratio the same in both parts, so the
# balance created by undersampling is not lost to an unlucky random split.
X_train, x_val_test, Y_train, y_val_test = train_test_split(
    features, label, test_size=0.3, random_state=42, stratify=label
)

# Halve the held-out 30% into validation (15%) and test (15%), again stratified
X_val, X_test, Y_val, Y_test = train_test_split(
    x_val_test, y_val_test, test_size=0.5, random_state=42, stratify=y_val_test
)


# Building the model

In [None]:
# k-nearest-neighbours classifier with library defaults; only k is tuned below.
# NOTE(review): the features are passed in unscaled in this notebook; KNN is
# distance-based, so consider standardising them — confirm before changing.
knn = KNeighborsClassifier()

# Candidate values of k: every integer from 1 to 114
param_grid = {'n_neighbors': np.arange(1, 115)}

# 5-fold cross-validated search over k, using the training split only
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid_search.fit(X_train, Y_train)

# Order every candidate by mean cross-validation score, best first,
# and keep the three highest
cv_table = pd.DataFrame(grid_search.cv_results_)
results = cv_table.sort_values(by='mean_test_score', ascending=False)
top_3_results = results.head(3)[['params', 'mean_test_score']]

# Print the three best values of k together with their scores
for _, row in top_3_results.iterrows():
    print(f"Hyperparameters: {row['params']}, Mean Test Score: {row['mean_test_score']}")

Hyperparameters: {'n_neighbors': 9}, Mean Test Score: 0.7761409891348492
Hyperparameters: {'n_neighbors': 7}, Mean Test Score: 0.7761400764804396
Hyperparameters: {'n_neighbors': 13}, Mean Test Score: 0.776034094487111


In [None]:
# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on the full training set, so reuse that fitted model instead
# of training an identical classifier a second time.
best_k = grid_search.best_params_['n_neighbors']
best_knn = grid_search.best_estimator_

# Accuracy of the selected model on the held-out validation split
accuracy = best_knn.score(X_val, Y_val)
print(f"Accuracy on Validation Set with Best k={best_k}: {accuracy:.4f}")

Accuracy on Validation Set with Best k=9: 0.7742


# Testing the model

In [None]:
# Predict labels for the test split
y_pred = best_knn.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes
conf_mat = confusion_matrix(y_true=Y_test, y_pred=y_pred)
print(conf_mat)

[[829 145]
 [343 690]]


## Model Report

In [None]:
# Per-class precision, recall and F1 score on the test split
report = classification_report(y_true=Y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           g       0.71      0.85      0.77       974
           h       0.83      0.67      0.74      1033

    accuracy                           0.76      2007
   macro avg       0.77      0.76      0.76      2007
weighted avg       0.77      0.76      0.76      2007

