In [88]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [89]:
data=pd.read_csv("magic04.data",header=None)
data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g
5,51.624,21.1502,2.9085,0.242,0.134,50.8761,43.1887,9.8145,3.613,238.098,g
6,48.2468,17.3565,3.0332,0.2529,0.1515,8.573,38.0957,10.5868,4.792,219.087,g
7,26.7897,13.7595,2.5521,0.4236,0.2174,29.6339,20.456,-2.9292,0.812,237.134,g
8,96.2327,46.5165,4.154,0.0779,0.039,110.355,85.0486,43.1844,4.854,248.226,g
9,46.7619,15.1993,2.5786,0.3377,0.1913,24.7548,43.8771,-6.6812,7.875,102.251,g


#### **Balancing the data:**
    data is balanced by splitting the gamma and hadron samples then randomly sampling a number of gamma samples equal to that of the hadron samples and then concatinating both of them

In [90]:
gamma_data = data[data[10] == 'g'] 
hadron_data = data[data[10] == 'h']
gamma_data_balanced = gamma_data.sample(n=len(hadron_data), random_state=42)
balanced_data = pd.concat([gamma_data_balanced, hadron_data])
Features = balanced_data.drop([10], axis=1)  
Lables = balanced_data[10] 

#### Standardizing the features:
    Features are standardized with 0 mean and unity standard deviation so that results won't be biased towards large features in distance calculations

In [91]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Features_standardized= scaler.fit_transform(Features)

#### **Splitting the data:**
    the now balanced data is partitioned into three parts 70% for training,
    15% for validation and 15% for testing.

In [92]:
Features_train,Features_temp,Lables_train,Lables_temp=train_test_split(Features_standardized,Lables,test_size=0.3,random_state=42)
Features_valid,Features_test,Lables_valid,Lables_test=train_test_split(Features_temp,Lables_temp,test_size=0.5,random_state=42)

In [93]:
from sklearn.neighbors import KNeighborsClassifier
k_ranges=[1,5,11,15,19,25,35,50,60,75,85,100]
bestF1=0
best_K_F1=0
best_model_F1=None
for k in k_ranges:
    total=0
    print(f"RESULTS FOR K={k}:\n")
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(Features_train,Lables_train)
    predict_valid=knn_model.predict(Features_valid)
    print(f"prediction for k={k} :",predict_valid)

    accuracy=accuracy_score(Lables_valid,predict_valid)#correctly predicted/total 
    print(f"accuracy for k={k} :",accuracy)

    precision= precision_score(Lables_valid,predict_valid,pos_label="g")#tp/tp+fp
    print(f"precision for k={k}:",precision)
    
    recall=recall_score(Lables_valid,predict_valid,pos_label="g")#tp/tp+fn
    print(f"recall for k={k}",recall)

    f1 = f1_score(Lables_valid,predict_valid,pos_label="g")#2*tp/2*tp+fp+fn
    print(f"f1 for k={k}:",f1)

    cm=confusion_matrix(Lables_valid,predict_valid,labels=["g","h"])
    cm_df = pd.DataFrame(cm, index=["Actual Gamma (g)", "Actual Hadron (h)"], columns=["Predicted Gamma (g)", "Predicted Hadron (h)"])
    print("\n",cm_df)  
    
    if bestF1<f1:
        bestF1=f1
        best_K_F1=k
        best_model_F1=knn_model     

    print("---------------------------------------------------------------------------")
    

RESULTS FOR K=1:

prediction for k=1 : ['g' 'g' 'h' ... 'h' 'h' 'g']
accuracy for k=1 : 0.7931206380857427
precision for k=1: 0.7770582793709528
recall for k=1 0.8284023668639053
f1 for k=1: 0.801909307875895

                    Predicted Gamma (g)  Predicted Hadron (h)
Actual Gamma (g)                   840                   174
Actual Hadron (h)                  241                   751
---------------------------------------------------------------------------
RESULTS FOR K=5:

prediction for k=5 : ['g' 'g' 'h' ... 'g' 'h' 'g']
accuracy for k=5 : 0.8180458624127617
precision for k=5: 0.7834061135371179
recall for k=5 0.8846153846153846
f1 for k=5: 0.8309402501157943

                    Predicted Gamma (g)  Predicted Hadron (h)
Actual Gamma (g)                   897                   117
Actual Hadron (h)                  248                   744
---------------------------------------------------------------------------
RESULTS FOR K=11:

prediction for k=11 : ['g' 'g' 'h' ... '

In [94]:
print(f"Test results after tuning K according to F-score:\n")
predict_test=best_model_F1.predict(Features_test)
print(f"prediction for k={best_K_F1} :",predict_test)

accuracy=accuracy_score(Lables_test,predict_test)#correctly predicted/total 
print(f"accuracy for k={best_K_F1} :",accuracy)

precision= precision_score(Lables_test,predict_test,pos_label="g")#tp/tp+fp
print(f"precision for k={best_K_F1}:",precision)
    
recall=recall_score(Lables_test,predict_test,pos_label="g")#tp/tp+fn
print(f"recall for k={best_K_F1}",recall)

f1 = f1_score(Lables_test,predict_test,pos_label="g")#2*tp/2*tp+fp+fn
print(f"f1 for k={best_K_F1}:",f1)

cm=confusion_matrix(Lables_test,predict_test,labels=["g","h"])
cm_df = pd.DataFrame(cm, index=["Actual Gamma (g)", "Actual Hadron (h)"], columns=["Predicted Gamma (g)", "Predicted Hadron (h)"])
print("\n",cm_df) 
print("---------------------------------------------------------------------------")

Test results after tuning K according to F-score:

prediction for k=15 : ['g' 'g' 'h' ... 'h' 'g' 'g']
accuracy for k=15 : 0.8231190832087693
precision for k=15: 0.7796754910333049
recall for k=15 0.903960396039604
f1 for k=15: 0.8372306281522237

                    Predicted Gamma (g)  Predicted Hadron (h)
Actual Gamma (g)                   913                    97
Actual Hadron (h)                  258                   739
---------------------------------------------------------------------------


### **Comments**:

##### Accuracy:
    accuracy is improved by increasing K for lower values of K and peaks at K=15,then proceeds to steadily decline for the larger values of K which suggests that larger K values may be causing underfitting
##### Percision:
    percision remains relatively consisitent for the smaller values of K but shows a slight decrease for the larger values
##### Recall:
    recall consistently increases with the increase of K except for the very large values of K i.e:K>50 
##### F-Score:
    represents a balance of percision and recall by balancing minimizing false positives and false negatives and peaks at K=15
##### Conclusion:
    Choosing F-Score as the deciding factor in this classification problem is because of the importance of identifying gamma particles correctly by minimizing false positives and false negatives.
    15 is chosen for the K value after tuning for F-score.
    This yields the following test results:
* accuracy=82.3%
* percision=77.9%
* recall=90.3%
* F-Score=83.7%

        
