# Task 1: Balancing the Dataset

In [3]:
import pandas as pd

# Load the dataset
data = pd.read_csv('magic04.data')

# Rename the last column "column 10" as 'class'
data.rename(columns={data.columns[10]: 'class'}, inplace=True)

# Calculate the number of samples in each class
# gamma -> signal
# hadrons -> background
num_gamma = (data['class'] == 'g').sum()
num_hadron = (data['class'] == 'h').sum()

# Take random samples for each class (gamma / hadron) with the number of samples = number of hadron (as it's lower than number of samples of gamma, so we took the minimum)
data_gamma = data[data['class'] == 'g'].sample(n=num_hadron, random_state=42)
data_hadron = data[data['class'] == 'h'].sample(n=num_hadron, random_state=42)

# Combine the balanced datasets
balanced_data = pd.concat([data_gamma, data_hadron])

# Now, balanced_data contains an equal number of "gamma" and "hadron" samples
print("Number of gamma samples is",len(data_gamma))
print("Number of hadron samples is",len(data_hadron))
print("Number of balanced samples is",len(balanced_data))



Number of gamma samples is 6688
Number of hadron samples is 6688
Number of balanced samples is 13376


# Task 2: Data Splitting

In [4]:
from sklearn.model_selection import train_test_split

# Split the dataset randomly into training (70%), validation (15%), and testing (15%)
train_data, temp_data = train_test_split(balanced_data, test_size=0.3, random_state=42,stratify=balanced_data['class'])
valid_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42,stratify=temp_data['class'])
# Stratify is used to ensure that the class distribution in the original dataset according to labels: g/h

# Define your features (x) and target (y) for each set
x_train = train_data.drop(columns=['class'])
y_train = train_data['class']

x_valid = valid_data.drop(columns=['class'])
y_valid = valid_data['class']

x_test = test_data.drop(columns=['class'])
y_test = test_data['class']

# Task 3: Applying K-NN Classifier

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the K-NN Classifier
knn = KNeighborsClassifier()

# Train the model on the training data
knn.fit(x_train, y_train)

# Task 4: Trying Different k Values

In [17]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
best_k = None
best_accuracy = 0

for k in range(1, 10 ,2): # k = 1 , 3 , 5 , 7 , 9
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)

    y_predictions = knn.predict(x_valid)
    accuracy = accuracy_score(y_valid, y_predictions)

    # getting best k & accuracy using validation dataset
    if accuracy > best_accuracy:
        best_k = k
        best_accuracy = accuracy

    print("K =",k)
    print(classification_report(y_valid, y_predictions))
    print("confusion matrix: \n",confusion_matrix(y_valid, y_predictions))
    print("=====================================================")
print("The best K in this range is",best_k)
print("The best accuracy is",best_accuracy)

K = 1
              precision    recall  f1-score   support

           g       0.72      0.77      0.74      1003
           h       0.75      0.70      0.73      1003

    accuracy                           0.74      2006
   macro avg       0.74      0.74      0.74      2006
weighted avg       0.74      0.74      0.74      2006

confusion matrix: 
 [[772 231]
 [298 705]]
K = 3
              precision    recall  f1-score   support

           g       0.73      0.81      0.77      1003
           h       0.78      0.70      0.74      1003

    accuracy                           0.75      2006
   macro avg       0.76      0.75      0.75      2006
weighted avg       0.76      0.75      0.75      2006

confusion matrix: 
 [[811 192]
 [302 701]]
K = 5
              precision    recall  f1-score   support

           g       0.73      0.83      0.78      1003
           h       0.80      0.70      0.75      1003

    accuracy                           0.76      2006
   macro avg       0.77 