In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
######################################################################################
# Reading Data
######################################################################################

# Load the raw CSV file into a DataFrame.
df = pd.read_csv("../0000_Data/01-raw/01-Ad.csv")

# Feature columns are taken by position (3 and 4); the target is the last column.
# NOTE(review): the slice 3:5 stops before position 5, so if the CSV has exactly
# five columns the feature matrix also contains the last column, i.e. the target
# itself -- confirm the column layout of 01-Ad.csv.
# .to_numpy() is used instead of .values, as the pandas documentation recommends.
x_values = df.iloc[:, 3:5].to_numpy()
y_values = df.iloc[:, -1].to_numpy()

# ==============================================================================
# Step: Reading and Preparing Data
# ==============================================================================
# - Features (x_values): 2D NumPy array (matrix) containing independent
#   variables (input data used for prediction).
# - Target (y_values): 1D NumPy array (vector) containing the dependent
#   variable (what we want to predict).
#
# Note: .to_numpy() converts the selected DataFrame columns into NumPy arrays,
# a format that Scikit-learn models accept as input.
# ==============================================================================

In [None]:
######################################################################################
# Preprocessing
######################################################################################

# Train/test split: hold out 25% of the rows, stratified on the target,
# with a fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_values,
    y_values,
    test_size=0.25,
    random_state=42,
    stratify=y_values,
)

# ==============================================================================
# Step: Splitting Data into Training and Testing Sets
# ==============================================================================
# - x_train/y_train: Input features and targets used to "teach" the model.
# - x_test/y_test: Unseen input features and targets used to evaluate
#                  model performance ("final exam").
#
# Parameters:
# - test_size=0.25: 75% of data for training, 25% for testing.
# - stratify=y_values: Crucial for imbalanced data. Ensures the proportion of
#               classes (e.g., 90% No, 10% Yes) is (approximately) the same
#               in both training and testing sets.
# ==============================================================================

# Scaling Features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on the training split only and scale that split in the same call.
x_train = scaler.fit_transform(x_train)
# Scale the test split with the already-fitted scaler; it is not re-fitted here.
x_test = scaler.transform(x_test)

# ==============================================================================
# Step: Feature Scaling - Training vs Testing
# ==============================================================================
# Goal: Scale data to have mean 0 and std 1 (Standard Scaling).
#
# Procedure:
# - fit_transform(x_train): Learns parameters (mean/std) from training data.
# - transform(x_test): Applies THOSE SAME parameters to test data.
#
# CRITICAL RULE: NEVER fit on test data.
# Reason: Prevents "Data Leakage" - ensuring the model does not indirectly
# "see" the test data during training, which would make evaluation results
# falsely optimistic.
# ==============================================================================

In [None]:
######################################################################################
# Model Training
######################################################################################

from sklearn.neighbors import KNeighborsClassifier

# Build a 7-nearest-neighbours classifier and train it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
KNN = KNeighborsClassifier(n_neighbors=7).fit(x_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
KNN