In [22]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Dataset

blog_data = pd.read_csv("./Dataset/blogdata-final.csv")
print(blog_data.shape)
energy_data = pd.read_csv("./Dataset/energydata-final.csv")
energy_data.drop(columns=["date"], inplace=True) # to eliminate time-series nature of the data
print(energy_data.shape)
musk_data = pd.read_csv("./Dataset/muskdata-final.csv")
print(musk_data.shape)
student_data = pd.read_csv("./Dataset/student-final.csv")
print(student_data.shape)
superconductivity_data = pd.read_csv("./Dataset/superconductivity-final.csv")
print(superconductivity_data.shape)

(52397, 281)
(19735, 29)
(476, 169)
(649, 31)
(21263, 81)


In [35]:
def train_test_split_pieces(data):
    X = data.drop(columns="target")
    y = data["target"]

    if y.nunique() == 3:
        label1, label2, label3 = tuple(y.unique())
        y = y.map({label1: 0, label2: 1, label3: 2})
    elif y.unique() == 2:
        label1, label2 = tuple(y.unique())
        y = y.map({label1: 0, label2: 1})

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=6969, shuffle=True, stratify=y)

    return X_train, X_test, y_train, y_test

# kNN Algorithm

In [41]:
X_train, X_test, y_train, y_test = train_test_split_pieces(student_data)
X = pd.concat([X_train, X_test], ignore_index=True)
X.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,M,17,U,GT3,T,2,3,other,other,...,yes,yes,no,5,2,2,1,1,2,2
1,GP,M,16,R,GT3,A,4,4,other,teacher,...,yes,yes,yes,2,4,3,1,1,5,4
2,MS,M,17,R,LE3,T,2,2,services,services,...,yes,no,no,1,3,5,3,5,3,2
3,GP,M,15,U,LE3,T,1,2,other,at_home,...,yes,yes,no,4,3,2,1,1,5,0
4,GP,F,16,U,GT3,T,3,1,services,other,...,yes,yes,no,4,3,3,1,2,5,0


In [50]:
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import numpy as np

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns),
    ]
)

trials = []
for k in (3, 4, 5, 6, 7):
    model = make_pipeline(preprocessor, KNeighborsClassifier(n_neighbors=k))
    scores = np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy"))
    trials.append([k, scores])

model = make_pipeline(preprocessor, KNeighborsClassifier(n_neighbors=4))
model.fit(X_train, y_train)
model.predict(X_test)

array([2, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2,
       0, 2, 1, 2, 2, 2, 2, 1, 0, 0, 1, 2, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2,
       0, 2, 2, 1, 0, 0, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0, 1, 1, 2, 0, 1, 0,
       2, 1, 0, 0, 1, 2, 0, 2, 2, 0, 2, 1, 1, 2, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 2, 1, 1, 0, 2, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 2, 1, 1, 1,
       2, 2, 1, 0, 2, 1, 2, 0, 1, 0, 2, 1, 0, 1, 1, 2, 1, 2, 2, 0, 1, 2,
       2, 1, 1, 0, 2, 2, 1, 0, 0, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 0, 2, 0,
       0, 2, 0, 1, 1, 0, 0, 2, 1, 2, 1, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1,
       2, 1, 2, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 1, 0, 2, 0, 1, 1, 1, 0, 1,
       1, 2, 2, 2, 2, 2, 0, 1, 1, 1, 1, 0, 0, 2, 1, 2, 0, 2, 2, 1, 0, 1,
       0, 1, 2, 0, 1, 2, 2, 2, 1, 1, 2, 0, 2, 0, 0, 1, 1, 2, 0, 1, 0, 2,
       1, 0, 1, 1, 1, 1, 0, 1, 2, 2, 1, 1, 2, 0, 1, 0, 0, 0, 1, 2, 1, 0,
       0, 1, 0, 2, 2, 1, 0, 0, 1, 0, 2, 1, 2, 2, 2, 1, 2, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 2, 1, 1, 0, 1, 1, 1, 1, 1, 0, 2,