In [88]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV, train_test_split
from scipy import stats
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [89]:
def remove_outliers(data: pd.DataFrame, ZSCORE_THREASHOLD: int = 4) -> pd.DataFrame:
    zscore = np.abs(stats.zscore(data.select_dtypes(include=["float", "int"])))
    is_inlier = ~ (zscore > ZSCORE_THREASHOLD).any(axis=1)
    data = data[is_inlier]
    return data

In [90]:
def pipe(X, y, n_splits: int, scoring: str, n_iter = None):

    pre_processing = Pipeline([
                    ('scaler', StandardScaler()),
                    ('pca', PCA())
                    ])
    
    
    main = Pipeline(steps=[('pre_processing', pre_processing),
                          ('knn', KNeighborsClassifier())
                        ])
                    
    param_grid = {'knn__n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31],
                'knn__weights': ['uniform', 'distance'],
                'knn__metric': ['euclidean', 'manhattan', 'minkowski'],
                'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                'knn__leaf_size': [10, 15, 20, 25, 30, 35, 40, 45],
                'knn__p': [3, 4],
                'pre_processing__pca__n_components': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True)
    
    if n_iter is None:
        grid = GridSearchCV(main, param_grid, cv=cv, scoring=scoring, n_jobs=4, verbose=3)
    else:
        grid = RandomizedSearchCV(main, param_grid, cv=cv, scoring=scoring, n_jobs=4, verbose=3, n_iter=n_iter)

    grid.fit(X, y)

    print(grid.best_params_)
    print(grid.best_score_)
    return grid

In [91]:
def get_grids(X, y) -> dict:
    grids = {}
    for splits in range(2, 16):
        for scoring in ['accuracy']:  # , 'precision', 'recall', 'f1'
            print(f'splits: {splits}, scoring: {scoring}')
            grids[(splits, scoring)] = pipe(X, y, splits, scoring, 100)
            print('----------------------------------------')
    return grids

In [92]:
def find_best_model(grids, scoring: str) -> tuple[int, str]:
    best_score = 0
    best_model = None
    best_key = (0, 'foo')   # To avoid PyLance warning
    for keys in grids.keys():
        if scoring in keys:
            if grids[keys].best_score_ > best_score:
                best_score = grids[keys].best_score_
                best_model = grids[keys].best_params_
                best_key = keys
    return best_key

In [93]:
data = pd.read_csv('project_train.csv')

In [94]:
data

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,Label
0,0.545,0.884,5,-4.807,0,0.3670,0.290000,0.000000,0.3700,0.641,86.049,1
1,0.795,0.545,7,-8.153,1,0.3430,0.003960,0.000000,0.2730,0.809,91.967,1
2,0.489,0.871,5,-5.825,1,0.3860,0.002850,0.000004,0.1300,0.341,117.431,1
3,0.539,0.931,4,-1.803,0,0.2620,0.000713,0.000000,0.2040,0.685,85.571,0
4,0.918,0.734,11,-2.832,0,0.2690,0.029400,0.000008,0.1910,0.608,97.044,1
...,...,...,...,...,...,...,...,...,...,...,...,...
500,0.897,0.612,11,-10.489,1,0.2110,0.029700,0.186000,0.0894,0.767,102.305,1
501,0.728,0.454,0,-9.281,1,0.0278,0.512000,0.000001,0.0831,0.323,130.368,0
502,0.571,0.837,0,-5.604,1,0.0377,0.165000,0.000005,0.1700,0.713,141.660,1
503,0.582,0.720,10,-9.722,0,0.2310,0.012700,0.035300,0.3630,0.541,89.273,1


In [95]:
data.describe()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,Label
count,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0
mean,0.53396,2.084912,5.150495,-21.746783,0.667327,0.104032,0.316158,0.186913,0.212621,0.473778,114.0456,0.50099
std,0.188161,32.635821,3.665736,290.809581,0.471638,0.113225,0.370039,0.344825,0.200281,0.268268,27.313063,0.500495
min,0.0849,0.00979,0.0,-6542.0,0.0,0.0226,2.4e-05,0.0,0.0224,0.0242,44.777,0.0
25%,0.371,0.43,1.0,-9.844,0.0,0.0361,0.0201,0.0,0.0934,0.237,92.163,0.0
50%,0.557,0.734,5.0,-5.852,1.0,0.0467,0.116,4.1e-05,0.134,0.474,113.122,1.0
75%,0.686,0.875,8.0,-4.436,1.0,0.135,0.594,0.0857,0.265,0.699,130.008,1.0
max,0.94,734.0,11.0,4.331,1.0,0.925,0.996,0.994,0.992,0.969,210.752,1.0


In [96]:
data.drop_duplicates(inplace=True)

In [97]:
data.describe()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,Label
count,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0
mean,0.534739,2.102339,5.168337,-21.903176,0.667335,0.104115,0.316099,0.185827,0.213416,0.474078,113.963415,0.498998
std,0.187984,32.831427,3.658038,292.551358,0.471641,0.113547,0.369128,0.344095,0.201025,0.267963,27.402673,0.500501
min,0.0849,0.00979,0.0,-6542.0,0.0,0.0226,2.4e-05,0.0,0.0224,0.0242,44.777,0.0
25%,0.3745,0.432,1.0,-9.8135,0.0,0.03585,0.02115,0.0,0.0941,0.2375,92.0895,0.0
50%,0.558,0.733,5.0,-5.878,1.0,0.0465,0.117,4.1e-05,0.134,0.474,112.966,0.0
75%,0.686,0.875,8.0,-4.483,1.0,0.136,0.5935,0.0843,0.266,0.6995,130.005,1.0
max,0.94,734.0,11.0,4.331,1.0,0.925,0.996,0.994,0.992,0.969,210.752,1.0


In [98]:
data.shape

(499, 12)

In [99]:
data = remove_outliers(data)

In [100]:
data.describe()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,Label
count,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0
mean,0.533945,0.632989,5.195565,-8.809054,0.667339,0.102651,0.315484,0.186951,0.212649,0.471984,113.816567,0.497984
std,0.188211,0.303152,3.652045,7.093201,0.471642,0.107689,0.369593,0.344831,0.200337,0.267291,27.385609,0.500501
min,0.0849,0.00979,0.0,-36.252,0.0,0.0226,2.4e-05,0.0,0.0224,0.0242,44.777,0.0
25%,0.37025,0.433,1.0,-9.73725,0.0,0.0358,0.020325,0.0,0.0942,0.23675,92.01575,0.0
50%,0.5565,0.733,5.0,-5.865,1.0,0.04645,0.116,4.2e-05,0.134,0.471,112.433,0.0
75%,0.686,0.875,8.0,-4.46275,1.0,0.1355,0.59325,0.085775,0.26125,0.69325,129.9945,1.0
max,0.94,0.997,11.0,4.331,1.0,0.517,0.996,0.994,0.992,0.969,210.752,1.0


In [101]:
data.shape

(496, 12)

In [62]:
X = data.drop(columns=["Label"])
y = data["Label"]

In [63]:
grids = get_grids(X, y)

splits: 2, scoring: accuracy
Fitting 2 folds for each of 100 candidates, totalling 200 fits
{'pre_processing__pca__n_components': 10, 'knn__weights': 'distance', 'knn__p': 4, 'knn__n_neighbors': 9, 'knn__metric': 'euclidean', 'knn__leaf_size': 35, 'knn__algorithm': 'ball_tree'}
0.810483870967742
----------------------------------------
splits: 3, scoring: accuracy
Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'pre_processing__pca__n_components': 10, 'knn__weights': 'uniform', 'knn__p': 4, 'knn__n_neighbors': 13, 'knn__metric': 'minkowski', 'knn__leaf_size': 30, 'knn__algorithm': 'auto'}
0.8144943410003651
----------------------------------------
splits: 4, scoring: accuracy
Fitting 4 folds for each of 100 candidates, totalling 400 fits
{'pre_processing__pca__n_components': 8, 'knn__weights': 'distance', 'knn__p': 3, 'knn__n_neighbors': 13, 'knn__metric': 'manhattan', 'knn__leaf_size': 35, 'knn__algorithm': 'ball_tree'}
0.8125
----------------------------------------
s

In [64]:
best_key = find_best_model(grids, 'accuracy')

In [65]:
best_model = grids[best_key]

In [67]:
print('The best model in terms of highest accuracy score is:')
print()

for param in best_model.best_params_:
    print(f'{param}: {best_model.best_params_[param]}')
print()

print(f'Number of folds: {best_key[0]}')
print(f'Highest accuracy: {best_model.best_score_}')

The best model in terms of highest accuracy score is:

pre_processing__pca__n_components: 9
knn__weights: distance
knn__p: 3
knn__n_neighbors: 7
knn__metric: euclidean
knn__leaf_size: 10
knn__algorithm: brute

Number of folds: 13
Highest accuracy: 0.83053046818229


In [69]:
X_test = pd.read_csv('project_test.csv')

In [70]:
predictions = best_model.predict(X_test)

In [71]:
predictions

array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0], dtype=int64)