## 0. Импорт библиотек, определение функций

In [1]:
import pandas as pd

import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')

import numpy as np
from scipy.stats import kstest, spearmanr, mannwhitneyu, levene, kruskal

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.metrics import r2_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler,  PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, KFold, StratifiedKFold
import statsmodels.api as sm

import operator

from datetime import datetime

from collections import Counter

from matplotlib.colors import ListedColormap 

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report, auc, classification_report, precision_recall_curve, auc, classification_report
from sklearn.utils import shuffle
from sklearn.metrics import RocCurveDisplay, roc_curve, roc_auc_score

In [2]:
def get_spearman(data_1, data_2):
    correlation = spearmanr(data_1, data_2)
    return '{0:.2f}\n{1:.2f}'.format(*correlation)

In [3]:
def conf_matrix(fact, pred):
    conf_test = confusion_matrix(fact, pred, normalize='true')
    sns.heatmap(conf_test, annot=True, fmt='.2f', cmap='binary')
    plt.xlabel('Predicted', size=14)
    plt.ylabel('Actual', size=14)
    plt.show()

## 1. Импорт данных и предобработка

In [4]:
ds = pd.read_csv('data/spotify_data.csv', index_col='Unnamed: 0')
ds.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


Создание новых признаков: 
- возраст трека;
- длительность в минутах;
- популярность, как качественный показатель: если исходная популярность больше 50, то трек считается популярным.

In [5]:
len(ds)

1159764

In [6]:
ds['age'] = (datetime.today().year - ds['year']).astype('int')

In [7]:
ds['duration_min'] = ds['duration_ms']//1000//60

In [8]:
ds['popularity'] = ds['popularity'].apply(lambda x: 1 if x>=50 else 0)

Разделение признаков на группы.

In [9]:
categorial_data = ['popularity','artist_name', 'track_name', 'track_id', 'year', 'genre', 'key', 'mode','time_signature']
metric = ['danceability', 'energy', 'loudness','speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_min', 'age' ]

В качестве неинформативных категориальных признаков были выделены track_id, track_name и artist_name, поэтому они исключаются из дальнейшего рассмотрения

Обнаружена высокая достоверная корреляция между признаками:
energy и acousticness (-0,73)   
energy и loudness (0,73)   
energy и speechiness (0,36)   
loudness и acousticness (-0,57)   
danceability и valence (0,51)   
instrumentalness и valence (-0,35)   

   Выберем age, liveness, acousticness и valence

# Лучшая модель (из предыдущих)

In [10]:
from sklearn import svm
from sklearn.inspection import DecisionBoundaryDisplay
from pactools.grid_search import GridSearchCVProgressBar

In [19]:
ds_new = ds[['popularity', 'age', 'liveness']]

In [12]:
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
    ds_new.drop('popularity', axis =1), 
    ds_new['popularity'], 
    test_size=0.2, 
    random_state=21)

In [13]:
np.random.seed(666)
new_rand_index = np.random.choice(new_X_train.index,10000)
new_X_train_learn = new_X_train.loc[new_rand_index]
new_y_train_learn = new_y_train.loc[new_rand_index]

In [14]:
grid = {'C': [0.001, 0.1, 10, 100, 1000], 'kernel':['poly'], 'degree': [2,3]} #полином со степенями 2,3
model_svm = svm.SVC(probability = True, class_weight = 'balanced')
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
gs = GridSearchCVProgressBar(model_svm, grid, cv = skf, scoring = 'roc_auc', verbose=3)

In [15]:
gs.best_params_

AttributeError: 'GridSearchCVProgressBar' object has no attribute 'best_params_'

In [None]:
new_model = gs.best_estimator_ # удалил валидацию, чтобы не занимала места
new_model

In [16]:
new_model = svm.SVC(C = 1000, probability = True, class_weight = 'balanced',  degree =  2, kernel= 'poly')


In [17]:
new_model.fit(new_X_train_learn, new_y_train_learn)

In [18]:
y_approx_new = new_model.predict(new_X_train_learn)
y_pred_new = new_model.predict(new_X_test)

In [20]:
print(classification_report(new_y_test, y_pred_new))

              precision    recall  f1-score   support

           0       0.98      0.58      0.73    221444
           1       0.08      0.76      0.14     10509

    accuracy                           0.59    231953
   macro avg       0.53      0.67      0.44    231953
weighted avg       0.94      0.59      0.71    231953



In [None]:
probabilities_svc_new = new_model.predict_proba(new_X_train_learn)
pr_new, rec_new, th_new = precision_recall_curve(new_y_train_learn, probabilities_svc_new[:,1])
mod_n_auc = auc(rec_new[:-1], pr_new[:-1])

In [None]:
fpr_new, tpr_new, th_new = roc_curve(new_y_train_learn, probabilities_svc_new[:,1])
mod_new_roc_auc = auc(fpr_new[:-1], tpr_new[:-1])

# Метод ближайших соседей

In [36]:
from sklearn.neighbors import KNeighborsClassifier

1. Провести классификация с помощью метода ближайших соседей по 2-м признакам. Подобрать оптимальное значения гиперпараметров: количество соседей, функция оценки расстояния, функция для оценки влияния $i$-го соседа. 
Оценить качество полученной модели. Сравнить с лучшим результатом, достигнутым ранее.

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    ds_new.drop('popularity', axis =1), 
    ds_new['popularity'], 
    test_size=0.2, 
    random_state=21)

Т.к. нет class weight, то будем даунсэмплить обучаюущую выборку

In [31]:
train_data = pd.concat([X_train, y_train], axis = 1)
neg_class = train_data[train_data['popularity'] == 0]
pos_class = train_data[train_data['popularity'] == 1]

In [32]:
neg_downsampled = neg_class.sample(frac=0.05, random_state = 1) 
downsampled_data = np.concatenate((neg_downsampled, pos_class))

In [34]:
from pactools.grid_search import GridSearchCVProgressBar

In [45]:
grid = {'n_neighbors': [20, 70, 120, 200, 300, 500], 'weights':['distance', 'uniform', None]}
model_kn = KNeighborsClassifier()
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

In [46]:
gs = GridSearchCVProgressBar(model_kn, grid, cv = skf, scoring = 'roc_auc', verbose=3)

In [48]:
gs.fit(X_train, y_train) #забыл сделать валидацию по урезанным данным

Fitting 4 folds for each of 18 candidates, totalling 72 fits
[CV 1/4] END ..n_neighbors=20, weights=distance;, score=0.634 total time=   7.0s
[CV 2/4] END ..n_neighbors=20, weights=distance;, score=0.636 total time=   6.9s
[CV 3/4] END ..n_neighbors=20, weights=distance;, score=0.632 total time=   6.8s
[CV 4/4] END ..n_neighbors=20, weights=distance;, score=0.641 total time=   6.9s
[CV 1/4] END ...n_neighbors=20, weights=uniform;, score=0.643 total time=   6.8s
[CV 2/4] END ...n_neighbors=20, weights=uniform;, score=0.645 total time=   6.8s
[CV 3/4] END ...n_neighbors=20, weights=uniform;, score=0.641 total time=   7.0s
[CV 4/4] END ...n_neighbors=20, weights=uniform;, score=0.651 total time=   6.9s
[CV 1/4] END ......n_neighbors=20, weights=None;, score=0.643 total time=   6.8s
[CV 2/4] END ......n_neighbors=20, weights=None;, score=0.645 total time=   6.8s
[CV 3/4] END ......n_neighbors=20, weights=None;, score=0.641 total time=   6.8s
[CV 4/4] END ......n_neighbors=20, weights=None;

In [51]:
model = KNeighborsClassifier(n_neighbors=500)
model.fit(downsampled_data[:, :-1], downsampled_data[:, -1])

In [52]:
knn_pred_1 = model.predict(X_test)
target_names = new_y_train.unique()
print(classification_report(y_test, knn_pred_1, labels=target_names))



              precision    recall  f1-score   support

           0       0.98      0.69      0.81    221444
           1       0.09      0.68      0.16     10509

    accuracy                           0.69    231953
   macro avg       0.54      0.68      0.49    231953
weighted avg       0.94      0.69      0.78    231953



Сравнивая с моделью опорных векторов, метрика precision немного увеличилась, recall для класса 0 увеличился, но в то же время recall для класса 1 снизился.

# Улучшение модели

In [53]:
ds_n = ds[['popularity', 'age', 'liveness', 'tempo', 'mode', 'danceability']]

In [54]:
X_train, X_test, y_train, y_test = train_test_split(
    ds_n.drop('popularity', axis =1), 
    ds_n['popularity'], 
    test_size=0.2, 
    random_state=21)

In [55]:
train_data = pd.concat([X_train, y_train], axis = 1)
neg_class = train_data[train_data['popularity'] == 0]
pos_class = train_data[train_data['popularity'] == 1]
neg_downsampled = neg_class.sample(frac=0.05, random_state = 1) 
downsampled_data = np.concatenate((neg_downsampled, pos_class))

In [56]:
new_model = KNeighborsClassifier(n_neighbors=500)
new_model.fit(downsampled_data[:, :-1], downsampled_data[:, -1])

In [58]:
knn_pred_1 = new_model.predict(X_test)
target_names = new_y_train.unique()
print(classification_report(y_test, knn_pred_1, labels=target_names))



              precision    recall  f1-score   support

           0       0.98      0.67      0.80    221444
           1       0.09      0.69      0.16     10509

    accuracy                           0.67    231953
   macro avg       0.53      0.68      0.48    231953
weighted avg       0.94      0.67      0.77    231953



С добавлением признаков метрика precision никак не изменилась, метрика recall изменилась незначительно

# Выводы 

1) Метод ближайших соседей, с одной стороны, удобнее в реализации благодаря быстрой работе с большим количеством данных
2) С другой стороны, его применение не очень удобно для сильного дисбаланса классов (тк нет возможности учесть это сразу при построении модели)
3) Сильных изменений в метрике precision для класса 1 (которая являлась целевой для увеличения) не произошло. Необходимо дальше искать оптимальный набор факторов либо применять иную модель