In [49]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


Najlepsze wyniki na zbiorach walidacyjnych dały:
* Random Forest,
* Bagging,
* Regresja Logistyczna 3 stopnia,
* XGBoost,
* Catboost

In [50]:
test_set = pd.read_csv('../test_set.csv')
test_set = test_set.rename(columns={"Unnamed: 0": "index"}).set_index("index")
test_set.drop(['track_id', 'track_album_id', 'playlist_id'], axis = 1, inplace=True)
train_set = pd.read_csv('../train_set.csv')
train_set = train_set.rename(columns={"Unnamed: 0": "index"}).set_index("index")
train_set.drop(['track_id', 'track_album_id', 'playlist_id'], axis = 1, inplace=True)
train_set = train_set[train_set['duration_ms'] >= 30000]

In [51]:
y_train, y_test = train_set['playlist_genre'], test_set['playlist_genre']
X_train, X_test = train_set.loc[:, ['track_popularity', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
          'liveness', 'valence', 'tempo']], test_set.loc[:, ['track_popularity', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
          'liveness', 'valence', 'tempo']]

In [52]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

## Regresja Logistyczna

In [53]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [54]:
poly = PolynomialFeatures(3)
X_train_scaled_poly = poly.fit_transform(X_train_scaled)
X_test_scaled_poly = poly.transform(X_test_scaled)

In [55]:
log_reg = LogisticRegression(random_state=1, C = .1)
log_reg.fit(X_train_scaled_poly, y_train)



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
y_log_reg = log_reg.predict(X_test_scaled_poly)

In [57]:
np.mean(y_test == y_log_reg)

0.5191107050403533

## RandomForest

In [58]:
rf = RandomForestClassifier(random_state=1,
                           max_features='auto',
                           max_depth=None,
                           min_samples_leaf=10,
                           min_samples_split=2,
                           n_estimators=500)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [59]:
y_rf = rf.predict(X_test)

In [60]:
np.mean(y_test == y_rf)

0.5497182884117557

## Bagging

In [61]:
bag = BaggingClassifier(random_state=1, 
                       bootstrap=True,
                       bootstrap_features=False,
                       max_features=10,
                       max_samples=.25,
                       n_estimators=1000)
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=10, max_samples=0.25, n_estimators=1000,
                  n_jobs=None, oob_score=False, random_state=1, verbose=0,
                  warm_start=False)

In [62]:
y_bag = bag.predict(X_test)

In [63]:
np.mean(y_test == y_bag)

0.5509365006852444

## XGBoost

In [64]:
xgb = XGBClassifier(random_state=1,
                   booster='gbtree',
                   max_depth=6,
                   n_estimators=250)
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=250, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [65]:
y_xgb= xgb.predict(X_test)

In [66]:
np.mean(y_test == y_xgb)

0.5516978833561749

## Catboost 

In [67]:
cat = CatBoostClassifier(random_state=1,
                        learning_rate=.1,
                        max_depth=9,
                        n_estimators=250)
cat.fit(X_train, y_train)

0:	learn: 1.7154656	total: 239ms	remaining: 59.6s
1:	learn: 1.6568377	total: 486ms	remaining: 1m
2:	learn: 1.6074909	total: 738ms	remaining: 1m
3:	learn: 1.5657359	total: 974ms	remaining: 59.9s
4:	learn: 1.5285490	total: 1.22s	remaining: 59.6s
5:	learn: 1.4980072	total: 1.45s	remaining: 58.9s
6:	learn: 1.4733143	total: 1.68s	remaining: 58.3s
7:	learn: 1.4482319	total: 1.94s	remaining: 58.5s
8:	learn: 1.4254276	total: 2.17s	remaining: 58.1s
9:	learn: 1.4074307	total: 2.42s	remaining: 58s
10:	learn: 1.3897752	total: 2.65s	remaining: 57.6s
11:	learn: 1.3736768	total: 2.87s	remaining: 57s
12:	learn: 1.3593591	total: 3.1s	remaining: 56.5s
13:	learn: 1.3460419	total: 3.34s	remaining: 56.2s
14:	learn: 1.3317671	total: 3.58s	remaining: 56.1s
15:	learn: 1.3200614	total: 3.81s	remaining: 55.7s
16:	learn: 1.3098434	total: 4.04s	remaining: 55.4s
17:	learn: 1.2991941	total: 4.29s	remaining: 55.4s
18:	learn: 1.2893221	total: 4.54s	remaining: 55.2s
19:	learn: 1.2806877	total: 4.77s	remaining: 54.9s
2

161:	learn: 0.9251463	total: 36.9s	remaining: 20s
162:	learn: 0.9238460	total: 37.1s	remaining: 19.8s
163:	learn: 0.9222576	total: 37.3s	remaining: 19.6s
164:	learn: 0.9214694	total: 37.6s	remaining: 19.3s
165:	learn: 0.9202555	total: 37.7s	remaining: 19.1s
166:	learn: 0.9184739	total: 37.9s	remaining: 18.8s
167:	learn: 0.9167344	total: 38s	remaining: 18.6s
168:	learn: 0.9147980	total: 38.2s	remaining: 18.3s
169:	learn: 0.9131168	total: 38.5s	remaining: 18.1s
170:	learn: 0.9122598	total: 38.7s	remaining: 17.9s
171:	learn: 0.9109879	total: 38.9s	remaining: 17.6s
172:	learn: 0.9101002	total: 39.1s	remaining: 17.4s
173:	learn: 0.9091679	total: 39.4s	remaining: 17.2s
174:	learn: 0.9078162	total: 39.6s	remaining: 17s
175:	learn: 0.9066690	total: 39.8s	remaining: 16.7s
176:	learn: 0.9057865	total: 40s	remaining: 16.5s
177:	learn: 0.9045526	total: 40.3s	remaining: 16.3s
178:	learn: 0.9036104	total: 40.5s	remaining: 16.1s
179:	learn: 0.9020176	total: 40.7s	remaining: 15.8s
180:	learn: 0.900258

<catboost.core.CatBoostClassifier at 0x24d1b707848>

In [68]:
y_cat = cat.predict(X_test)

In [69]:
np.mean(y_test == y_cat)

0.1673708557832237

Najlepsze wyniki daje XGBoost (ale bardzo nieznacznie). Nie wiem czemu CatBoost daje takie słabe rezultaty.