In [1]:
import mlflow.sklearn
from mlflow import log_param
from mlflow.tracking import MlflowClient

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report, f1_score, roc_auc_score

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

from lightgbm import LGBMClassifier

In [2]:
dataset = pd.read_csv('data/dataset_5818_artscore.csv')

  and should_run_async(code)


In [4]:
dataset

Unnamed: 0,name,album,artist,release_date,popularity,duration_ms,key,mode,time_signature,acousticness,...,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo,year,target,artist_score
0,Corner Man - Explicit Album Version,Last 2 Walk,Three 6 Mafia,2008-06-21,20,187480,2,1,4,0.080300,...,0.783,0.000000,0.0786,-5.480,0.0720,0.274,157.057,2008,0.0,4.0
1,Old Judge Jones,Say No More,Les Dudek,1977-01-01,29,278800,8,1,4,0.094400,...,0.647,0.004810,0.2470,-9.260,0.0343,0.652,135.866,1977,0.0,0.0
2,Endless Fantasy,Endless Fantasy,Anamanaguchi,2013-05-14,43,357785,8,1,3,0.000315,...,0.731,0.025200,0.2210,-4.027,0.1030,0.293,126.692,2013,0.0,0.0
3,Hope Has a Place,The Memory of Trees,Enya,1995-11-20,35,288640,11,1,4,0.954000,...,0.127,0.092700,0.1090,-18.247,0.0383,0.071,70.883,1995,0.0,2.0
4,Put the Needle on It - Radio Version,Neon Nights (Deluxe Version),Dannii Minogue,2003-01-01,37,204000,6,0,4,0.014100,...,0.618,0.035800,0.0970,-8.194,0.0651,0.776,120.021,2003,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208821,What You Get Is What You See - Live,Tina Live in Europe,Tina Turner,1988-03-16,32,334040,5,1,4,0.006120,...,0.906,0.001860,0.9880,-10.985,0.0610,0.676,168.447,1988,1.0,7.0
208822,What You Waiting For?,Love Angel Music Baby (Deluxe Version),Gwen Stefani,2004-11-23,54,221227,5,1,4,0.050900,...,0.948,0.000008,0.3840,-2.557,0.0628,0.731,136.027,2004,1.0,0.0
208823,Say What You Want,The Greatest Hits,Texas,2000-10-23,44,230547,4,1,4,0.176000,...,0.824,0.000000,0.1150,-6.015,0.0753,0.777,95.893,2000,1.0,1.0
208824,What's a Guy Gotta Do,What's a Guy Gotta Do,John McNicholl,2005-09-02,17,147987,1,1,4,0.592000,...,0.778,0.000000,0.0897,-7.352,0.0318,0.968,169.910,2005,1.0,0.0


In [5]:
data_2018 = dataset[dataset.year==2018]

  and should_run_async(code)


In [6]:
data_new = dataset.drop(['name', 'album', 'artist', 'popularity', 'release_date'], axis=1)

  and should_run_async(code)


In [7]:
to_encode=['year', 'key', 'mode']
to_bin=['duration_ms']
num_cols = ['time_signature', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence', 'tempo', 'artist_score']

In [8]:
transformer = ColumnTransformer([('scaler', StandardScaler(), num_cols),
                                 ("discretizer", KBinsDiscretizer(), to_bin),
                                 ("encoder", OneHotEncoder(drop='first'), to_encode)],
                                n_jobs=-1, verbose=1)

In [9]:
X = data_new.drop('target', axis=1)
y = data_new.target

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)

In [11]:
X_train = transformer.fit_transform(X_train)

  and should_run_async(code)


In [12]:
X_test = transformer.transform(X_test)

  and should_run_async(code)


In [47]:
other_cols = [col for col in num_cols if col not in to_bin+to_encode]
bin_edges = transformer.transformers_[1][1].bin_edges_[0]
onehot_columns = transformer.transformers_[2][1].get_feature_names(to_encode)

In [48]:
features_transformed = other_cols+\
[f'Duration:[{bin_edges[i]}:{bin_edges[i+1]}]' for i in range(len(bin_edges)-1)]+\
list(onehot_columns)

In [104]:
def create_experiment(experiment_name) -> int:
    """
    Creates an mlflow experiment
    :param experiment_name: str. The name of the experiment to be set in MLFlow
    :return: the id of the experiment created if it doesn't exist, or the id of the existing experiment if it is already
    :return: the id of the experiment created if it doesn't exist, or the id of the existing experiment if it is already
    created
    """
    client = MlflowClient()
    experiments = client.list_experiments()
    experiment_names = set(map(lambda e: e.name, experiments))
    if experiment_name in experiment_names:
        print(f'Experiment {experiment_name} already created.')
        return list(filter(lambda e: e.name == experiment_name, experiments))[0].experiment_id
    else:
        return mlflow.create_experiment(name=experiment_name)

def eval_metrics(actual, pred):
    f1 = f1_score(actual, pred, pos_label=1)
    roc_auc = roc_auc_score(actual, pred)
    return f1, roc_auc

In [111]:
for sampling_strat in np.arange(0.2, 1, 0.1):

    experiment_id = create_experiment(experiment_name='SMOTE LGBM')

    with mlflow.start_run(experiment_id=experiment_id, nested=False):
        mlflow.set_tracking_uri("http://127.0.0.1:5000")

        mlflow.lightgbm.autolog()

        over_sampler = SMOTE(sampling_strategy=sampling_strat)
        under_sampler = EditedNearestNeighbours()
        smote = SMOTEENN(smote=over_sampler, enn=under_sampler,random_state=42)

        X_train_sampled, y_train_sampled = smote.fit_sample(X_train, y_train)

        #Train a machine learning model of your choice with scikit learn
        lgbm = LGBMClassifier(max_depth=3, num_leaves=5, n_estimators=300, random_state=42)
        lgbm.fit(X_train_sampled, y_train_sampled)

        #Make prediction
        y_pred_lgbm = lgbm.predict(X_test)
        y_pred_train_lgbm = lgbm.predict(X_train)

        #Look at some metrics like the rmse
        f1_test, roc_auc_test = eval_metrics(y_test, y_pred_lgbm)
        f1_train, roc_auc_train = eval_metrics(y_train, y_pred_train_lgbm)

        #Now your can log the params and metrics of your model
        mlflow.log_param('max_depth', lgbm.max_depth)
        mlflow.log_param('num_leaves', lgbm.num_leaves)
        mlflow.log_param('n_estimators', lgbm.n_estimators)
        mlflow.log_metric('F1_test', f1_test)
        mlflow.log_metric('F1_train', f1_train)
        mlflow.log_metric('ROC_AUC_test', roc_auc_test)
        mlflow.log_metric('ROC_AUC_train', roc_auc_train)
        mlflow.log_metric('Resampling_strat', sampling_strat)

        #Log your scikit learn model
        #mlflow.lightgbm.log_model(lgbm, 'lgbm')

  and should_run_async(code)


Experiment SMOTE LGBM already created.
Experiment SMOTE LGBM already created.
Experiment SMOTE LGBM already created.
Experiment SMOTE LGBM already created.
Experiment SMOTE LGBM already created.
Experiment SMOTE LGBM already created.
Experiment SMOTE LGBM already created.
Experiment SMOTE LGBM already created.


In [51]:
lgbm = LGBMClassifier(max_depth=3, num_leaves=5, n_estimators=300, random_state=42)

In [52]:
lgbm.fit(X_train_sampled, y_train_sampled)

LGBMClassifier(max_depth=3, n_estimators=300, num_leaves=5, random_state=42)

In [53]:
y_pred_lgbm = lgbm.predict(X_test)
y_pred_train_lgbm = lgbm.predict(X_train)

In [54]:
print(classification_report(y_test, y_pred_lgbm))

              precision    recall  f1-score   support

         0.0       0.93      0.91      0.92       634
         1.0       0.49      0.54      0.52       101

    accuracy                           0.86       735
   macro avg       0.71      0.73      0.72       735
weighted avg       0.87      0.86      0.86       735



In [55]:
print(classification_report(y_train, y_pred_train_lgbm))

              precision    recall  f1-score   support

         0.0       0.95      0.92      0.94      2538
         1.0       0.60      0.70      0.64       402

    accuracy                           0.89      2940
   macro avg       0.77      0.81      0.79      2940
weighted avg       0.90      0.89      0.90      2940



In [56]:
from sklearn.model_selection import GridSearchCV

In [83]:
param_grid_lgbm = {'num_leaves':[15, 20, 25],
                   'max_depth':[3, 5, 7],
                   'n_estimators':[100, 200, 300]}

In [84]:
grid_lgbm = GridSearchCV(lgbm, param_grid=param_grid_lgbm, n_jobs=-1, scoring='f1_macro')

In [85]:
grid_lgbm.fit(X_train_sampled, y_train_sampled)

GridSearchCV(estimator=LGBMClassifier(max_depth=3, n_estimators=300,
                                      num_leaves=5, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [3, 5, 7],
                         'n_estimators': [100, 200, 300],
                         'num_leaves': [15, 20, 25]},
             scoring='f1_macro')

In [86]:
grid_lgbm.best_params_

{'max_depth': 5, 'n_estimators': 200, 'num_leaves': 15}

In [87]:
best_lgbm = grid_lgbm.best_estimator_

In [88]:
y_pred_lgbm = best_lgbm.predict(X_test)
y_pred_train_lgbm = best_lgbm.predict(X_train)

In [89]:
print(classification_report(y_test, y_pred_lgbm))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.91       634
         1.0       0.45      0.49      0.47       101

    accuracy                           0.85       735
   macro avg       0.69      0.70      0.69       735
weighted avg       0.85      0.85      0.85       735



In [90]:
print(classification_report(y_train, y_pred_train_lgbm))

              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96      2538
         1.0       0.69      0.93      0.79       402

    accuracy                           0.93      2940
   macro avg       0.84      0.93      0.88      2940
weighted avg       0.95      0.93      0.94      2940



In [64]:
from sklearn.linear_model import LogisticRegression

In [405]:
logr = LogisticRegression(random_state=42)

In [406]:
logr.fit(X_train_sampled, y_train_sampled)

LogisticRegression(random_state=42)

In [407]:
y_pred = logr.predict(X_test)

In [408]:
y_pred_train = logr.predict(X_train)

In [409]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.88      0.90       634
         1.0       0.43      0.54      0.48       101

    accuracy                           0.84       735
   macro avg       0.68      0.71      0.69       735
weighted avg       0.86      0.84      0.84       735



In [410]:
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

         0.0       0.92      0.88      0.90      2538
         1.0       0.41      0.53      0.46       402

    accuracy                           0.83      2940
   macro avg       0.67      0.70      0.68      2940
weighted avg       0.85      0.83      0.84      2940



In [433]:
param_grid_logr = {'solver':['newton-cg', 'lbfgs', 'saga'],
                  'penalty':['l1', 'l2', 'elasticnet']}

In [434]:
grid_logr = GridSearchCV(logr, param_grid=param_grid_logr, n_jobs=-1, scoring='f1_macro')

In [435]:
grid_logr.fit(X_train_sampled, y_train_sampled)

GridSearchCV(estimator=LogisticRegression(penalty='l1', random_state=42,
                                          solver='saga'),
             n_jobs=-1,
             param_grid={'penalty': ['l1', 'l2', 'elasticnet'],
                         'solver': ['newton-cg', 'lbfgs', 'saga']},
             scoring='f1_macro')

In [436]:
grid_logr.best_params_

{'penalty': 'l1', 'solver': 'saga'}

In [437]:
logr = grid_logr.best_estimator_

In [416]:
param_grid_logr = {'C':[0.1, 1, 10, 100, 1000]}

In [417]:
grid_logr = GridSearchCV(logr, param_grid=param_grid_logr, n_jobs=-1, scoring='f1_macro')

In [418]:
grid_logr.fit(X_train_sampled, y_train_sampled)

GridSearchCV(estimator=LogisticRegression(penalty='l1', random_state=42,
                                          solver='saga'),
             n_jobs=-1, param_grid={'C': [0.1, 1, 10, 100, 1000]},
             scoring='f1_macro')

In [419]:
grid_logr.best_params_

{'C': 1}

In [438]:
param_grid_logr = {'C':np.random.normal(1, 1, 1000)}

In [439]:
grid_logr = GridSearchCV(logr, param_grid=param_grid_logr, n_jobs=-1, scoring='f1_macro')

In [440]:
grid_logr.fit(X_train_sampled, y_train_sampled)

GridSearchCV(estimator=LogisticRegression(penalty='l1', random_state=42,
                                          solver='saga'),
             n_jobs=-1,
             param_grid={'C': array([ 0.50200548,  0.53360547,  2.19773783,  1.66861348, -0.19603419,
        0.49594109,  1.18528643,  0.48553828,  0.12493453,  1.34989718,
        0.5684225 , -0.01777652, -0.45510348,  2.22695297,  0.12067776,
       -0.39462804,  0.37618992,  0.68291755,  1.15948098,  1.05958253,
        2.229464...
        1.72954935,  0.50063788,  1.99312224,  2.31901386,  0.40722663,
        1.50092894, -0.27418224,  0.2434233 ,  1.93025053, -1.24982226,
        3.14545886,  0.41624565,  1.53417903,  0.83937352,  1.0319325 ,
        1.44699805,  0.10653761,  1.57656584,  2.16514603,  1.24394967,
        0.06180656,  2.82628549,  1.6065696 ,  0.22554011, -0.36251275,
        0.59940481,  0.74015441, -0.1319072 ,  1.07241783,  1.6233213 ])},
             scoring='f1_macro')

In [441]:
grid_logr.best_params_

{'C': 0.502005477703523}

In [442]:
best_logr = grid_logr.best_estimator_

In [443]:
y_pred = best_logr.predict(X_test)

In [444]:
y_pred_train = best_logr.predict(X_train)

In [445]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.88      0.90       634
         1.0       0.41      0.50      0.45       101

    accuracy                           0.83       735
   macro avg       0.66      0.69      0.68       735
weighted avg       0.85      0.83      0.84       735



In [446]:
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

         0.0       0.92      0.88      0.90      2538
         1.0       0.41      0.52      0.46       402

    accuracy                           0.83      2940
   macro avg       0.67      0.70      0.68      2940
weighted avg       0.85      0.83      0.84      2940



In [455]:
best_logr.coef_

array([[ 0.24371888, -0.1822859 ,  0.56499807, -0.06267258, -0.32028399,
        -0.07564234,  0.33238445,  0.15013573, -0.53292571,  0.18086581,
         1.89883183, -0.09823083,  0.        ,  0.        ,  0.02084929,
        -0.33916473,  0.04574187, -0.25507311,  0.        , -0.33791153,
        -0.11667419,  0.55564621, -0.71514001, -0.32757772, -0.2425911 ,
         0.54288638,  0.12345068, -0.03826272]])

In [457]:
features_transformed

['time_signature',
 'acousticness',
 'danceability',
 'energy',
 'instrumentalness',
 'liveness',
 'loudness',
 'speechiness',
 'valence',
 'tempo',
 'artist_score',
 'Duration:[33881.0:163468.4]',
 'Duration:[163468.4:191996.8]',
 'Duration:[191996.8:213317.4]',
 'Duration:[213317.4:241518.2]',
 'Duration:[241518.2:4500062.0]',
 'key_1',
 'key_2',
 'key_3',
 'key_4',
 'key_5',
 'key_6',
 'key_7',
 'key_8',
 'key_9',
 'key_10',
 'key_11',
 'mode_1']