In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

import joblib
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utilities import utils
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import GridSearchCV, train_test_split

DATA_PATH = utils.get_datapath('data')
MODEL_PATH = utils.get_datapath('model')

%load_ext autoreload
%autoreload 2

In [4]:
df = pd.read_csv(DATA_PATH / 'clean_lyrics_spotify_genres_ada.csv')

In [5]:
display(df.head())
display(df.shape)

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,song,lyrics,title,primary_artist,views,cleaned_lyrics,language,log_scaled_views,popular,popularity,popularity_std,spotify_popularity,spotify_popularity_three_class,n_tokens,ada_embeddings,genre,spotify_popularity_two_class
0,0,0,0,Kendrick-lamar-swimming-pools-drank-lyrics,\n\n[Produced by T-Minus]\n\n[Intro]\nPour up ...,Swimming Pools (Drank),Kendrick-lamar,5589280.0,pour up drank head shot drank sit down drank ...,en,15.536361,1,2,2,78.0,2,881,"[0.011653340421617031, -0.0033766645938158035,...",hip hop,1
1,1,1,1,Kendrick-lamar-money-trees-lyrics,\n\n[Produced by DJ Dahi]\n\n[Verse 1: Kendric...,Money Trees,Kendrick-lamar,4592003.0,uh me and my niggas tryna get it ya bish ya b...,en,15.339827,1,2,2,81.0,2,1215,"[0.0013736916007474065, -0.00975166354328394, ...",hip hop,1
2,2,2,2,Kendrick-lamar-xxx-lyrics,"\n\n[Intro: Bēkon & Kid Capri]\nAmerica, God b...",XXX.,Kendrick-lamar,4651514.0,america god bless you if its good to you amer...,en,15.352703,1,2,2,69.0,2,686,"[-0.014546433463692665, -0.006841110065579414,...",hip hop,1
3,3,3,3,A-ap-rocky-fuckin-problems-lyrics,"\n\n[Chorus: 2 Chainz, Drake & Both (A$AP Rock...",Fuckin’ Problems,A-ap-rocky,7378309.0,i love bad bitches thats my fuckin problem an...,en,15.814055,1,2,2,75.0,2,894,"[-0.018262486904859543, 0.006630411371588707, ...",hip hop,1
4,4,4,4,Kendrick-lamar-dna-lyrics,"\n\n[Verse 1]\nI got, I got, I got, I got—\nLo...",DNA.,Kendrick-lamar,5113687.0,i got i got i got i got loyalty got royalty i...,en,15.447431,1,2,2,80.0,2,759,"[-0.023312833160161972, -0.012944793328642845,...",hip hop,1


(35901, 20)

In [82]:
X = df['cleaned_lyrics']
y = df['spotify_popularity_three_class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=33)
ngrams_vectorizer = CountVectorizer(max_df=0.9, 
                         min_df=0.01, 
                         stop_words=list(ENGLISH_STOP_WORDS), 
                         ngram_range=(1,3)
                         )

log_reg = LogisticRegression(penalty='l2', max_iter=500)
pipe = Pipeline(steps=
                [
                    ('ngrams', ngrams_vectorizer),
                    ('log_reg', log_reg)
                ]
            )
param_grid = {
    'log_reg__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'log_reg__solver':['saga']
}
search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=5, verbose=3)
search.fit(X_train, y_train)
search.score(X_test, y_test)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.419 total time=  19.1s
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.409 total time=  19.1s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.405 total time=  16.4s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.415 total time=  18.5s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.420 total time=  18.7s




[CV 1/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.412 total time=  31.1s




[CV 2/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.396 total time=  30.2s




[CV 3/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.413 total time=  30.9s
[CV 4/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.393 total time=  26.6s




[CV 5/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.412 total time=  30.2s




[CV 1/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.405 total time=  30.2s




[CV 2/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.389 total time=  30.4s




[CV 3/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.404 total time=  31.0s




[CV 4/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.386 total time=  31.1s




[CV 5/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.408 total time=  31.3s




[CV 1/5] END log_reg__C=0.1, log_reg__solver=saga;, score=0.405 total time=  30.6s




[CV 2/5] END log_reg__C=0.1, log_reg__solver=saga;, score=0.388 total time=  30.8s




[CV 3/5] END log_reg__C=0.1, log_reg__solver=saga;, score=0.403 total time=  30.4s




[CV 4/5] END log_reg__C=0.1, log_reg__solver=saga;, score=0.383 total time=  30.6s




[CV 5/5] END log_reg__C=0.1, log_reg__solver=saga;, score=0.407 total time=  30.5s




[CV 1/5] END log_reg__C=1, log_reg__solver=saga;, score=0.406 total time=  30.6s




[CV 2/5] END log_reg__C=1, log_reg__solver=saga;, score=0.389 total time=  30.6s




[CV 3/5] END log_reg__C=1, log_reg__solver=saga;, score=0.402 total time=  31.0s




[CV 4/5] END log_reg__C=1, log_reg__solver=saga;, score=0.383 total time=  30.8s




[CV 5/5] END log_reg__C=1, log_reg__solver=saga;, score=0.407 total time=  31.3s




[CV 1/5] END log_reg__C=10, log_reg__solver=saga;, score=0.406 total time=  31.1s




[CV 2/5] END log_reg__C=10, log_reg__solver=saga;, score=0.389 total time=  30.1s




[CV 3/5] END log_reg__C=10, log_reg__solver=saga;, score=0.403 total time=  30.1s




[CV 4/5] END log_reg__C=10, log_reg__solver=saga;, score=0.383 total time=  29.5s




[CV 5/5] END log_reg__C=10, log_reg__solver=saga;, score=0.407 total time=  29.7s




[CV 1/5] END log_reg__C=100, log_reg__solver=saga;, score=0.406 total time=  29.7s




[CV 2/5] END log_reg__C=100, log_reg__solver=saga;, score=0.388 total time=  29.9s




[CV 3/5] END log_reg__C=100, log_reg__solver=saga;, score=0.402 total time=  29.6s




[CV 4/5] END log_reg__C=100, log_reg__solver=saga;, score=0.383 total time=  29.5s




[CV 5/5] END log_reg__C=100, log_reg__solver=saga;, score=0.407 total time=  30.3s


0.42041498398551735

In [84]:
with open(MODEL_PATH / 'log_reg_spotify_popularity_pipeline.pkl', 'wb') as file:
    joblib.dump(search, file)


In [86]:
log_reg_coef = search.best_estimator_.named_steps['log_reg'].coef_

# Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
X = df['cleaned_lyrics']
y = df['spotify_popularity_three_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=33)

ngrams_vectorizer = CountVectorizer(max_df=0.9, 
                         min_df=0.01, 
                         stop_words=list(ENGLISH_STOP_WORDS), 
                         ngram_range=(1,3)
                         )

random_forest = RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs=-1)

pipe = Pipeline(steps=
                [
                    ('ngrams', ngrams_vectorizer),
                    ('random_forest', random_forest)
                ]
            )

param_grid = {
    'random_forest__max_depth':np.arange(5,11),
}

search = GridSearchCV(pipe, param_grid, cv=5, verbose=2)
search.fit(X_train, y_train)
search.score(X_test, y_test)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END .........................random_forest__max_depth=5; total time=  18.1s
[CV] END .........................random_forest__max_depth=5; total time=  15.4s
[CV] END .........................random_forest__max_depth=5; total time=  15.5s
[CV] END .........................random_forest__max_depth=5; total time=  15.4s
[CV] END .........................random_forest__max_depth=5; total time=  15.4s
[CV] END .........................random_forest__max_depth=6; total time=  15.8s
[CV] END .........................random_forest__max_depth=6; total time=  15.8s
[CV] END .........................random_forest__max_depth=6; total time=  15.5s
[CV] END .........................random_forest__max_depth=6; total time=  15.5s
[CV] END .........................random_forest__max_depth=6; total time=  16.3s
[CV] END .........................random_forest__max_depth=7; total time=  15.9s
[CV] END .........................random_forest__

0.40350877192982454

In [15]:
with open(MODEL_PATH / 'random_forest_spotify_popularity_pipeline.pkl', 'wb') as file:
    joblib.dump(search, file)

# Classification With OpenAI Ada Embeddings

In [3]:
# Get the array of the ada embeddings. 
ada_embedding_array = utils.get_ada_embeddings(df, 'ada_embeddings')

NameError: name 'df' is not defined

In [49]:
ada_embedding_array.shape

(35901, 1536)

In [38]:
X = ada_embedding_array
y = df['spotify_popularity_three_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=33)

In [42]:

log_reg = LogisticRegression(penalty='l2', max_iter=500)
pipe = Pipeline(steps=
                [
                    ('log_reg', log_reg)
                ]
            )
param_grid = {
    'log_reg__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'log_reg__solver':['saga','newton-cg']
}
search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=5, verbose=3)
search.fit(X_train, y_train)
search.score(X_test, y_test)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.362 total time=   8.7s
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.366 total time=   9.2s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.360 total time=   9.6s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.365 total time=  10.2s
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.366 total time=   0.9s
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.362 total time=   0.9s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.360 total time=   0.9s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.365 total time=   0.9s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.364 total time=   0.9s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.364 total time=  10.2s
[CV 1/5] END log_reg__C=0.001, log_reg__solver=saga;, 

0.43629021027712017

In [43]:
search.best_estimator_

In [50]:
with open(MODEL_PATH / 'log_reg_ada_spotify_three_class_pipeline.pkl', 'wb') as file:
    joblib.dump(search, file)

## Ada Embeddings and TruncatedSVD

In [56]:
from sklearn.decomposition import TruncatedSVD

In [57]:

log_reg = LogisticRegression(penalty='l2', max_iter=500)
svd = TruncatedSVD(n_components=10)

pipe = Pipeline(steps=
                [
                    ('svd', svd),
                    ('log_reg', log_reg)
                ]
            )
param_grid = {
    'svd__n_components':[10, 20],
    'log_reg__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'log_reg__solver':['saga', 'newton-cg']
}
search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=5, verbose=3)
search.fit(X_train, y_train)
search.score(X_test, y_test)

Fitting 5 folds for each of 28 candidates, totalling 140 fits
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=saga, svd__n_components=10;, score=0.360 total time=   2.0s
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=saga, svd__n_components=10;, score=0.365 total time=   2.0s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=saga, svd__n_components=10;, score=0.364 total time=   0.7s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=saga, svd__n_components=10;, score=0.358 total time=   0.8s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=saga, svd__n_components=10;, score=0.364 total time=   0.7s
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=saga, svd__n_components=20;, score=0.365 total time=   0.9s
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=saga, svd__n_components=20;, score=0.361 total time=   0.8s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=saga, svd__n_components=20;, score=0.359 total time=   0.7s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=saga, svd__n_components=20

0.42849185350229774

# Observations on hip hop songs.

In [74]:
hip_hop_df = df[
    df['genre'] == 'hip hop'
]

hip_hop_df.shape

(14476, 20)

In [75]:
hip_hop_ada_embeddings = utils.get_ada_embeddings(hip_hop_df, 'ada_embeddings')
hip_hop_ada_embeddings.shape

(14476, 1536)

In [76]:
X = hip_hop_ada_embeddings
y = hip_hop_df['spotify_popularity_three_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=33)

In [77]:
y.value_counts(normalize=True)

0    0.376002
1    0.325504
2    0.298494
Name: spotify_popularity_three_class, dtype: float64

In [78]:

log_reg = LogisticRegression(penalty='l2', max_iter=500)
pipe = Pipeline(steps=
                [
                    ('log_reg', log_reg)
                ]
            )
param_grid = {
    'log_reg__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'log_reg__solver':['saga','newton-cg']
}
search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=5, verbose=3)
search.fit(X_train, y_train)
search.score(X_test, y_test)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.376 total time=   3.8s
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.376 total time=   3.7s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.376 total time=   3.6s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.376 total time=   4.1s
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.376 total time=   0.4s
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.376 total time=   0.3s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.376 total time=   0.3s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.376 total time=   0.4s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.376 total time=   0.3s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.376 total time=   4.0s
[CV 1/5] END log_reg__C=0.001, log_reg__solver=saga;, 

0.41263812154696133

# Observations on Pop Songs

In [67]:
pop_df = df[
    df['genre'] == 'pop'
]

pop_df.shape

(6385, 20)

In [68]:
pop_ada_embeddings = utils.get_ada_embeddings(pop_df, 'ada_embeddings')
pop_ada_embeddings.shape

(6385, 1536)

In [69]:
X = pop_ada_embeddings
y = pop_df['spotify_popularity_three_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=33)

In [70]:
y.value_counts(normalize=True)

2    0.421926
1    0.311825
0    0.266249
Name: spotify_popularity_three_class, dtype: float64

In [71]:

log_reg = LogisticRegression(penalty='l2', max_iter=500)
pipe = Pipeline(steps=
                [
                    ('log_reg', log_reg)
                ]
            )
param_grid = {
    'log_reg__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'log_reg__solver':['saga','newton-cg']
}
search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=5, verbose=3)
search.fit(X_train, y_train)
search.score(X_test, y_test)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.422 total time=   1.4s[CV 1/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.422 total time=   1.4s

[CV 4/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.422 total time=   1.5s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.422 total time=   1.5s
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.422 total time=   0.2s
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.422 total time=   0.2s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.422 total time=   0.1s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.422 total time=   0.2s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.422 total time=   0.1s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.422 total time=   1.6s
[CV 1/5] END log_reg__C=0.001, log_reg__solver=saga;, 

0.4635865309318716

In [72]:
with open(MODEL_PATH / 'log_reg_ada_spotify_three_class_pop_pipeline.pkl', 'wb') as file:
    joblib.dump(search, file)

---

# Modeling Two Class Popularity

## Logistic Regression + N grams = 1, 2 and 3

In [6]:
X = df['cleaned_lyrics']
y = df['spotify_popularity_two_class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=33)
ngrams_vectorizer = CountVectorizer(max_df=0.9, 
                         min_df=0.01, 
                         stop_words=list(ENGLISH_STOP_WORDS), 
                         ngram_range=(1,3)
                         )

log_reg = LogisticRegression(penalty='l2', max_iter=500)
pipe = Pipeline(steps=
                [
                    ('ngrams', ngrams_vectorizer),
                    ('log_reg', log_reg)
                ]
            )
param_grid = {
    'log_reg__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'log_reg__solver':['saga']
}
search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=5, verbose=3)
search.fit(X_train, y_train)
search.score(X_test, y_test)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.582 total time=  19.8s
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.580 total time=  20.4s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.583 total time=  17.8s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.582 total time=  19.2s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.585 total time=  18.0s




[CV 1/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.571 total time=  25.4s




[CV 2/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.577 total time=  25.9s




[CV 3/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.580 total time=  26.3s
[CV 4/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.569 total time=  23.4s




[CV 5/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.574 total time=  26.3s




[CV 1/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.565 total time=  26.3s




[CV 2/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.571 total time=  27.0s




[CV 3/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.576 total time=  27.1s




[CV 4/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.554 total time=  26.0s




[CV 5/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.566 total time=  26.0s




[CV 1/5] END log_reg__C=0.1, log_reg__solver=saga;, score=0.565 total time=  26.0s




[CV 2/5] END log_reg__C=0.1, log_reg__solver=saga;, score=0.568 total time=  26.0s




[CV 3/5] END log_reg__C=0.1, log_reg__solver=saga;, score=0.578 total time=  26.2s




[CV 4/5] END log_reg__C=0.1, log_reg__solver=saga;, score=0.551 total time=  26.9s




[CV 5/5] END log_reg__C=0.1, log_reg__solver=saga;, score=0.565 total time=  26.9s




[CV 1/5] END log_reg__C=1, log_reg__solver=saga;, score=0.565 total time=  26.5s




[CV 2/5] END log_reg__C=1, log_reg__solver=saga;, score=0.568 total time=  26.3s




[CV 3/5] END log_reg__C=1, log_reg__solver=saga;, score=0.578 total time=  26.6s




[CV 4/5] END log_reg__C=1, log_reg__solver=saga;, score=0.551 total time=  26.9s




[CV 5/5] END log_reg__C=1, log_reg__solver=saga;, score=0.564 total time=  26.7s




[CV 1/5] END log_reg__C=10, log_reg__solver=saga;, score=0.565 total time=  26.7s




[CV 2/5] END log_reg__C=10, log_reg__solver=saga;, score=0.568 total time=  26.7s




[CV 3/5] END log_reg__C=10, log_reg__solver=saga;, score=0.578 total time=  26.7s




[CV 4/5] END log_reg__C=10, log_reg__solver=saga;, score=0.551 total time=  27.1s




[CV 5/5] END log_reg__C=10, log_reg__solver=saga;, score=0.564 total time=  26.4s




[CV 1/5] END log_reg__C=100, log_reg__solver=saga;, score=0.565 total time=  26.2s




[CV 2/5] END log_reg__C=100, log_reg__solver=saga;, score=0.568 total time=  25.7s




[CV 3/5] END log_reg__C=100, log_reg__solver=saga;, score=0.578 total time=  25.8s




[CV 4/5] END log_reg__C=100, log_reg__solver=saga;, score=0.551 total time=  25.5s




[CV 5/5] END log_reg__C=100, log_reg__solver=saga;, score=0.565 total time=  24.9s


0.5949032168221696

In [7]:
with open(MODEL_PATH / 'log_reg_ngrams_spotify_two_class_pipeline.pkl', 'wb') as file:
    joblib.dump(search, file)

## Exploring Word Importance in Predicting Popularity

In [14]:
words = search.best_estimator_.named_steps['ngrams'].get_feature_names_out()

In [19]:
words.shape

(2746,)

In [15]:
coefficients = search.best_estimator_.named_steps['log_reg'].coef_

In [22]:
interpret_df = pd.DataFrame(
    {
        'word': words,
        # Need to reshape the coefficients to one dimensional.
        'coefficients':coefficients.reshape(-1)
    }
)

### Top Words For Predicting a Popular Song

In [28]:
interpret_df.sort_values(by='coefficients', ascending=False).head(30)

Unnamed: 0,word,coefficients
96,away,0.022411
1946,remember,0.021972
750,feel,0.018455
2642,wish,0.018445
1715,ooh,0.015571
2735,youre,0.015424
2590,water,0.015026
730,falling,0.013532
1185,ive,0.013471
758,feels,0.013439


### Worst Words for Predicting a Popular Song

In [29]:
interpret_df.sort_values(by='coefficients', ascending=False).tail(30)

Unnamed: 0,word,coefficients
2369,talking,-0.010689
2618,whats,-0.01072
2658,word,-0.010759
631,dope,-0.010823
706,everyday,-0.010891
2313,streets,-0.010906
630,doors,-0.010987
2327,style,-0.011016
200,bitches,-0.011055
1740,paper,-0.011063


# Logisitic Regression N-grams = 1, 2 and 3 and NMF

In [35]:
from sklearn.decomposition import NMF

In [37]:
X = df['cleaned_lyrics']
y = df['spotify_popularity_two_class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=33)

ngrams_vectorizer = CountVectorizer(max_df=0.9, 
                         min_df=0.01, 
                         stop_words=list(ENGLISH_STOP_WORDS), 
                         ngram_range=(1,3)
                         )

nmf = NMF(n_components=10)
log_reg = LogisticRegression(penalty='l2', max_iter=500)

pipe = Pipeline(steps=
                [
                    ('ngrams', ngrams_vectorizer),
                    ('nmf', nmf),
                    ('log_reg', log_reg)
                ]
            )
param_grid = {
    'log_reg__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'log_reg__solver':['saga']
}
search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=5, verbose=3)
search.fit(X_train, y_train)
search.score(X_test, y_test)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.516 total time=  43.7s
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.516 total time=  43.7s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.516 total time=  19.2s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.515 total time=  19.3s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.516 total time=  18.7s
[CV 1/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.523 total time=  18.7s
[CV 2/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.525 total time=  18.0s
[CV 3/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.524 total time=  18.2s
[CV 4/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.522 total time=  17.2s
[CV 5/5] END log_reg__C=0.001, log_reg__solver=saga;, score=0.520 total time=  17.2s
[CV 1/5] END log_reg__C=0.01, log_reg__solver=saga;, score=0.555 total time=  17.0s
[

0.5663556607714803

## Ada Embeddings and Two Class

In [30]:
# Get the array of the ada embeddings. 
ada_embedding_array = utils.get_ada_embeddings(df, 'ada_embeddings')

In [31]:
ada_embedding_array.shape

(35901, 1536)

In [32]:
X = ada_embedding_array
y = df['spotify_popularity_two_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=33)

In [33]:

log_reg = LogisticRegression(penalty='l2', max_iter=500)
pipe = Pipeline(steps=
                [
                    ('log_reg', log_reg)
                ]
            )
param_grid = {
    'log_reg__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'log_reg__solver':['saga','newton-cg']
}
search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=5, verbose=3)
search.fit(X_train, y_train)
search.score(X_test, y_test)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.516 total time=   4.1s
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.516 total time=   4.4s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.516 total time=   3.9s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.516 total time=   4.1s
[CV 1/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.516 total time=   1.2s
[CV 2/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.516 total time=   1.1s
[CV 3/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.516 total time=   1.1s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=saga;, score=0.516 total time=   5.1s
[CV 4/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.516 total time=   1.2s
[CV 5/5] END log_reg__C=0.0001, log_reg__solver=newton-cg;, score=0.516 total time=   1.2s
[CV 1/5] END log_reg__C=0.001, log_reg__solver=saga;, 

0.5937891658543378