In [106]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

import joblib
import os
import sys

# Put the parent of the current working directory on the import path (once);
# presumably that is where the `utilities` package imported below lives.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from utilities import utils
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import GridSearchCV, train_test_split

# Resolve the project's data and model locations through the project helper.
# NOTE(review): both values are later combined with `/`, so they are presumably
# pathlib.Path objects — confirm in utilities.utils.
DATA_PATH, MODEL_PATH = (utils.get_datapath(kind) for kind in ('data', 'model'))

# Reload edited project modules automatically before each cell runs, so changes
# to local helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [107]:
# Load the cleaned lyrics joined with Spotify popularity information.
# NOTE(review): the preview below shows `Unnamed: 0`-style columns, which look
# like index columns written by an earlier to_csv — confirm whether they
# should be dropped.
lyrics_csv = DATA_PATH / 'clean_lyrics_and_spotify.csv'
df = pd.read_csv(lyrics_csv)

In [108]:
# Quick sanity check: first rows, then the (rows, columns) shape.
display(df.head(), df.shape)

Unnamed: 0.1,Unnamed: 0,song,lyrics,title,primary_artist,views,cleaned_lyrics,language,log_scaled_views,popular,popularity,popularity_std,spotify_popularity,spotify_popularity_three_class
0,0,Kendrick-lamar-swimming-pools-drank-lyrics,\n\n[Produced by T-Minus]\n\n[Intro]\nPour up ...,Swimming Pools (Drank),Kendrick-lamar,5589280.0,pour up drank head shot drank sit down drank ...,en,15.536361,1,2,2,78.0,2
1,1,Kendrick-lamar-money-trees-lyrics,\n\n[Produced by DJ Dahi]\n\n[Verse 1: Kendric...,Money Trees,Kendrick-lamar,4592003.0,uh me and my niggas tryna get it ya bish ya b...,en,15.339827,1,2,2,81.0,2
2,2,Kendrick-lamar-xxx-lyrics,"\n\n[Intro: Bēkon & Kid Capri]\nAmerica, God b...",XXX.,Kendrick-lamar,4651514.0,america god bless you if its good to you amer...,en,15.352703,1,2,2,69.0,2
3,3,A-ap-rocky-fuckin-problems-lyrics,"\n\n[Chorus: 2 Chainz, Drake & Both (A$AP Rock...",Fuckin’ Problems,A-ap-rocky,7378309.0,i love bad bitches thats my fuckin problem an...,en,15.814055,1,2,2,75.0,2
4,4,Kendrick-lamar-dna-lyrics,"\n\n[Verse 1]\nI got, I got, I got, I got—\nLo...",DNA.,Kendrick-lamar,5113687.0,i got i got i got i got loyalty got royalty i...,en,15.447431,1,2,2,80.0,2


(35908, 14)

In [113]:
# Model input is the cleaned lyric text; the target is the
# `spotify_popularity_three_class` label column.
X, y = df['cleaned_lyrics'], df['spotify_popularity_three_class']

In [114]:
# Hold out 20% of the songs for final evaluation, stratified on the label so
# both splits keep the same class proportions.
# A fixed seed makes the split reproducible: without it every re-run of this
# cell draws a different test set, so a model fitted (or loaded from disk)
# before the re-run could end up being scored on rows it was trained on.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

In [115]:
# Bag-of-n-grams counts over unigrams, bigrams and trigrams with English stop
# words removed. Terms found in more than 90% or in fewer than 1% of the
# documents are discarded.
ngrams_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    stop_words=list(ENGLISH_STOP_WORDS),
    min_df=0.01,
    max_df=0.9,
)

# L2-penalised logistic regression; the iteration cap is raised to 500.
# The regularisation strength C and the solver are set by the grid search.
log_reg = LogisticRegression(max_iter=500, penalty='l2')

In [116]:
# Vectoriser followed by classifier. The step names are the prefixes used to
# address parameters in the grid search (e.g. `log_reg__C`).
pipe = Pipeline([
    ('ngrams', ngrams_vectorizer),
    ('log_reg', log_reg),
])

In [72]:
# Search space: regularisation strength C over seven decades, with the
# solver fixed to newton-cg. Keys follow the `<step name>__<parameter>` form.
param_grid = dict(
    log_reg__C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    log_reg__solver=['newton-cg'],
)

In [73]:
# Exhaustive 5-fold cross-validated search over `param_grid`, using two
# parallel workers. Fitting runs the whole pipeline on the training split only.
search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=2)
search.fit(X_train, y_train)



In [75]:
# Parameter combination selected by the cross-validated grid search.
search.best_params_

{'log_reg__C': 0.001, 'log_reg__solver': 'newton-cg'}

In [76]:
# Score of the refitted best pipeline on the held-out split. No `scoring` was
# passed to GridSearchCV, so this is the estimator's default score (mean
# accuracy for a classifier).
# NOTE(review): the execution counters show this cell (In [76]) ran before the
# split cell as saved (In [114]); re-run from a fresh kernel to confirm that
# X_test is the hold-out set that matches the fitted `search`.
search.score(X_test, y_test)

0.4018595041322314

In [78]:
# Persist the whole fitted GridSearchCV object (not only the best estimator)
# so the search results can be reloaded later with joblib.
model_file = MODEL_PATH / 'log_reg_spotify_popularity_pipeline.pkl'
with open(model_file, 'wb') as fh:
    joblib.dump(search, fh)


In [86]:
# Pull the fitted classifier out of the best pipeline and keep its
# coefficient matrix for inspection.
fitted_log_reg = search.best_estimator_.named_steps['log_reg']
log_reg_coef = fitted_log_reg.coef_

In [87]:
# Shape of the coefficient matrix; the recorded output (3, 3121) is presumably
# (number of classes, number of n-gram features kept by the vectoriser).
log_reg_coef.shape

(3, 3121)