In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Make the project root importable: the directory one level above the current
# working directory is added to sys.path (once), so that the `scripts` package
# of brainstation_capstone can be imported by the lines that follow.
# This assumes the notebook is run from a folder directly under the project root.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from scripts import utils
from scripts.vectorizer_pipeline import VectorizerPipeline

In [3]:
# Location of the input data, obtained from the project helper in scripts/utils.
# NOTE(review): this value is combined with `/` when the CSV is loaded, so it is
# presumably a pathlib.Path - confirm against utils.get_datapath.
DATA_PATH = utils.get_datapath('data')

# **Transforming Lyrics** 

In [4]:
# Load the cleaned lyrics table that every vectorizer in this notebook reads.
# NOTE(review): the preview of this frame shows `Unnamed: 0`-style columns, which
# look like index columns written into the CSV by an earlier save - confirm
# whether they should be dropped upstream or skipped with `index_col`.
clean_lyrics_csv = DATA_PATH / 'clean_lyrics.csv'
df = pd.read_csv(clean_lyrics_csv)

In [5]:
# Sanity check of the loaded frame: render the first rows, then let the cell's
# last expression report (n_rows, n_columns).
display(df.head())
df.shape

Unnamed: 0.1,Unnamed: 0,song,lyrics,views,cleaned_lyrics,log_scaled_views,popular,popularity_rating
0,0,Kendrick-lamar-swimming-pools-drank-lyrics,\n\n[Produced by T-Minus]\n\n[Intro]\nPour up ...,5589280.0,pour up drank head shot drank sit down drank ...,15.536361,1,2
1,1,Kendrick-lamar-money-trees-lyrics,\n\n[Produced by DJ Dahi]\n\n[Verse 1: Kendric...,4592003.0,uh me and my niggas tryna get it ya bish ya b...,15.339827,1,2
2,2,Kendrick-lamar-xxx-lyrics,"\n\n[Intro: Bēkon & Kid Capri]\nAmerica, God b...",4651514.0,america god bless you if its good to you amer...,15.352703,1,2
3,3,A-ap-rocky-fuckin-problems-lyrics,"\n\n[Chorus: 2 Chainz, Drake & Both (A$AP Rock...",7378309.0,i love bad bitches thats my fuckin problem an...,15.814055,1,2
4,4,Kendrick-lamar-dna-lyrics,"\n\n[Verse 1]\nI got, I got, I got, I got—\nLo...",5113687.0,i got i got i got i got loyalty got royalty i...,15.447431,1,2


(37905, 8)

In [8]:
from sklearn.feature_extraction import text

# Stop words shared by every vectorizer below: scikit-learn's built-in English
# list plus 'im'. The cleaned lyrics appear to have apostrophes stripped
# ("thats", "its"), so "I'm" presumably shows up as the token 'im'.
extra_stop_words = ['im']
stop_words = list(text.ENGLISH_STOP_WORDS.union(extra_stop_words))

# Transforming Lyrics for Regression

In [6]:

# Regression target: the `log_scaled_views` column of the lyrics table.
y_regression = df['log_scaled_views']
y_regression.shape

(37905,)

In [8]:
# The input text is the same for every vectorizer; only the encoding changes.
X = df.cleaned_lyrics

# Settings common to all four vectorizers: ignore terms that occur in more than
# 90% of the documents and remove the shared stop words.
shared_kwargs = dict(max_df=0.9, stop_words=stop_words)

# (name, vectorizer) pairs, processed in this order. The unigram and 2-gram
# vocabularies keep terms present in at least 1% of documents; the 3-gram
# vocabulary uses a lower floor of 0.1%.
regression_vectorizers = [
    ('bag_of_words_regression', CountVectorizer(min_df=0.01, **shared_kwargs)),
    ('tf_idf_regression', TfidfVectorizer(min_df=0.01, **shared_kwargs)),
    ('2_grams_regression', CountVectorizer(min_df=0.01, ngram_range=(2, 2), **shared_kwargs)),
    ('3_grams_regression', CountVectorizer(min_df=0.001, ngram_range=(3, 3), **shared_kwargs)),
]

for vectorizer_name, vectorizer in regression_vectorizers:
    # Judging by the messages it prints, the pipeline reports the train/test
    # shapes and pickles the vectorizer and the transformed split under a
    # folder named after `vectorizer_name` - see scripts/vectorizer_pipeline.
    pipeline = VectorizerPipeline(vectorizer_name, vectorizer, X, y_regression)
    pipeline.run_vectorizer_pipeline()

Train shape: (30324, 2223)             
Test shape: (7581, 2223)
Vectorizer dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/bag_of_words_regression/bag_of_words_regression.pkl
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/bag_of_words_regression/data.pkl as a dictionary.
Train shape: (30324, 2230)             
Test shape: (7581, 2230)
Vectorizer dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/tf_idf_regression/tf_idf_regression.pkl
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/tf_idf_regression/data.pkl as a dictionary.
Train shape: (30324, 408)             
Test shape: (7581, 408)
Vectorizer dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/2_grams_regression/2_grams_regression.pkl
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/2_grams_regression/data.pkl as a dictionary.
Train shape:

# Transforming Lyrics for Classification

In [6]:
# Classification targets taken from the lyrics table: `popular` feeds the
# *_two_class vectorizer sets and `popularity_rating` the *_three_class ones.
y_popular = df['popular']
y_popularity = df['popularity_rating']

(y_popular.shape, y_popularity.shape)

((37905,), (37905,))

In [10]:
# The input text is the same for every vectorizer; only the encoding changes.
X = df.cleaned_lyrics

# Settings common to all four vectorizers: ignore terms that occur in more than
# 90% of the documents and remove the shared stop words.
shared_kwargs = dict(max_df=0.9, stop_words=stop_words)

# (name, vectorizer) pairs, processed in this order. The unigram and 2-gram
# vocabularies keep terms present in at least 1% of documents; the 3-gram
# vocabulary uses a lower floor of 0.1%.
two_class_vectorizers = [
    ('bag_of_words_two_class', CountVectorizer(min_df=0.01, **shared_kwargs)),
    ('tf_idf_two_class', TfidfVectorizer(min_df=0.01, **shared_kwargs)),
    ('2_grams_two_class', CountVectorizer(min_df=0.01, ngram_range=(2, 2), **shared_kwargs)),
    ('3_grams_two_class', CountVectorizer(min_df=0.001, ngram_range=(3, 3), **shared_kwargs)),
]

for vectorizer_name, vectorizer in two_class_vectorizers:
    # Judging by the messages it prints, the pipeline reports the train/test
    # shapes and pickles the vectorizer and the transformed split under a
    # folder named after `vectorizer_name` - see scripts/vectorizer_pipeline.
    pipeline = VectorizerPipeline(vectorizer_name, vectorizer, X, y_popular)
    pipeline.run_vectorizer_pipeline()

Train shape: (30324, 2211)             
Test shape: (7581, 2211)
Vectorizer dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/bag_of_words_two_class/bag_of_words_two_class.pkl
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/bag_of_words_two_class/data.pkl as a dictionary.
Train shape: (30324, 2212)             
Test shape: (7581, 2212)
Vectorizer dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/tf_idf_two_class/tf_idf_two_class.pkl
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/tf_idf_two_class/data.pkl as a dictionary.
Train shape: (30324, 2211)             
Test shape: (7581, 2211)
Vectorizer dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/2_grams_two_class/2_grams_two_class.pkl
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/2_grams_two_class/data.pkl as a dictionary.
Train shape: (30324

In [11]:
# The input text is the same for every vectorizer; only the encoding changes.
X = df.cleaned_lyrics

# Settings common to all four vectorizers: ignore terms that occur in more than
# 90% of the documents and remove the shared stop words.
shared_kwargs = dict(max_df=0.9, stop_words=stop_words)

# (name, vectorizer) pairs, processed in this order. The unigram and 2-gram
# vocabularies keep terms present in at least 1% of documents; the 3-gram
# vocabulary uses a lower floor of 0.1%.
three_class_vectorizers = [
    ('bag_of_words_three_class', CountVectorizer(min_df=0.01, **shared_kwargs)),
    ('tf_idf_three_class', TfidfVectorizer(min_df=0.01, **shared_kwargs)),
    ('2_grams_three_class', CountVectorizer(min_df=0.01, ngram_range=(2, 2), **shared_kwargs)),
    ('3_grams_three_class', CountVectorizer(min_df=0.001, ngram_range=(3, 3), **shared_kwargs)),
]

for vectorizer_name, vectorizer in three_class_vectorizers:
    # Judging by the messages it prints, the pipeline reports the train/test
    # shapes and pickles the vectorizer and the transformed split under a
    # folder named after `vectorizer_name` - see scripts/vectorizer_pipeline.
    pipeline = VectorizerPipeline(vectorizer_name, vectorizer, X, y_popularity)
    pipeline.run_vectorizer_pipeline()

Train shape: (30324, 2216)             
Test shape: (7581, 2216)
Vectorizer dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/bag_of_words_three_class/bag_of_words_three_class.pkl
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/bag_of_words_three_class/data.pkl as a dictionary.
Train shape: (30324, 2219)             
Test shape: (7581, 2219)
Vectorizer dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/tf_idf_three_class/tf_idf_three_class.pkl
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/tf_idf_three_class/data.pkl as a dictionary.
Train shape: (30324, 2225)             
Test shape: (7581, 2225)
Vectorizer dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/2_grams_three_class/2_grams_three_class.pkl
Transformed train test split dumped at /home/jng/projects/brainstation_capstone/vectorizer_data/2_grams_three_class/data.pkl as a dictionary.
T