# SPICED Academy // WEEK 04 // Lyrics Classifier

## 0. Set Up

In [106]:
# my scraping module
import lyric_scrape as ls

# essentials
import pandas as pd
import numpy as np

# train-test split
from sklearn.model_selection import train_test_split

# feature engineering
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# modeling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brunnogorgulhosoares/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1. Define Business Goal

Predict the composing artist for a given set of lyrics.

## 2. Get Data

### 2.1 Select Artists

In [107]:
artists = ['Elton John', 'Dua Lipa', 'Red Hot Chili Peppers']

### 2.2 Make sure all artist URLs lead to the artist's page, not to a search results page.

In [108]:
for artist in artists:
    print(ls.artist_url(artist))

https://www.lyrics.com/artist/elton+john
https://www.lyrics.com/artist/dua+lipa
https://www.lyrics.com/artist/red+hot+chili+peppers


In [109]:
ds = np.empty((0,4))

for artist in artists:
    links = ls.scrape_artist(artist)
    result = ls.scrape_lyrics(links, 50) # select number of songs to scrape
    ds = np.concatenate([ds,result])

In [110]:
df = pd.DataFrame(ds)
df.columns = ['artist_name','song_name','song_link','song_lyrics']
y = df['artist_name']
X = df['song_lyrics']

## 3. Train-Test Split

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

## 4. Explore Data

In [112]:
y_train.value_counts()

Red Hot Chili Peppers    38
Elton John               37
Dua Lipa                 30
Name: artist_name, dtype: int64

## 5. Feature Engineering

In [113]:
eng_stopwords = stopwords.words('english')

In [114]:
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()
tf2 = TfidfVectorizer(stop_words=eng_stopwords)

In [115]:
def toke_lemma(X):
    X_clean = []
    for lyric in X_train:
        tokens = tokenizer.tokenize(text=lyric)
        lyric_clean = " ".join(lemmatizer.lemmatize(token) for token in tokens)
        X_clean.append(lyric_clean)
    return X_clean

In [116]:
def to_dense(X):
    return X.todense()

In [117]:
# tf2.fit(X_train)

## 6. Model Training

In [118]:
pipe_logi = Pipeline([
    ('toke_lemma', FunctionTransformer(toke_lemma)), # fit & transform are applied
    ('tfidf_vectorizer', TfidfVectorizer(stop_words=eng_stopwords)), # fit & transform are applied
    ('modeling', LogisticRegression(random_state=101)) # only fit is applied
])

In [119]:
pipe_logi.fit(X_train,y_train)

Pipeline(steps=[('toke_lemma',
                 FunctionTransformer(func=<function toke_lemma at 0x7fd188268dd0>)),
                ('tfidf_vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('modeling', LogisticRegression(random_state=101))])

In [120]:
pipe_rf = Pipeline([
    ('toke_lemma', FunctionTransformer(toke_lemma)), # fit & transform are applied
    ('tfidf_vectorizer', TfidfVectorizer(stop_words=eng_stopwords)), # fit & transform are applied
    ('modeling', RandomForestClassifier(random_state=101)) # only fit is applied
])

In [121]:
pipe_rf.fit(X_train,y_train)

Pipeline(steps=[('toke_lemma',
                 FunctionTransformer(func=<function toke_lemma at 0x7fd188268dd0>)),
                ('tfidf_vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('modeling', RandomForestClassifier(random_state=101))])

In [122]:
pipe_nb = Pipeline([
    ('toke_lemma', FunctionTransformer(toke_lemma)), # fit & transform are applied
    ('tfidf_vectorizer', TfidfVectorizer(stop_words=eng_stopwords)), # fit & transform are applied
    ('to_dense', FunctionTransformer(to_dense)),
    ('modeling', MultinomialNB()) # only fit is applied
])

In [123]:
pipe_nb.fit(X_train,y_train)

Pipeline(steps=[('toke_lemma',
                 FunctionTransformer(func=<function toke_lemma at 0x7fd188268dd0>)),
                ('tfidf_vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('to_dense',
                 FunctionTransformer(func=<function to_dense at 0x7fd186f58cb0>)),
                ('modeling', MultinomialNB())])

In [124]:
print(
    f'Logistic Reg Precision: {pipe_logi.score(X_train,y_train)}',
    f'\nRandomForest Class Precision: {pipe_rf.score(X_train,y_train)}',
    f'\nNaive Bayes Precision: {pipe_nb.score(X_train,y_train)}'
)

Logistic Reg Precision: 1.0 
RandomForest Class Precision: 1.0 
Naive Bayes Precision: 1.0


In [125]:
# pipe_nb.predict_proba(X_train)
# pipe_nb.classes_

In [126]:
# pipe_rf.predict_proba(X_train)

## 7. Optimize Hyperparameters & Cross-Validation

## 8. Calculate Test Score

In [127]:
# pipe.score(X_test,y_test)

In [128]:
print(
    f'Logistic Reg Precision: {pipe_logi.score(X_test,y_test)}',
    f'\nRandomForest Class Precision: {pipe_rf.score(X_test,y_test)}',
    f'\nNaive Bayes Precision: {pipe_nb.score(X_test,y_test)}'
)

ValueError: Found input variables with inconsistent numbers of samples: [45, 105]

## 9. Deploy & Monitor

In [None]:
predict_lyrics = ['hello there these are some lyrics']

In [None]:
pipe.predict(predict_lyrics)

NameError: name 'pipe' is not defined