In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

## Unsupervised Learning Capstone

Word2vec soloution, much slower and not as accurate.

In [2]:
df = pd.read_csv('lyrics.csv')
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [3]:
df.shape

(362237, 6)

In [4]:
df.isnull().sum()

index         0
song          2
year          0
artist        0
genre         0
lyrics    95680
dtype: int64

In [5]:
#Cleaning text, This should catch everything that spacys neural networks don't.
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    # Get rid of \n.
    text = re.sub(r"\n", " ", text)
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    return text

df.dropna(inplace=True)
df.lyrics = df.lyrics.apply(text_cleaner)

#Droping all rows with null values, the 10,000 rows with out lyrics can't be used
#and the two rows with out song ids won't be missed
#Also droping rows where lyrics were missing, but not NAN, and Rows where genre is not available/other
df.dropna(inplace=True)
df.drop((df[df.genre == 'Not Available'].index),inplace=True, axis=0)
df.drop((df[df.genre == 'Other'].index),inplace=True, axis=0)
df.drop((df[df.lyrics.str.len() < 1].index), inplace=True, axis=0)

#for now reducing data set to 10,000 to save time
df = df.sample(10000, random_state=42)
df.reset_index(inplace=True, drop=True)
df.drop('index', inplace=True, axis=1)

df.shape

(10000, 5)

In [6]:
#Language detection was out of the scope of this project so we're just doing a quick and dirty clean up
#with pythons' langdetect package. Droping all rows where language was not english.
from langdetect import detect

def detect_try(text):
    try:
        return detect(text)
    except:
        return 'error'

#Language column
df['language'] = df.lyrics.apply(detect_try)
print(df.language.value_counts())

#droping language column and rows that aren't english
df.drop((df[df.language != 'en'].index), inplace=True, axis=0)
df.drop('language', inplace=True, axis=1)

df.shape

en       9158
es        244
de        119
ro         73
id         64
fr         59
it         46
pt         39
sw         35
error      24
nl         20
fi         16
tr         14
tl         14
sv         12
no         10
hu          8
pl          7
da          6
ca          6
af          5
so          5
lt          4
sq          4
cy          3
hr          3
lv          1
vi          1
Name: language, dtype: int64


(9158, 5)

In [7]:
pd.set_option('display.max_colwidth', 500)
df.lyrics.head(2)

1    Slaves to the power of sin Once in captivity to darkness; spiritually dead A voice cried from the top of a hill "It is finished"! It awakened those who were once dead But now we live For us Now we live a new life in freedom Not being enslaved by the power of any Jesus' blood has made us free But the battle still remains We contend not with flesh and blood But with spiritual darkness For this we wage war But not against man This war is with darkness And not against flesh and blood The Holy Bo...
2    You better be nice, you better think twice, you better be careful You better be nice, you better think twice, you better be careful Are you dreaming, I can see it in your eyes Something tells me, you're about to say goodbye And I wonder, are you tired of my touch Once you held me now it doesn't mean as much You don't love me like you used to love me baby Body, soul and mind, hey hey hey Before you go, throw it all away, take some advice You better be nice, you better think twice, you b

In [8]:
#removing chained assignment warning
pd.options.mode.chained_assignment = None

#Getting Lemmas from our Spacy Docs
def doc2sentences(nlpdoc):
    sentences = []
    for sentence in nlpdoc.sents:
        lemmas = []
        for token in sentence:
            if token.pos_ == 'PRON':
                lemmas.append(token.text)
            elif token.is_punct:
                pass
            else:
                lemmas.append(token.lemma_)
        sentences.append(lemmas)
    return sentences


#Loading the spacy english model
#Using Spacy to parse lyrics and create docs
#This spacy model has 1.1 million word vectors that will help us generate our doc vectors.
nlp = spacy.load('en')
docs = [] 
sents = []

#Spacy pipeline docs
for doc in nlp.pipe(iter(df['lyrics']),batch_size=256):
    docs.append(doc)
    sents.append(doc2sentences(doc))
    
df['Lemmatized_sentences'] = sents
df.Lemmatized_sentences.head(2)

1    [[slave, to, the, power, of, sin, once, in, captivity, to, darkness, spiritually, dead, a, voice, cry, from, the, top, of, a, hill], [It, be, finish], [It, awaken, those, who, be, once, dead], [but, now, we, live, for, us], [now, we, live, a, new, life, in, freedom, not, be, enslave, by, the, power, of, any, jesus, blood], [have, make, us, free], [but, the, battle, still, remain], [We, contend, not, with, flesh, and, blood, but, with, spiritual, darkness, for, this, we, wage, war, but, not, ...
2    [[You, better, be, nice, you, better, think, twice, you, better, be, careful, You, better, be, nice, you, better, think, twice, you, better, be, careful, be, you, dream, I, can, see, it, in, -PRON-, eye], [something, tell, me, you, be, about, to, say, goodbye], [and, I, wonder, be, you, tired, of, -PRON-, touch], [once, you, hold, me, now, it, do, not, mean, as, much, You, do, not, love, me, like, you, use, to, love, me, baby, body, soul, and, mind], [hey, hey], [hey, before, you, go, 

In [9]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.models import word2vec


#This creates sentence vectors out of word2vec vectors and then creates doc vectors out of setnences
def document2vec(doc, vec_dim):
    sentence_vectors = []
    for sent in doc:
        #check if our sentence has any words in the google news vocab, if not return empty vector with zeros
        if any(word in sent for word in vocab):
            #For each word in our sentence return a word2vec vector to create a list of vectors
            #Then average the vectors together.
            vec = np.mean([model.word_vec(word) for word in sent if word in vocab], axis=0)
            sentence_vectors.append(vec)
        else:
            vec = np.zeros(vec_dim)
            sentence_vectors.append(vec)
    return np.mean(sentence_vectors, axis=0)

model = gensim.models.KeyedVectors.load_word2vec_format ('GoogleNews-vectors-negative300.bin', binary=True)

# List of words in model.
vocab = model.vocab.keys()

In [10]:
vectors = []
for doc in df.Lemmatized_sentences:
    vectors.append(document2vec(doc, 300))

#Create vector DF and concat to our original DF
vector_df = pd.DataFrame(vectors)
vector_df.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df = pd.concat([df, vector_df], axis=1)
df.head(2)

Unnamed: 0,song,year,artist,genre,lyrics,Lemmatized_sentences,0,1,2,3,...,290,291,292,293,294,295,296,297,298,299
0,flesh-and-blood,2007,deliverance,Metal,"Slaves to the power of sin Once in captivity to darkness; spiritually dead A voice cried from the top of a hill ""It is finished""! It awakened those who were once dead But now we live For us Now we live a new life in freedom Not being enslaved by the power of any Jesus' blood has made us free But the battle still remains We contend not with flesh and blood But with spiritual darkness For this we wage war But not against man This war is with darkness And not against flesh and blood The Holy Bo...","[[slave, to, the, power, of, sin, once, in, captivity, to, darkness, spiritually, dead, a, voice, cry, from, the, top, of, a, hill], [It, be, finish], [It, awaken, those, who, be, once, dead], [but, now, we, live, for, us], [now, we, live, a, new, life, in, freedom, not, be, enslave, by, the, power, of, any, jesus, blood], [have, make, us, free], [but, the, battle, still, remain], [We, contend, not, with, flesh, and, blood, but, with, spiritual, darkness, for, this, we, wage, war, but, not, ...",0.018069,0.056996,0.053011,0.08202,...,-0.078472,0.02486,-0.092086,0.00442,-0.039378,-0.002943,-0.012103,-0.049402,0.062549,-0.053977
1,be-careful,2007,dannii-minogue,Pop,"You better be nice, you better think twice, you better be careful You better be nice, you better think twice, you better be careful Are you dreaming, I can see it in your eyes Something tells me, you're about to say goodbye And I wonder, are you tired of my touch Once you held me now it doesn't mean as much You don't love me like you used to love me baby Body, soul and mind, hey hey hey Before you go, throw it all away, take some advice You better be nice, you better think twice, you better ...","[[You, better, be, nice, you, better, think, twice, you, better, be, careful, You, better, be, nice, you, better, think, twice, you, better, be, careful, be, you, dream, I, can, see, it, in, -PRON-, eye], [something, tell, me, you, be, about, to, say, goodbye], [and, I, wonder, be, you, tired, of, -PRON-, touch], [once, you, hold, me, now, it, do, not, mean, as, much, You, do, not, love, me, like, you, use, to, love, me, baby, body, soul, and, mind], [hey, hey], [hey, before, you, go, throw,...",-0.000483,0.010578,0.104912,0.130242,...,0.019077,0.165598,-0.149622,0.030454,-0.003322,-0.068253,0.024765,-0.087031,-0.027532,-0.014364


In [11]:
from sklearn.metrics import classification_report

def score_support(clf, X_train, y_train, X_test, y_test):
    print('train score:', clf.score(X_train, y_train))
    print('test score:', clf.score(X_test, y_test))

X = df.loc[:,~df.columns.isin(['song', 'year', 'artist', 'genre', 'lyrics', 'Lemmatized_sentences'])]
y = df.loc[:,'genre']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

In [12]:
from sklearn.svm import LinearSVC

svc = LinearSVC(C=1, random_state=42)
svc.fit(X_train, y_train)

score_support(svc, X_train, y_train, X_test, y_test)

train score: 0.5681135681135682
test score: 0.5393013100436681


In [13]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(max_depth=6, n_estimators=500, learning_rate=.05)
gbc.fit(X_train, y_train)

score_support(gbc, X_train, y_train, X_test, y_test)

train score: 0.9994539994539995
test score: 0.5584061135371179


In [14]:
from sklearn.neighbors import KNeighborsClassifier

knn= KNeighborsClassifier(n_neighbors=10, weights='distance')
knn.fit(X_train, y_train)

score_support(knn, X_train, y_train, X_test, y_test)

train score: 0.9994539994539995
test score: 0.4497816593886463


In [15]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=5)
lr.fit(X_train, y_train)

score_support(lr, X_train, y_train, X_test, y_test)

train score: 0.5638820638820639
test score: 0.5403930131004366
