In [1]:
import pandas as pd
import numpy as np


import sqlalchemy
from sqlalchemy import create_engine, inspect, Column, Integer, Float, String, Sequence, Boolean, text
from sqlalchemy.orm import Session
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.ext.declarative import declarative_base

In [2]:
#Using declarative base
Base = declarative_base()
engine = create_engine('sqlite:///lyrics3.db', echo = False)

In [3]:
#Creating the table metadata
Base.metadata.create_all(engine)
session = Session(engine)

In [4]:
#Binding engine to the metadata
meta = sqlalchemy.MetaData()
meta.reflect(bind = engine)

In [5]:
lyrics = pd.read_sql_table('lyrics', engine, index_col="id", coerce_float=False)
tracks = pd.read_sql_table('tracks', engine, index_col="id", coerce_float=False)
genre = pd.read_sql_table('genres', engine, index_col="id", coerce_float=False)

In [6]:
lyrics = lyrics[lyrics['lyrics_language']=='en']

In [7]:
genre['album_id']=genre['album_id'].astype(int)
genre.head()

Unnamed: 0_level_0,album_id,album_name,artist,genre
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,10266031,Big Willie Style,Will Smith,Rap
2,10266041,War,U2,Pop/Rock
3,10266127,Led Zeppelin,Led Zeppelin,Pop/Rock
4,10266180,Power Windows,Rush,Pop/Rock
5,10266202,Willennium,Will Smith,Rap


In [8]:
lyrics_tracks = pd.merge(tracks, lyrics, on="lyrics_id")
lyrics_tracks.head()

Unnamed: 0,album_id,artist_id,artist_name,explicit_x,has_lyrics,lyrics_id,track_id,track_name,explicit_y,lyrics_body,lyrics_language,lyrics_language_description
0,29219288,33491453,Drake,1,1,18083442,152383631,In My Feelings,1,"Trap, TrapMoneyBenny\nThis shit got me in my f...",en,English
1,29504205,36425860,"DJ Khaled feat. Justin Bieber, Quavo & Chance ...",1,1,18105233,154318893,No Brainer,1,We the Best Music!\nAnother one!\nDJ Khaled!\n...,en,English
2,29005911,36070710,Maroon 5 feat. Cardi B,1,1,18066274,150959640,Girls Like You,1,"Spent twenty-four hours, I need more hours wit...",en,English
3,29211432,36070710,Maroon 5 feat. Cardi B,1,1,18066274,152271189,Girls Like You,1,"Spent twenty-four hours, I need more hours wit...",en,English
4,28578797,35716462,Cardi B feat. Bad Bunny & J Balvin,1,1,17942796,148363403,I Like It,1,"Yeah baby, I like it like that\nYou gotta beli...",en,English


In [9]:
data = pd.merge(lyrics_tracks, genre, on="album_id")

In [10]:
data = data.sample(2000)

In [10]:
lyrics_genre = data[['artist_name','album_name','track_name','lyrics_body','genre']]

In [11]:
lyrics_genre['combined_text']= lyrics_genre['artist_name']+ " " + lyrics_genre['album_name']+" " + lyrics_genre['track_name']+" " + lyrics_genre['lyrics_body']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
lyrics_genre.head()

Unnamed: 0,artist_name,album_name,track_name,lyrics_body,genre,combined_text
0,Drake,Scorpion,In My Feelings,"Trap, TrapMoneyBenny\nThis shit got me in my f...",Rap,"Drake Scorpion In My Feelings Trap, TrapMoneyB..."
1,Drake,Scorpion,Nice For What,I wanna know who mothafuckin' representin' in ...,Rap,Drake Scorpion Nice For What I wanna know who ...
2,Drake,Scorpion,God's Plan,"Yeah, they wishin' and wishin' and wishin' and...",Rap,"Drake Scorpion God's Plan Yeah, they wishin' a..."
3,Drake,Scorpion,Nonstop,"Tay Keith, fuck these niggas up!\n\nLook, I ju...",Rap,"Drake Scorpion Nonstop Tay Keith, fuck these n..."
4,Drake,Scorpion,Survival,I been waitin' on this\nYeah\n\nAll of this di...,Rap,Drake Scorpion Survival I been waitin' on this...


In [13]:
lyrics_genre.drop(labels=['artist_name','album_name','track_name','lyrics_body'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
lyrics_genre.head()

Unnamed: 0,genre,combined_text
0,Rap,"Drake Scorpion In My Feelings Trap, TrapMoneyB..."
1,Rap,Drake Scorpion Nice For What I wanna know who ...
2,Rap,"Drake Scorpion God's Plan Yeah, they wishin' a..."
3,Rap,"Drake Scorpion Nonstop Tay Keith, fuck these n..."
4,Rap,Drake Scorpion Survival I been waitin' on this...


In [15]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [16]:
lyrics_genre.dropna(inplace=True)
lyrics_genre['is_english'] = lyrics_genre['combined_text'].apply(lambda x: isEnglish(x))
lyrics_genre.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,genre,combined_text,is_english
0,Rap,"Drake Scorpion In My Feelings Trap, TrapMoneyB...",True
1,Rap,Drake Scorpion Nice For What I wanna know who ...,True
2,Rap,"Drake Scorpion God's Plan Yeah, they wishin' a...",True
3,Rap,"Drake Scorpion Nonstop Tay Keith, fuck these n...",True
4,Rap,Drake Scorpion Survival I been waitin' on this...,False


In [17]:
lyrics_genre = lyrics_genre[lyrics_genre['is_english']==True]

In [18]:
#lyrics_genre['is_english'].value_counts()
lyrics_genre.drop('is_english',axis=1, inplace=True)
lyrics_genre.head()

Unnamed: 0,genre,combined_text
0,Rap,"Drake Scorpion In My Feelings Trap, TrapMoneyB..."
1,Rap,Drake Scorpion Nice For What I wanna know who ...
2,Rap,"Drake Scorpion God's Plan Yeah, they wishin' a..."
3,Rap,"Drake Scorpion Nonstop Tay Keith, fuck these n..."
5,Rap,"Drake Scorpion Elevate Ayy, elevate, elevate\n..."


In [19]:
#dependencies
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer, PorterStemmer
import nltk
import string
import re
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# defining the stopwords
stopword = stopwords.words('english')

#instantiating lemmatization and stemming objects
wn = WordNetLemmatizer()
ps = PorterStemmer()

In [21]:
def clean_text(text):
    '''
    Function accepts a text input and does three things:
    1. Removes punctuation
    2. Splits into tokens
    3. Removes tokens that are stopwords, conducts stemming, and joins together into a single string
    '''
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = " ".join([ps.stem(word) for word in tokens if word not in stopword])
    return text

In [22]:
#cleans text using clean_text() function
lyrics_genre['body_text_clean'] = lyrics_genre['combined_text'].apply(lambda x: clean_text(x))

In [24]:
lyrics_genre.drop("combined_text", axis=1)
#lyrics_genre = pd.get_dummies(lyrics_genre[['genre']])
lyrics_genre.to_csv("genre_NB_wgenre.csv", encoding='utf-8')

In [24]:
ngram_vect = CountVectorizer(ngram_range=(1,2))

In [25]:
X_counts = ngram_vect.fit_transform(lyrics_genre['body_text_clean'])

In [26]:
print(X_counts.shape)

(1798, 67512)


In [27]:
X_counts_df = pd.DataFrame(X_counts.toarray())
X_counts_df.columns = ngram_vect.get_feature_names()

In [28]:
X_counts_df.head()

Unnamed: 0,007,007 im,03,03 stand,0f,0f lyric,10,10 buy,10 command,10 oclock,...,zone panic,zone peopl,zone pleasant,zone that,zoo,zoo station,zoom,zoom impend,zs,zs rock
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
y = lyrics_genre[['genre']]

y_genres = pd.get_dummies(y)
#y_genres = y_genres.values.reshape(-1,1)
print(y_genres)

      genre_Country  genre_Other  genre_Pop/Rock  genre_Rap
5463              0            0               1          0
3637              0            0               1          0
106               0            0               1          0
1058              1            0               0          0
7419              0            1               0          0
2746              0            0               1          0
3569              0            0               1          0
6886              0            1               0          0
3137              0            1               0          0
5837              0            0               1          0
1274              0            0               1          0
5988              0            0               1          0
653               0            0               1          0
7324              0            0               1          0
7170              1            0               0          0
4157              0            0        

In [42]:
#DO NOT RUN, X_counts is almost 200 mb
# y_genres.to_csv('y_vectorized_genres.csv', encoding='utf-8')
# X_counts_df.to_csv('X_counts_genres.csv', encoding='utf-8')

In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_counts_df, y_genres, stratify=y)

In [44]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=50)

In [45]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [47]:
rf.score(X_test, y_test)

0.76000000000000001

In [53]:
import pickle

In [54]:
pickle.dump(rf, open('rf_genre_model','wb'))