In [114]:
# Import pandas for data handling
import pandas as pd

# NLTK is our Natural-Language-Took-Kit
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords = stopwords.words('english')

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Import text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold

#Import Regressor Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Import some ML helper function
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report


# Import our metrics to evaluate our model
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score


# Library for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse as sparse

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
df = pd.read_csv('../data/labeled_lyrics_w_genres.csv')

# Inspecting The Data

In [15]:
df.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,genre
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,R&B
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,Pop
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,R&B
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,R&B
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,R&B


In [16]:
df.isnull().sum().sum()

0

In [17]:
df.duplicated().sum()

0

In [18]:
df.shape

(58100, 6)

In [19]:
df.genre.value_counts()

No_genre     21069
Pop          20691
Rock          9783
Country       2503
Rap           2311
R&B           1687
Non-Music       56
Name: genre, dtype: int64

### removing No_genre and Non-Music

In [20]:
df_dropped = df[(df['genre'] == 'No_genre') | (df['genre'] == 'Non-Music')].index
df.drop(df_dropped, inplace=True, axis='index')

In [21]:
print(df.shape)
df.head(15)

(36975, 6)


Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,genre
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,R&B
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,Pop
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,R&B
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,R&B
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,R&B
5,5,Elijah Blake,I just want to ready your mind\r\n'Cause I'll ...,Uno,0.321,R&B
7,7,Elis,Dieses ist lange her.\r\nDa ich deine schmalen...,Abendlied,0.333,Pop
8,8,Elis,A child is born\r\nOut of the womb of a mother...,Child,0.506,Pop
9,9,Elis,Out of the darkness you came \r\nYou looked so...,Come to Me,0.179,Pop
10,10,Elis,Each night I lie in my bed \r\nAnd I think abo...,Do You Believe,0.209,Pop


In [22]:
df.genre.value_counts()

Pop        20691
Rock        9783
Country     2503
Rap         2311
R&B         1687
Name: genre, dtype: int64

---

# Data Cleaning (Text Pre Processing)

In [23]:
# 1. function that makes all text lowercase.
def make_lowercase(test_string):
    return test_string.lower()

# 2. function that removes all punctuation. 
def remove_punc(test_string):
    test_string = re.sub(r'[^\w\s]', '', test_string)
    return test_string

# 3. function that removes all stopwords.
def remove_stopwords(test_string):
    # Break the sentence down into a list of words
    words = word_tokenize(test_string)
    
    # Make a list to append valid words into
    valid_words = []
    
    # Loop through all the words
    for word in words:
        
        # Check if word is not in stopwords. Stopwords was imported from nltk.corpus
        if word not in stopwords:
            
            # If word not in stopwords, append to our valid_words
            valid_words.append(word)

    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string

# 4. function to break words into their stem words
def stem_words(a_string):
    # Initalize our Stemmer
    porter = PorterStemmer()
    
    # Break the sentence down into a list of words
    words = word_tokenize(a_string)
    
    # Make a list to append valid words into
    valid_words = []

    # Loop through all the words
    for word in words:
        # Stem the word
        stemmed_word = porter.stem(word) #from nltk.stem import PorterStemmer
        
        # Append stemmed word to our valid_words
        valid_words.append(stemmed_word)
        
    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string 

In [27]:
# Pipeline function 

def text_processing_pipeline(a_string):
    a_string = make_lowercase(a_string)
    a_string = remove_punc(a_string)
    #a_string = stem_words(a_string) #removing stem_words for now because making lyrics gibberish
    a_string = remove_stopwords(a_string)
    return a_string

In [28]:
# apply preprocessing pipeline 

df['seq_clean'] = df['seq'].apply(text_processing_pipeline)

In [30]:
df.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,genre,seq_clean
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,R&B,aint ever trapped bando oh lord dont get wrong...
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,Pop,drinks go smoke goes feel got let go cares get...
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,R&B,dont live planet earth found love venus thats ...
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,R&B,trippin grigio mobbin lights low trippin grigi...
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,R&B,see midnight panther gallant brave found found...


In [31]:
X = df['seq_clean'].values

y = df['label'].values

# Sampling smaller batches from dataframe for faster testing

In [34]:
#function to randomly sample n values from each genre for smaller random forest testing

def genre_sample(dataframe, k):
    #make an empty dataframe
    df_genre_sample = pd.DataFrame(columns = ['Unnamed: 0', 'artist', 'seq', 'song', 'label', 'genre', 'seq_clean'])
    
    genres = ['R&B', 'Pop', 'Rap', 'Rock', 'Country']
    for genre in genres:
         df_genre_sample = df_genre_sample.append((dataframe[dataframe["genre"]==genre].sample(n=k)))
    
    return df_genre_sample

In [99]:
# sampling from the dataframe, k is the # of samples from each genre

df_sampled = genre_sample(df, k=500)
print(df_sampled.shape)
df_sampled.head(10)

(2500, 7)


Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,genre,seq_clean
3227,3227,Jodeci,"Miggity, mix check one, two get ready uh huh\r...",Sweaty,0.404,R&B,miggity mix check one two get ready uh huh ima...
35207,52542,Prince,"Come here, baby \r\nCome and get your come on\...",One Kiss at a Time,0.173,R&B,come baby come get come oh see wanted one kiss...
50914,88062,Smokie Norful,You know I was looking at the conditon of the...,Same Sad Song,0.45,R&B,know looking conditon world thee day seems the...
43067,156368,Jaheim,Forever\nThe love I have for you will last\nFo...,Forever,0.203,R&B,forever love last forever loving always foreve...
55933,67442,Patti LaBelle,"We talk, it's always good\r\nSo much is alread...",Something More,0.334,R&B,talk always good much already understood spend...
4369,4369,The Brothers Johnson,Light up the night\r\nBurn out the dark\r\nLet...,Light Up the Night,0.926,R&B,light night burn dark lets set world fire ligh...
2515,2515,Little Walter,Blues with a feelin'\nThat's what I have today...,Blues With a Feeling,0.647,R&B,blues feelin thats today blues feelin thats to...
42244,91428,Justin Guarini,How much time has passed since you said goodby...,Condition of My Heart,0.394,R&B,much time passed since said goodbye got messag...
56397,90876,The Temptations,"No more water in the well, no more.\r\nNo more...",No More Water in the Well,0.916,R&B,water well water well take love granted like w...
8016,8016,Ray Charles,Am I blue? Am I blue?\r\nAin't these tears\r\n...,Am I Blue,0.316,R&B,blue blue aint tears eyes telling blue youll p...


In [100]:
#checking correct amounts of samples per genre were obtained

df_sampled.genre.value_counts()

Rap        500
Country    500
Pop        500
R&B        500
Rock       500
Name: genre, dtype: int64

In [96]:
# obtaining lyrics from one row for observation. 
# was seeing gibberish text with the use of the stemming function 

df_text_sample= df_sampled['seq_clean']
df_text_sample[47275]

'cmon cmon yeah cmon yeah uh ooooooooooooh uh oh oh oh uh ooooooooooooh uh oh oh oh uh ooooooooooooh uh oh oh oh uh ooooooooooooh partys let know dont trippin see us club show little love represent side like cause round slick pick hot one ride shotgun couple em got one belvedere rear club pulled dubs bout go buy bar sure aint playin hang lames hit park sayin ay party girls way bacardi models models talkin know cant forget thugs party girls party club party partys youre let hear say uh ooooooooooooh uh oh oh oh uh ooooooooooooh uh oh oh oh uh ooooooooooooh uh oh oh oh uh ooooooooooooh partys youre let know girls club best outfits showin skin tryna make nigga wan na spit girl friend need come back got locked white tshirt threepiece suit dont matter wear matters jiggy straight grindin club good time ay party girls way bacardi models models talkin know cant forget thugs party girls party club party partys youre let hear say uh ooooooooooooh uh oh oh oh uh ooooooooooooh uh oh oh oh uh ooooo

# Testing Regression Models for label prediction:
label = float scale (0-1) which signifies valence 



## Random Forest Regressor

to-do fine tuning:
- try word3vec instead of TF-IDF


### using the sampled dataset for faster testing

In [40]:
X_sampled = df_sampled['seq_clean'].values

y_sampled = df_sampled['label'].values

In [41]:
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sampled, y_sampled, 
                                                                             test_size=0.33, random_state=42)

In [42]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train_sample)

X_train_sample = vectorizer.transform(X_train_sample)
X_test_sample = vectorizer.transform(X_test_sample)

print(X_train_sample.shape, type(X_train_sample))

(1675, 18822) <class 'scipy.sparse.csr.csr_matrix'>


In [68]:
# function to find the best parameters for RandomForestRegressor

param_grid = {'n_estimators': [10, 50, 100, 500, 1000], 
              'bootstrap': [True, False],
             }

# there are more parameters to test but I was getting errors and need to investigate more

# param_grid = {'criterion': ['squared_error', 'absolute_error', 'poisson'],
#               'n_estimators': [10, 50, 100, 500, 1000], 
#               'max_depth': [2, 4, 8, 16, 32, 64], 
#               'min_samples_leaf': [1, 10, 25, 50],
#               'bootstrap': [True, False],
#               'min_samples_split': [0, 2, 4, 8, 16, 32]
#              }

In [69]:
print('Running Grid Search...')

# 1. Create a RandomForestRegressor model object without supplying arguments. 

rf_regressor = RandomForestRegressor()

# 2. Run a Grid Search with 3-fold cross-validation and assign the output to the object 'rf_grid'.
#    * Pass the model and the parameter grid to GridSearchCV()
#    * Set the number of folds to 3
#    * Specify the scoring method

rf_grid = GridSearchCV(estimator=rf_regressor, param_grid = param_grid, cv=3, scoring='r2')

# 3. Fit the model (use the 'grid' variable) on the training data and assign the fitted model to the 
#    variable 'rf_grid_search'

rf_grid_search = rf_grid.fit(X_train_sample, y_train_sample)


print('Done')

Running Grid Search...
Done


In [75]:
# finding best parameters for the Random Forest Regressor

best_score = rf_grid_search.best_score_
print("The best score is: ", best_score)

rf_best_params = rf_grid_search.best_params_
print("The best params is: ", rf_best_params)


The best score is:  0.05717528295973725
The best params is:  {'bootstrap': True, 'n_estimators': 1000}


In [76]:
#Optimal Hyperparameters for RandomForestRegressor based on GridSearchCV

rf_model = RandomForestRegressor(n_estimators=1000, bootstrap = True)

# 2. Fit the model to the training data below
rf_model.fit(X_train_sample, y_train_sample)

RandomForestRegressor(n_estimators=1000)

In [88]:
y_sample_pred = rf_model.predict(X_test_sample)

rf_mse = mean_squared_error(y_test_sample, y_sample_pred)
rf_r2 = r2_score(y_test_sample, y_sample_pred)

print('[RF] Mean Squared Error: {0}'.format(rf_mse))
print('[RF] R2: {0}'.format(rf_r2))

[RF] Mean Squared Error: 0.05222244210798196
[RF] R2: 0.10795588207769391


In [91]:
# Function to test the predictions of the model with NEW unseen text (not part of testing set)

def rgrg_string_test(lyrics):
    new_lyrics = text_processing_pipeline(lyrics)
    print("the processed lyrics are: ", new_lyrics)
    
    new_text_vectorized = vectorizer.transform([new_lyrics])
    
    value = rf_model.predict(new_text_vectorized)
    print("Random Forest Regressor model gives a value of: ", value)
    if(value < .50):
        print("which is negative")
    else: 
        print("which is positive")

In [95]:
test_text1 = "Hit me baby one more time my lonliness is killing me and I must confess I still believe"
test_text2 = "Oh, baby, when you talk like that You make a woman go mad So be wise and keep on Reading the signs of my body"
test_text3 = "looking out on the pouring rain I used to feel so uninspired"
test_text4 = "Girl put your record on tell me your favorite song just go ahead let your hair down"

rgrg_string_test(test_text1)
print('\n')
rgrg_string_test(test_text2)
print('\n')
rgrg_string_test(test_text3)
print('\n')
rgrg_string_test(test_text4)

the processed lyrics are:  hit baby one time lonliness killing must confess still believe
Random Forest Regressor model gives a value of:  [0.35903648]
which is negative


the processed lyrics are:  oh baby talk like make woman go mad wise keep reading signs body
Random Forest Regressor model gives a value of:  [0.52304362]
which is positive


the processed lyrics are:  looking pouring rain used feel uninspired
Random Forest Regressor model gives a value of:  [0.39058667]
which is negative


the processed lyrics are:  girl put record tell favorite song go ahead let hair
Random Forest Regressor model gives a value of:  [0.52279909]
which is positive


# Running Larger RF Test on 1687 samples from each Genre

- to-do: break this testing out into a function instead of repeating code 

In [101]:
# sampling from the dataframe, k is 1687 which is the max number of samples from R&B the smallest Genre pool 

df_sampled2 = genre_sample(df, k=1687)
print(df_sampled2.shape)
df_sampled2.head(10)

(8435, 7)


Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,genre,seq_clean
51095,17518,Donna Summer,Here I am on my own again\nThe days rush by\nT...,On My Honor,0.156,R&B,days rush nights seems slow guess ive let way ...
43146,156269,Jagged Edge,"[JD (JE)]\r\n(Girl I got it)\r\nShake, shake i...",I Got It,0.881,R&B,jd je girl got shake shake baby shake shake sh...
9224,144417,Freddie Hubbard,Skylark\r\nHave you anything to say to me?\r\n...,Skylark,0.103,R&B,skylark anything say wont tell love meadow mis...
38011,42985,T-Pain,I-I-I'm back!\nFreaknik's back baby!\n\nWhat's...,Freaknik Is Back,0.319,R&B,iiim back freakniks back baby whats happen mo ...
20314,134197,Sky,Cause I just\r\nCause I just\r\nCould you be h...,Push,0.269,R&B,cause cause could holding something maybe im y...
31346,156280,Jagged Edge,I see you sitting there\r\nLooking like you gl...,Dance Floor,0.737,R&B,see sitting looking like glued chair party goi...
49569,70519,Ray J,Is a precious lil girl n such a pretty lil gir...,Sex in the Rain,0.614,R&B,precious lil girl n pretty lil girl shes grown...
9933,35772,Otis Redding,I want to thank you for being so nice now \r\n...,I Want to Thank You,0.961,R&B,want thank nice want thank giving pride sweet ...
12183,120461,Tory Lanez,"Staring, looking at you from a long way\r\nPas...",High,0.387,R&B,staring looking long way passing ceilings keep...
32704,26366,Leonard Cohen,When it all went down\r\nAnd the pain came thr...,There for You,0.702,R&B,went pain came get dont ask know true get make...


In [102]:
#checking correct amounts of samples per genre were obtained

df_sampled2.genre.value_counts()

Rap        1687
R&B        1687
Country    1687
Pop        1687
Rock       1687
Name: genre, dtype: int64

In [103]:
X_sampled2 = df_sampled2['seq_clean'].values

y_sampled2 = df_sampled2['label'].values

In [104]:
X_train_sample2, X_test_sample2, y_train_sample2, y_test_sample2 = train_test_split(X_sampled2, y_sampled2, 
                                                                             test_size=0.33, random_state=42)

In [105]:
vectorizer2 = TfidfVectorizer()
vectorizer2.fit(X_train_sample2)

X_train_sample2 = vectorizer2.transform(X_train_sample2)
X_test_sample2 = vectorizer2.transform(X_test_sample2)

print(X_train_sample2.shape, type(X_train_sample2))

(5651, 35571) <class 'scipy.sparse.csr.csr_matrix'>


In [106]:
#Optimal Hyperparameters for RandomForestRegressor based on GridSearchCV

rf_model2 = RandomForestRegressor(n_estimators=1000, bootstrap = True)

# 2. Fit the model to the training data below
rf_model2.fit(X_train_sample2, y_train_sample2)

RandomForestRegressor(n_estimators=1000)

In [112]:
y_sample_pred2 = rf_model2.predict(X_test_sample2)
y_sample_pred2

rf_mse2 = mean_squared_error(y_test_sample2, y_sample_pred2)
rf_r2_2 = r2_score(y_test_sample2, y_sample_pred2)

print('[RF] Mean Squared Error: {0}'.format(rf_mse2))
print('[RF] R2: {0}'.format(rf_r2_2))

[RF] Mean Squared Error: 0.05101035414841959
[RF] R2: 0.1205852034338123


# conclusion on tesing Random Forest Regressors

Not much gained or lost from increase in sample size. 
R2 error increased by .02 which is not too significant 

on sample size 500/genre:
- [RF] Mean Squared Error: 0.05222244210798196
- [RF] R2: 0.10795588207769391

on sample size 1687/genre:
- [RF] Mean Squared Error: 0.05101035414841959
- [RF] R2: 0.1205852034338123

# To-Do: Run RF Test on Full Data Set 

In [32]:
# Split our data into testing and training like always. 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)


# Save the raw text for later just incase
X_train_text = X_train
X_test_text = X_test

In [33]:
# Initialize our vectorizer
vectorizer = TfidfVectorizer()

# 3. Fit your vectorizer using your X data
# This makes your vocab matrix
vectorizer.fit(X_train)

# 4. Transform your X data using your fitted vectorizer. 
# This transforms your documents into vectors.
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

print(X_train.shape, type(X))
print(type(X_train))

(24773, 66588) <class 'numpy.ndarray'>
<class 'scipy.sparse.csr.csr_matrix'>


In [None]:
# ... to do overnight 

In [None]:
# ...

# ... next to try Gradient Boosting Regressor
Gradient boosting is a technique for repeatedly adding decision trees so that the next decision tree corrects the previous decision tree error.

In [115]:
# sampling from the dataframe, k is the # of samples from each genre

df_sampled3 = genre_sample(df, k=500)
print(df_sampled3.shape)
df_sampled3.head(10)


(2500, 7)


Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,genre,seq_clean
54843,13166,Prince,"Open your heart, open your mind\r\nA train is ...",Around the World in a Day,0.594,R&B,open heart open mind train leaving day wonderf...
18050,35787,Otis Redding,"I know you told me, long time ago\r\nThat you ...",A Fool for You,0.511,R&B,know told long time ago didnt want yeah didnt ...
22612,112277,New Edition,In our world all things must change\r\nJust as...,Introduction,0.0783,R&B,world things must change new edition rearrange...
41476,46953,Faith Evans,"Oh woh hoh, oh woh\r\nDoo doo doo doo, doo doo...",Maybe,0.605,R&B,oh woh hoh oh woh doo doo doo doo doo doo doo ...
55742,114539,Kem,Yesterday I missed you sad\nAnd today I miss y...,Miss You,0.343,R&B,yesterday missed sad today miss bad baby miss ...
39475,113690,K-Ci & JoJo,"Let's make love tonight, I'm in the mood to ma...",Honest Lover,0.315,R&B,lets make love tonight im mood make love repea...
50388,84260,Tina Turner,"When I was younger, so much younger than today...",Help,0.383,R&B,younger much younger today yeah never needed a...
38240,50803,Carla Thomas,I look at the mountain\r\nI look at the sun\r\...,A Love of My Own,0.435,R&B,look mountain look sun look everything mother ...
8296,8296,Brian McKnight,"Tell, tell me it isn't so\r\nYou say you have ...",Kiss Your Love Goodbye,0.144,R&B,tell tell isnt say reasons leaving go watch tu...
50828,99900,Raheem DeVaughn,"Bliss, magnificent, orgasmic\r\nLove can be al...",Complicated,0.475,R&B,bliss magnificent orgasmic love things say mak...


In [116]:
#checking correct amounts of samples per genre were obtained
df_sampled3.genre.value_counts()

Rap        500
Country    500
Pop        500
R&B        500
Rock       500
Name: genre, dtype: int64

In [118]:
# obtaining lyrics from one row for observation.
# was seeing gibberish text with the use of the stemming function

df_text_sample3 = df_sampled3['seq_clean']
df_text_sample3[8296]

'tell tell isnt say reasons leaving go watch turn walk away im searching find right words say hoping praying youll stay chorus ill kiss love goodbye guess wasnt meant gave youve tried see love caught eye went blind makes cry thing kiss love goodbye look deep inside find went wrong girl cause youll still gone oh ive tried though gave best determined leave theres nothing guess none dreams wont come true chorus wont ask wont ask give back pride cause time mend heart youve broken baby ill let go though love ill find way kiss love goodbye chorus guess wasnt meant gave youve tried see love caught eye went blind makes cry thing kiss love kiss love kiss love goodbye goodbye'

In [119]:
X_sampled3 = df_sampled3['seq_clean'].values
y_sampled3 = df_sampled3['label'].values

In [120]:
X_train_sample3, X_test_sample3, y_train_sample3, y_test_sample3 = train_test_split(X_sampled3, y_sampled3, 
                                                                                    test_size=0.33, random_state=42)

In [121]:
vectorizer3 = TfidfVectorizer()
vectorizer3.fit(X_train_sample3)

X_train_sample3 = vectorizer3.transform(X_train_sample3)
X_test_sample3 = vectorizer3.transform(X_test_sample3)

print(X_train_sample3.shape, type(X_train_sample3))

(1675, 19015) <class 'scipy.sparse.csr.csr_matrix'>


In [122]:
gb = GradientBoostingRegressor()

In [123]:
gb.fit(X_train_sample3, y_train_sample3)

GradientBoostingRegressor()

In [125]:
y_pred_sample3 = gb.predict(X_test_sample3)

In [126]:
gb_mse = mean_squared_error(y_test_sample3, y_pred_sample3)
gb_r2 = r2_score(y_test_sample3, y_pred_sample3)

print('For a sample size of 500 and NO hyperparameter tuning:')
print('[GB] Mean Squared Error: {0}'.format(gb_mse))
print('[GB] R2: {0}'.format(gb_r2))
print("\n Gradient Boosting Regressor produces same MSE as Random Forest but r2 has improved by .05")

For a sample size of 500 and NO hyperparameter tuning:
[GB] Mean Squared Error: 0.05567842687302583
[GB] R2: 0.1025863627581386

 Gradient Boosting Regressor produces same MSE as Random Forest but r2 has improved by .05


### Hyperparameter Tuning of Gradiet Boosting Regressor with GridSearchCV
- need to run overnight

In [124]:
gb_param_grid= {'n_estimators': [100, 1000, 1500],
                'learning_rate' : [0.1, 0.3, 0.5],
                'max_depth': [3, 8, 16, 32]
                }

In [None]:
print("Running Grid Search ... ")

gb_regressor = GradientBoostingRegressor()

gb_grid = GridSearchCV(estimator = gb_regressor, param_grid= gb_param_grid, cv=3, scoring= 'r2')

print("Running the fit..")

gb_grid_search = gb_grid.fit(X_train_sample3, y_train_sample3)

print("Done.")

best_score3 = gb_grid_search.best_score_
print("The best score is: ", best_score3)

gb_best_params = gb_grid_search.best_params_
print("The best parameters are: ", gb_best_params)

In [None]:
#end of regressor testing and end of new material 

In [None]:
#ignore everything below 

# Testing Classification Models for genre prediction:



In [None]:
X = df['seq_clean'].values
y = df['genre'].values

# Split our data into testing and training like always. 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


# Save the raw text for later just incase
X_train_text = X_train
X_test_text = X_test

# Initialize our vectorizer
vectorizer = TfidfVectorizer()

# 3. Fit your vectorizer using your X data
# This makes your vocab matrix
vectorizer.fit(X_train)

# 4. Transform your X data using your fitted vectorizer. 
# This transforms your documents into vectors.
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

# 5. Print the shape of your X.  53841 features (aka columns)
print(X_train.shape, type(X))

### Multinomial Naive Bayes

In [None]:
# Initalize our model.
mnb = MultinomialNB(alpha=.05)

# Fit our model with our training data.
mnb.fit(X_train, y_train)

# Make new predictions of our testing data. 
y_pred = mnb.predict(X_test)

# Make predicted probabilites of our testing data
y_pred_proba = mnb.predict_proba(X_test)

# Evaluate our model
accuracy =  mnb.score(X_test, y_test)

# Print our evaluation metrics
print("Model Accuracy: %f" % accuracy)

In [None]:
print(classification_report(y_test, y_pred, target_names=mnb.classes_))

In [None]:
# Plot the confusion matrix of our results
fig, ax = plt.subplots(figsize=(8, 8))

disp = plot_confusion_matrix(mnb, X_test, y_test,
                             display_labels=mnb.classes_,
                             cmap=plt.cm.Blues, ax=ax)
plt.xticks(rotation=90)
disp

### Random Forest Classifier

In [None]:
rf_model = RandomForestClassifier()

# Fit our model with our training data.
rf_model.fit(X_train, y_train)


# Make new predictions of our testing data. 
y_pred = rf_model.predict(X_test)


# Make predicted probabilites of our testing data
y_pred_proba = rf_model.predict_proba(X_test)

# Evaluate our model
accuracy =  rf_model.score(X_test, y_test)

# Print our evaluation metrics
print("Model Accuracy: %f" % accuracy)

print(classification_report(y_test, y_pred, target_names=rf_model.classes_))

In [None]:
# Plot the confusion matrix of our results
fig, ax = plt.subplots(figsize=(8, 8))

disp = plot_confusion_matrix(rf_model, X_test, y_test,
                             display_labels=rf_model.classes_,
                             cmap=plt.cm.Blues, ax=ax)
plt.xticks(rotation=90)
disp

# Observations:
#### RF & MNB most confused:
- Pop with Rock (~1500 to 1800)  
- Pop with Country (~500)

In [None]:
# Models to test: RandomForest, DT, MultinomialNB, KNN


### KNN 

In [None]:
def train_test_knn(X_train, X_test, y_train, y_test, k):
    '''
    Fit a k Nearest Neighbors classifier to the training data X_train, y_train.
    Return the accuracy of resulting predictions on the test data.
    '''
    
    # 1. Create the  KNeighborsClassifier model object below and assign to variable 'model'
    model = KNeighborsClassifier(n_neighbors = k)

    # 2. Fit the model to the training data below
    model.fit(X_train, y_train)
    
    # 3. Make predictions on the test data below and assign the result to the variable 'class_label_predictions'
    class_label_predictions = model.predict(X_test)

    # 4. Compute the accuracy here and save the result to the variable 'acc_score'
    acc_score = accuracy_score(y_test, class_label_predictions)
    
    
    return acc_score

In [None]:
k_values = [10, 100, 1000, 10000]

acc1 = [] 

for k in k_values:
    score = train_test_knn(X_train, X_test, y_train, y_test, k)
    print('k=' + str(k) + ', accuracy score: ' + str(score))
    acc1.append(float(score))
  

In [None]:
knn = KNeighborsClassifier(n_neighbors = 100)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, target_names=knn.classes_))

In [None]:
# Trying to improve knn with KFold Cross Validation

num_folds = 5
folds = KFold(n_splits = num_folds, random_state=None)

acc_scores = []

for train_row_index, test_row_index in folds.split(X_train): 
    
    # our new partition of X_train and X_val
    X_train_new = X_train.iloc[train_row_index] # cannot use iloc because X_train is matrix not pandas object
    X_val = X_train.iloc[test_row_index]
    
    # our new partition of y_train and y_val
    y_train_new = y_train.iloc[train_row_index]
    y_val = y_train.iloc[test_row_index]
    
    knn_model.fit(X_train_new, y_train_new)
    predictions = knn_model.predict(X_val)
     
    iteration_accuracy = accuracy_score(predictions , y_val)
    acc_scores.append(iteration_accuracy)
     
        
for i in range(len(acc_scores)):
    print('Accuracy score for iteration {0}: {1}'.format(i+1, acc_scores[i]))

avg_scores = sum(acc_scores)/num_folds
print('\nAverage accuracy score: {}'.format(avg_scores))

In [None]:
type(X_train)

# To-Do research improving KNN
- cannot use iloc in KFold splitting because X_train is matrix not pandas object
- Read this:
    - https://towardsdatascience.com/text-classification-using-k-nearest-neighbors-46fa8a77acc5

# Summary So Far for Genre Classification:
- Multinomial Naive Bayes
    - Model Accuracy: 0.603110
- Random Forest Classifier
    - Model Accuracy: 0.596078
- Knn
    - where k=1000, accuracy score: 0.5843137254901961