### Import all necessary libraries

In [None]:
#!pip install seaborn
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
import sklearn
import nltk.collocations 
from nltk import FreqDist, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import string, re
import urllib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
#nltk.download('punkt')
#nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier


### Read CSV Data

In [None]:
song_list = pd.read_csv('380lyrics.csv')
charts = pd.read_csv('chart.csv')

### Remove Rows with Other and Not Available Genre

In [None]:
song_list.genre.value_counts()

In [None]:
# Remove the genres we do not model and the two excluded artists in one pass.
excluded_genres = ["Other", "Not Available", "Indie", "Folk"]
excluded_artists = ["dolcenera", "brthhse-onkelz"]
rows_to_drop = song_list.genre.isin(excluded_genres) | song_list.artist.isin(excluded_artists)
song_list.drop(song_list[rows_to_drop].index, inplace = True)
song_list.shape

In [None]:
song_list.tail()

### Drop rows with NAN values for column song

In [None]:
song_list.dropna(inplace = True)
song_list.song.isna().sum()
song_list.reset_index(inplace = True)

In [None]:
song_list.tail()

### Clean song titles: remove dots, replace dashes with spaces, and capitalize each word

In [None]:
def clean_song_names(titles_list):
    """Return display-ready versions of the given song titles.

    Every title has its dots removed and its dashes turned into spaces, and
    is then passed through ``string.capwords``. A new list is returned, in
    the same order as ``titles_list``.
    """
    tidy_titles = []
    for raw_title in titles_list:
        spaced = raw_title.replace(".", "").replace("-", " ")
        tidy_titles.append(string.capwords(spaced))
    return tidy_titles

titles_list = song_list.song
final_titles = clean_song_names(titles_list)
len(final_titles)

### Append Clean Song names to DF as a new column and drop old column

In [None]:
new_df = song_list
# new_df.head()
new_titles = final_titles


new_lyr = pd.DataFrame(new_titles)
new_lyr.tail()

final_df = new_df.join(new_lyr)

final_df.drop(columns = ['song', 'level_0', "index"], axis = 1, inplace = True)


In [None]:
final_df.shape

In [None]:
final_df.rename(columns = {0: "song"}, inplace = True)
final_df.isna().sum()

In [None]:
final_df.shape

### Drop Song titles with Nan Values

In [None]:
final_df.genre.value_counts()

In [None]:
# Cap every genre at 2000 songs so the classes are balanced.
# Note: the slice keeps the FIRST 2000 rows of each genre in their current
# order; it is not a random sample.
rock_df = final_df[final_df.genre == "Rock"][:2000]
pop_df = final_df[final_df.genre == "Pop"][:2000]
hip_df = final_df[final_df.genre == "Hip-Hop"][:2000]
metal_df = final_df[final_df.genre == "Metal"][:2000]
jazz_df = final_df[final_df.genre == "Jazz"][:2000]
elec_df = final_df[final_df.genre == "Electronic"][:2000]
country_df = final_df[final_df.genre == "Country"][:2000]
rnb_df = final_df[final_df.genre == "R&B"][:2000]


In [None]:
rock_df.head()

In [None]:
# Remove every row of the seven genres listed below from final_df in a single
# pass; R&B rows are deliberately left in place here.
sampled_genres = ["Rock", "Pop", "Hip-Hop", "Metal", "Jazz", "Electronic", "Country"]
final_df.drop(final_df[final_df.genre.isin(sampled_genres)].index, inplace = True)


In [None]:
# Stack what is left of final_df with the capped per-genre subsets.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same rows in the same order.
maybe_df = pd.concat([final_df, rock_df, pop_df, hip_df, metal_df, jazz_df, elec_df, country_df])
maybe_df.genre.value_counts()

In [None]:
# Replace the full set of R&B rows with the capped subset held in rnb_df.
maybe_df.drop(maybe_df[maybe_df.genre == "R&B"].index, inplace = True)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
maybe_df = pd.concat([maybe_df, rnb_df])

In [None]:
maybe_df.genre.value_counts()

In [None]:
maybe_df.reset_index(inplace = True)

In [None]:
maybe_df.drop(columns = ["index"], axis = 1, inplace = True)

In [None]:
maybe_df.head()

In [None]:
# Raw lyric strings, in the same row order as maybe_df.
lyrics_list = list(maybe_df.lyrics)

# Token pattern: a run of ASCII letters, optionally followed by an apostrophe
# and lowercase letters (e.g. "don't"). The cleaning functions below define
# their own local copy of this same pattern.
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"

# Shared normalizers used by clean_docs_lemma / clean_docs_stemma.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
# NOTE(review): `english` is not referenced anywhere else in the visible
# notebook - confirm whether it is still needed.
english = list(set(nltk.corpus.words.words()))

def clean_docs_lemma(lyrics_list):
    """Tokenize, lowercase, stop-word-filter and lemmatize each lyric.

    Parameters
    ----------
    lyrics_list : iterable of str
        Raw lyric texts.

    Returns
    -------
    list of str
        One cleaned document per input lyric, in the same order; the
        surviving lemmas are joined with single spaces.
    """
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    # The stop-word collection is the same for every lyric, so build it once
    # (instead of once per lyric) and keep it in a set for O(1) membership tests.
    stopwords_set = set(stopwords.words('english'))
    stopwords_set.update(string.punctuation)
    stopwords_set.update(["''", '""', '...', '``'])

    cleaned = []
    for lyric in lyrics_list:
        tokens = nltk.regexp_tokenize(lyric, pattern)
        kept = [w for w in (t.lower() for t in tokens) if w not in stopwords_set]
        # `lemmatizer` is the module-level WordNetLemmatizer instance.
        cleaned.append(" ".join(lemmatizer.lemmatize(word) for word in kept))
    return cleaned

def clean_docs_stemma(lyrics_list):
    """Tokenize, lowercase, stop-word-filter and stem each lyric.

    Parameters
    ----------
    lyrics_list : iterable of str
        Raw lyric texts.

    Returns
    -------
    list of str
        One cleaned document per input lyric, in the same order; the
        surviving stems are joined with single spaces.
    """
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    # The stop-word collection is the same for every lyric, so build it once
    # (instead of once per lyric) and keep it in a set for O(1) membership tests.
    stopwords_set = set(stopwords.words('english'))
    stopwords_set.update(string.punctuation)
    stopwords_set.update(["''", '""', '...', '``'])

    cleaned = []
    for lyric in lyrics_list:
        tokens = nltk.regexp_tokenize(lyric, pattern)
        kept = [w for w in (t.lower() for t in tokens) if w not in stopwords_set]
        # `stemmer` is the module-level SnowballStemmer("english") instance.
        cleaned.append(" ".join(stemmer.stem(word) for word in kept))
    return cleaned

### The first thing we wanted to do was test whether lemmatizing works better than stemming

### Lemmed Lyrics Test

In [None]:
lemmed_lyrics = clean_docs_lemma(lyrics_list)

In [None]:
len(set(nltk.word_tokenize(" ".join(lemmed_lyrics))))

In [None]:
X = lemmed_lyrics
y = maybe_df.genre

len(X)

In [28]:
### Split Data in 3 pieces
from sklearn.model_selection import train_test_split  
X1, X2, y1, y2 = train_test_split(X, y, test_size=0.5, random_state=18)
        
len(y1), len(y2)

(8000, 8000)

In [29]:
from sklearn.model_selection import train_test_split  
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=18) 

### Round 1 - Train 3 weakest models

In [30]:
# Train Weakest Models
# Each pipeline turns the cleaned lyrics into TF-IDF features
# (CountVectorizer -> TfidfTransformer) and feeds them to one classifier.

pipe_RF = Pipeline([('count_vectorizer', CountVectorizer()),
                     ('tfidf_vectorizer', TfidfTransformer()),
                     ('clf', RandomForestClassifier())
                    ])

pipe_ADA = Pipeline([('count_vectorizer', CountVectorizer()),
                     ('tfidf_vectorizer', TfidfTransformer()),
                     ('clf', AdaBoostClassifier(learning_rate=0.3))
                    ])


pipe_KNN = Pipeline([('count_vectorizer', CountVectorizer()),
                     ('tfidf_vectorizer', TfidfTransformer()),
                     ('clf', KNeighborsClassifier())
                    ])

# List of pipelines, List of pipeline names
pipelines = [pipe_RF, pipe_ADA, pipe_KNN]
pipeline_names = ['Random Forest', 'ADA', "KNN"]

# Loop to fit each of the three pipelines
for pipe in pipelines:
    print(pipe)
    pipe.fit(X1_train, y1_train)

# Compare accuracies. Each tuple is
# (name, test accuracy, class probabilities on X1_train, predictions on X1_train).
X1_scores = []
for index, val in enumerate(pipelines):
    # Score once and reuse the value: scoring re-vectorizes and re-predicts the
    # whole test split, so calling it twice doubles the cost.
    test_accuracy = val.score(X1_test, y1_test)
    tup = (pipeline_names[index], test_accuracy, val.predict_proba(X1_train), val.predict(X1_train))
    X1_scores.append(tup)
    print('%s pipeline test accuracy: %.3f' % (pipeline_names[index], test_accuracy))

Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
 ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])




Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
 ...m='SAMME.R', base_estimator=None,
          learning_rate=0.3, n_estimators=50, random_state=None))])
Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
 ...ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])
Random Forest pipeline test accuracy: 0.354
ADA pipeline test accuracy: 0.348
KNN pipeline test accuracy: 0.151


In [32]:
X1_scores[0]

('Random Forest', 0.354375, array([[0. , 0.1, 0. , ..., 0.4, 0.3, 0.1],
        [0. , 0. , 0.8, ..., 0.2, 0. , 0. ],
        [0. , 0.1, 0. , ..., 0. , 0.1, 0. ],
        ...,
        [0.7, 0.1, 0. , ..., 0. , 0.2, 0. ],
        [0.1, 0.8, 0. , ..., 0. , 0. , 0. ],
        [0.8, 0. , 0. , ..., 0. , 0. , 0.1]]), array(['Pop', 'Hip-Hop', 'Metal', ..., 'Country', 'Electronic', 'Country'],
       dtype=object))

In [33]:
# Creating an ensemble of the 3 weakest learners - Random Forest, AdaBoost and KNN
# Take the label order from a named, fitted pipeline instead of relying on
# whatever the loop variable `val` happened to be left pointing at.
classes = pipe_RF.classes_
classes


# Element 2 of each X1_scores tuple is that model's predict_proba output on
# X1_train; average the three probability matrices element-wise.
R1_AVG_Scores = (X1_scores[0][2] + X1_scores[1][2] + X1_scores[2][2])/3
R1_df =  pd.DataFrame(R1_AVG_Scores, columns = [item + "_AVG" for item in classes])


In [34]:
from sklearn.model_selection import train_test_split  
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=18) 

### Round 2 Train Good learners

In [35]:
# Round-2 learners: same TF-IDF front end as round 1, different classifiers.
pipe_NB = Pipeline([('count_vectorizer', CountVectorizer()),
                     ('tfidf_vectorizer', TfidfTransformer()),
                     ('clf', MultinomialNB())
                    ])

pipe_GBoost = Pipeline([('count_vectorizer', CountVectorizer()),
                     ('tfidf_vectorizer', TfidfTransformer()),
                     ('clf', GradientBoostingClassifier(learning_rate=0.3))
                    ])

# List of pipelines, List of pipeline names
pipelines = [pipe_NB, pipe_GBoost]
pipeline_names = ['Multinomail NB', "Gboost"]

# Loop to fit each of the two pipelines
for pipe in pipelines:
    print(pipe)
    pipe.fit(X2_train, y2_train)

# Compare accuracies. Each tuple is
# (name, test accuracy, class probabilities on X2_train, predictions on X2_train).
X2_scores = []
for index, val in enumerate(pipelines):
    # Score once and reuse the value rather than re-scoring for the print below.
    test_accuracy = val.score(X2_test, y2_test)
    tup = (pipeline_names[index], test_accuracy, val.predict_proba(X2_train), val.predict(X2_train))
    X2_scores.append(tup)
    print('%s pipeline test accuracy: %.3f' % (pipeline_names[index], test_accuracy))

Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
 ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
 ...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])
Multinomail NB pipeline test accuracy: 0.415
Gboost pipeline test accuracy: 0.414


In [36]:
X2_scores[0]

('Multinomail NB',
 0.415,
 array([[0.06476146, 0.35241613, 0.06331353, ..., 0.13243585, 0.08613732,
         0.10548437],
        [0.1981774 , 0.07245812, 0.04322267, ..., 0.10622388, 0.18648187,
         0.16461711],
        [0.22100511, 0.08076004, 0.0505977 , ..., 0.12314752, 0.11177447,
         0.0915945 ],
        ...,
        [0.19471283, 0.07157141, 0.03604731, ..., 0.09726023, 0.15274023,
         0.16414313],
        [0.11403141, 0.18961607, 0.15143814, ..., 0.09237297, 0.09056385,
         0.10338478],
        [0.0982226 , 0.19033551, 0.10705505, ..., 0.13630554, 0.1378852 ,
         0.10853184]]),
 array(['Electronic', 'Country', 'Jazz', ..., 'Country', 'Electronic',
        'Electronic'], dtype='<U10'))

In [37]:
R2_AVG_Scores = (X2_scores[0][2] + X2_scores[1][2])/2
R2_df =  pd.DataFrame(R2_AVG_Scores, columns = [item + "_AVG" for item in classes])

In [38]:
R1_df.shape == R2_df.shape

True

In [39]:
R1_df.head()

Unnamed: 0,Country_AVG,Electronic_AVG,Hip-Hop_AVG,Jazz_AVG,Metal_AVG,Pop_AVG,R&B_AVG,Rock_AVG
0,0.021688,0.077889,0.047074,0.077984,0.172722,0.381309,0.145245,0.076089
1,0.014405,0.123191,0.404214,0.014668,0.09337,0.258538,0.061224,0.030391
2,0.042194,0.141628,0.039385,0.042362,0.440424,0.17559,0.075555,0.042863
3,0.074281,0.109367,0.072252,0.04152,0.108542,0.208857,0.042202,0.34298
4,0.276824,0.074027,0.037418,0.07708,0.273547,0.176094,0.042204,0.042805


In [40]:
R2_df.head()

Unnamed: 0,Country_AVG,Electronic_AVG,Hip-Hop_AVG,Jazz_AVG,Metal_AVG,Pop_AVG,R&B_AVG,Rock_AVG
0,0.047782,0.517785,0.035567,0.081856,0.058145,0.099964,0.065361,0.09354
1,0.137993,0.057003,0.025317,0.192387,0.048993,0.110806,0.122537,0.304965
2,0.136361,0.068976,0.029945,0.392138,0.096692,0.101011,0.080609,0.094267
3,0.175803,0.081353,0.024063,0.201832,0.086792,0.206319,0.113154,0.110683
4,0.066228,0.065306,0.043323,0.045614,0.512806,0.090862,0.072373,0.103488


In [41]:
R3_df = pd.concat([R1_df, R2_df])

In [42]:
y3 = pd.concat([y1_train, y2_train])

In [46]:
y3.shape

(12800,)

In [47]:
sum(y3[0:6400] == y1_train), sum(y3[6400:] == y2_train)

(6400, 6400)

### Round 3 - Plug all AVG probabilities as features and train a final NN model

In [49]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers
from keras import optimizers

Using TensorFlow backend.


In [50]:
#Converting the stacked training labels (y3) to one-hot categorical form

product = y3

le = preprocessing.LabelEncoder() #Initialize. le used as abbreviation for label encoder
le.fit(product)
print("Original class labels:")
print(list(le.classes_))
print('\n')
product_cat = le.transform(product)
#list(le.inverse_transform([0, 1, 3, 3, 0, 6, 4])) #If you wish to retrieve the original descriptive labels post production

print('New product labels:')
print(product_cat)
print('\n')


# Report the actual number of classes rather than a hard-coded count
# (the message used to say 7 while the data has 8 genres).
print('One hot labels; {} binary columns, one for each of the categories.'.format(len(le.classes_))) #Each row will be all zeros except for the category for that observation.
product_onehot = to_categorical(product_cat)
print(product_onehot)
print('\n')

print('One hot labels shape:')
print(np.shape(product_onehot))

Original class labels:
['Country', 'Electronic', 'Hip-Hop', 'Jazz', 'Metal', 'Pop', 'R&B', 'Rock']


New product labels:
[5 2 4 ... 4 1 1]


One hot labels; 7 binary columns, one for each of the categories.
[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


One hot labels shape:
(12800, 8)


In [51]:
X3_train, X3_test, y3_train, y3_test = train_test_split(R3_df[:9000], product_onehot[:9000], test_size=0.2, random_state=123)  

# X_train = X_train.reset_index(drop=True)
# y_train = y_train.reset_index(drop=True)  


print(X3_train.shape, X3_test.shape, y3_train.shape, y3_test.shape)
y3_train

(7200, 8) (1800, 8) (7200, 8) (1800, 8)


array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [52]:
# Instantiate the NN model: 8 inputs -> Dense(7, tanh) -> Dense(3, tanh) -> Dense(8, softmax)

# NOTE(review): this seeds only Python's `random` module; presumably Keras
# weight initialization and batch shuffling draw from other RNGs, so runs may
# not be reproducible - confirm.
random.seed(123)
from keras import models
from keras import layers
model = models.Sequential()
model.add(layers.Dense(7, input_dim=8, kernel_initializer='normal', activation='tanh')) #2 hidden layers
model.add(layers.Dense(3, activation='tanh'))
# Output layer: one softmax unit per one-hot label column.
model.add(layers.Dense(8, activation='softmax'))

In [53]:
model.compile(optimizer='SGD',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [54]:
model_val = model.fit(X3_train,
                    y3_train,
                    epochs=50,
                    batch_size=48,
                    validation_data=(X3_test, y3_test))

Train on 7200 samples, validate on 1800 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### Do final Test with 'Holdout' data

In [56]:
validation_score = model.evaluate(R3_df[9000:], product_onehot[9000:])
validation_score



[0.7857534348337274, 0.8039473685465361]

In [65]:
# Sequential.predict_classes was removed from newer Keras releases; taking the
# argmax over the softmax output of predict() gives the same class indices.
y_hat = np.argmax(model.predict(R3_df[9000:]), axis=-1)

In [61]:
tryit = list(le.inverse_transform(y_hat))
tryit[-10:]

['Electronic',
 'R&B',
 'Electronic',
 'Electronic',
 'Hip-Hop',
 'Metal',
 'Jazz',
 'Pop',
 'Electronic',
 'Metal']

In [67]:
y_hat[-10:]

array([5, 3, 1, 4, 6, 6, 4, 4, 1, 6])

In [68]:
actual = [np.where(item == 1)[0][0] for item in product_onehot[9000:]]
actual[-10:]

[7, 3, 1, 4, 6, 1, 4, 4, 1, 1]

In [69]:
actual_genre = list(le.inverse_transform(actual))
actual_genre[-10:]

['Rock',
 'Jazz',
 'Electronic',
 'Metal',
 'R&B',
 'Electronic',
 'Metal',
 'Metal',
 'Electronic',
 'Electronic']

In [70]:
# Fraction of rows where the decoded prediction equals the decoded true genre.
count = sum(1 for predicted, true_genre in zip(tryit, actual_genre) if predicted == true_genre)

count/len(tryit)

0.13

In [None]:
model_val_dict = model_val.history
model_val_dict.keys()

In [None]:
results_train = model.evaluate(X_train, y_train)
results_train

In [None]:
results_test = model.evaluate(X_test, y_test)
results_test

In [None]:
y_hat_test = model.predict(X_test)
result = y_test - y_hat_test
print(sum(sum(result)))

In [None]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(lemmed_lyrics, maybe_df.genre, test_size=0.2, random_state=18)

In [None]:
model.evaluate(X_test, y_test)

In [None]:
# NOTE(review): `lemmed_basic_scores` and `accuracy_stem` are not defined in
# the visible notebook - presumably they come from cells that are no longer
# here; confirm before running.
# Each score entry is indexed as (model name, accuracy, ...).
modles_lem = [item[0] for item in lemmed_basic_scores]
accuracy_lem = [item[1] for item in lemmed_basic_scores]


import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from collections import namedtuple


# One bar group per model; derive the count from the data instead of a
# hard-coded 5 so the chart keeps working when the model list changes.
n_groups = len(accuracy_lem)

fig, ax = plt.subplots(figsize = (12, 8))


index = np.arange(n_groups)
bar_width = 0.35

opacity = 0.5

lemmed_bars = ax.bar(index, accuracy_lem, bar_width,
                alpha=opacity, color='b',
                label='Lematized')

# The stemmed bars are shifted right by one bar width so that each pair sits side by side.
stemmed_bars = ax.bar(index + bar_width, accuracy_stem, bar_width,
                alpha=opacity, color='r',
                label='Stemmatized')

ax.set_xlabel('Model Type', fontsize = 14)
ax.set_ylabel('Accuracy Scores', fontsize = 14)
ax.set_title('Stemmed vs. Lemmed Accuracy Score Comparison', fontsize = 18)
# Centre each tick between the two bars of its group.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(modles_lem)

# Baseline: accuracy of uniform random guessing over the distinct labels in y.
plt.axhline(y=1/len(set(y)), color='#17CA83', linestyle='-', label = "Random Guessing")
ax.legend()


plt.show()


### We decided to use the lemmatized lyrics rather than the stemmed ones, and to take the top three models forward for further optimization

### Next we want to try using PCA to improve performance and reduce dimensionality

In [None]:
# tfidf = TfidfVectorizer()
# response = tfidf.fit_transform(lemmed_lyrics)

# PCA_df = pd.DataFrame(response.toarray(), columns=tfidf.get_feature_names())
# PCA_df.shape

In [None]:
# DATA = response  # this comes from above where you're vectorizing tdif dictionary

# non_zero_cols = DATA.nnz / float(DATA.shape[0])
# print("Average Number of Non-Zero Elements in Vectorized Articles: {}".format(non_zero_cols))

# percent_sparse = 1 - (non_zero_cols / float(DATA.shape[1]))
# print('Percentage of columns containing 0: {}'.format(percent_sparse))

In [None]:
# PCA_df.head()

### Creating Features table and Target table and testing first model

In [None]:
# X_pca = PCA_df
# y_pca = maybe_df.genre

# len(X_pca) == len(y_pca)

In [None]:
# from sklearn.model_selection import train_test_split  
# X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y_pca, test_size=0.2, random_state=18) 

### Try different PCA values and pick a number that preserves sufficient % of variation

In [None]:
# from sklearn.decomposition import PCA
# pca_1 = PCA(n_components=500)
# pca_2 = PCA(n_components=1000)
# pca_3 = PCA(n_components=1455)
# pca_4 = PCA(n_components=2000)

# principalComponents = pca_1.fit_transform(X_pca)
# principalComponents = pca_2.fit_transform(X_pca)
# principalComponents = pca_3.fit_transform(X_pca)
# principalComponents = pca_4.fit_transform(X_pca)

# print(np.sum(pca_1.explained_variance_ratio_))
# print(np.sum(pca_2.explained_variance_ratio_))
# print(np.sum(pca_3.explained_variance_ratio_))
# print(np.sum(pca_4.explained_variance_ratio_))

### We will test PCA with n = 1800 on our top 3 models to see if it helps performance

In [None]:
# pipe_NB_pca = Pipeline([('pca', PCA(n_components=3000, random_state=18)),
#                      ('clf', GaussianNB())
#                     ])

# pipe_RF_pca = Pipeline([('pca', PCA(n_components=3000, random_state=18)),
#                      ('clf', RandomForestClassifier(n_jobs = -1))
#                    ])
                  
# pipe_GBoost_pca = Pipeline([('pca', PCA(n_components=1800, random_state=18)),
#                      ('clf', GradientBoostingClassifier(learning_rate=0.3))
#                     ])


# # List of pipelines, List of pipeline names
# pipelines = [pipe_NB_pca, pipe_RF_pca, pipe_GBoost_pca]
# pipeline_names = ['Multinomial NB', "Random Forest", "Gradient Boost"]

# # Loop to fit each of the three pipelines
# for pipe in pipelines:
#     print(pipe)
#     pipe.fit(X_train_pca, y_train_pca)

# # Compare accuracies
# PCA_scores = []
# for index, val in enumerate(pipelines):
#     tup = (pipeline_names[index], val.score(X_test_pca, y_test_pca))
#     lemmed_basic_scores.append(tup)
#     print('%s pipeline test accuracy: %.3f' % (pipeline_names[index], val.score(X_test_pca, y_test_pca)))

In [None]:
# PCA1800_results = dict(Multinomail_NB = 0.203,
# Gradient_boost = 0.422,
# Random_forest =0.290)



In [None]:
# n_groups = 3

# basic_mod_name = modles_lem[:3]
# basic_mod_acc = accuracy_lem[:3]
# pca_models_acc = [item[1] for item in PCA1800_results.items()]

# fig, ax = plt.subplots(figsize = (10,8))


# index = np.arange(n_groups)
# bar_width = 0.35

# opacity = 0.5
# error_config = {'ecolor': '0.3'}

# basic_bars = ax.bar(index, basic_mod_acc, bar_width,
#                 alpha=opacity, color='b',
#                 label='Basic Model')

# pca_bars = ax.bar(index + bar_width, pca_models_acc , bar_width,
#                 alpha=opacity, color='r',
#                 label='PCA n_components = 1800')

# ax.set_xlabel('Model Type', fontsize = 14)
# ax.set_ylabel('Accuracy Scores', fontsize = 14)
# ax.set_title('Basic Model vs. PCA with n = 1800 Model Comparison', fontsize = 18)
# ax.set_xticks(index + bar_width / 2)
# ax.set_xticklabels(basic_mod_name)

# plt.axhline(y=1/len(set(y)), color='#17CA83', linestyle='-', label = "Random Guessing")
# ax.legend()
# plt.show()

### We decided that it's not worth using PCA for our models because it increases computational time and doesn't really improve performance

## Let's use GridSearch to try to optimize our top 3 models!

In [None]:
Top3_pipelines = [pipe_NB, pipe_GBoost, pipe_RF]
Top3_pipeline_names = ['Multinomail NB', "Gboost", 'Random Forest']

### Grid Search RandomForest

In [None]:
# TF-IDF + Random Forest pipeline whose classifier hyper-parameters are tuned below.
GS_pipe_RF = Pipeline([('count_vectorizer', CountVectorizer()),
                     ('tfidf_vectorizer', TfidfTransformer()),
                     ('clf', RandomForestClassifier())
                    ])

sorted(GS_pipe_RF.get_params().keys())

# The `clf__` prefix routes each parameter to the pipeline step named 'clf'.
rf_param_grid = dict(clf__n_estimators = [10, 30, 100], clf__criterion = ['gini', 'entropy'],
                    clf__max_depth = [2, 6, 10], clf__min_samples_split = [5, 10],
                    clf__min_samples_leaf = [3, 6])


# return_train_score=True is required for cv_results_['mean_train_score'] to
# exist; recent scikit-learn versions default it to False, which makes the
# lookup below raise KeyError.
gs_RF = GridSearchCV(estimator=GS_pipe_RF,
            param_grid=rf_param_grid,
            scoring='accuracy',
            cv=3,
            return_train_score=True)

gs_RF.fit(X_train, y_train)

# Training score: mean train accuracy averaged over ALL parameter combinations.
dt_gs_training_score = np.mean(gs_RF.cv_results_['mean_train_score'])
# Testing score: accuracy of the refit best estimator on the held-out split.
dt_gs_testing_score = gs_RF.score(X_test, y_test)

print("Mean Training Score: {:.4}%".format(dt_gs_training_score * 100))
print("Mean Testing Score: {:.4}%".format(dt_gs_testing_score * 100))
print("Best Parameter Combination Found During Grid Search: {}".format(gs_RF.best_params_))

In [None]:
Grid_RF_metrics = dict(train_score = dt_gs_training_score, test_score = dt_gs_testing_score, best_params = gs_RF.best_params_)
Grid_RF_metrics

### GridSearch Gboost

In [None]:
# TF-IDF + Gradient Boosting pipeline whose classifier hyper-parameters are tuned below.
GS_pipe_GB = Pipeline([('count_vectorizer', CountVectorizer()),
                     ('tfidf_vectorizer', TfidfTransformer()),
                     ('clf', GradientBoostingClassifier())
                    ])
# sorted(GS_pipe_RF.get_params().keys())

# The `clf__` prefix routes each parameter to the pipeline step named 'clf'.
GB_params = {
    "clf__learning_rate": [0.2, 0.25],
    "clf__min_samples_split": [4, 5],
    "clf__min_samples_leaf": [6],
    "clf__max_depth":[3],
    "clf__n_estimators":[100, 150]
    }

# return_train_score=True is required for cv_results_['mean_train_score'] to
# exist; recent scikit-learn versions default it to False, which makes the
# lookup below raise KeyError.
gs_GB = GridSearchCV(estimator=GS_pipe_GB,
            param_grid=GB_params,
            scoring='accuracy',
            cv=3,
            return_train_score=True)

gs_GB.fit(X_train, y_train)

# Training score: mean train accuracy averaged over ALL parameter combinations.
dt_GB_training_score = np.mean(gs_GB.cv_results_['mean_train_score'])
# Testing score: accuracy of the refit best estimator on the held-out split.
dt_GB_testing_score = gs_GB.score(X_test, y_test)

print("Mean Training Score: {:.4}%".format(dt_GB_training_score * 100))
print("Mean Testing Score: {:.4}%".format(dt_GB_testing_score * 100))
print("Best Parameter Combination Found During Grid Search: {}".format(gs_GB.best_params_))

In [None]:
Grid_GB_metrics = dict(train_score = dt_GB_training_score, test_score = dt_GB_testing_score, best_params = gs_GB.best_params_)
Grid_GB_metrics

In [None]:
hip_lyric = ["dropin my dough real quick. Data Science squad for the win. Drake got nothin on us!"]
jazz_lyric = ["humdinger, babababoo, bababaa from san francisco to georgia, we teach you to code like wah wah"]
rock_lyric = ['when I was young I thought code is not important. Now I learned that i need to know it if I want to grow.']
def test_genre(lyric):
    """Print the genre that the grid-searched Gradient Boosting model predicts.

    Parameters
    ----------
    lyric : list of str, or str
        Raw lyric text(s). A bare string is treated as a single lyric; without
        that, the cleaner would iterate over it character by character. Only
        the prediction for the first lyric is printed.
    """
    docs = [lyric] if isinstance(lyric, str) else lyric
    lemmed_test = clean_docs_lemma(docs)
    print("This song is definitely {}!".format(gs_GB.predict(lemmed_test)[0]))

In [None]:
test_genre(rock_lyric)

In [None]:
basic_scores

In [None]:
# (label, accuracy) pairs for the final comparison chart.
# NOTE(review): the third entry is labelled "with GridSearch" but takes its
# value from basic_scores[2][1] rather than Grid_RF_metrics['test_score'] -
# confirm this is intended. `basic_scores` is also not defined in the visible
# notebook.
final_top3 = [("GradientBoost with GridSearch", Grid_GB_metrics['test_score']), basic_scores[0], ("Random Forest with GridSearch", basic_scores[2][1])]

In [None]:
final_top3

In [None]:
top3_model = [item[0] for item in final_top3]
top3_scores = [item[1] for item in final_top3]

plt.figure(figsize=(10,6))
plt.title("Top3 Models Final Performance", color ='#061152' , fontsize = 20)
plt.ylabel("Accuracy Score", color = '#061152', fontsize = 16)
plt.bar(top3_model, top3_scores, color = "#17CA83", label = "Top 3 Models")

plt.axhline(y=1/len(set(y)), color='#AF2138', linestyle='-', label = "Random Guessing")
plt.legend()

plt.show()

In [None]:
# best_params is a flat {param_name: scalar} dict. DataFrame.from_dict with the
# default orient rejects all-scalar values ("you must pass an index"), so wrap
# the dict in a list to get a one-row table with one column per parameter.
top_mode_df = pd.DataFrame([Grid_GB_metrics['best_params']])
top_mode_df

In [None]:
!pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

In [None]:
import random
# Work on a random subset of 7000 cleaned lyrics.
# NOTE(review): `stemmed_lyrics` is not defined anywhere in the visible
# notebook (only `lemmed_lyrics` is) - confirm which collection is intended.
test = random.sample(stemmed_lyrics, 7000)
# Despite the name, this holds the token lists of the sampled documents above.
lemmed_lyr = [nltk.word_tokenize(doc) for doc in test]

lemmed_lyr
# test
# Map every distinct token to an integer id.
dictionary = gensim.corpora.Dictionary(lemmed_lyr)

# Peek at the first 11 (id, token) pairs.
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

# Prune rare and very common tokens: keep those that appear in at least 15
# documents and in no more than 50% of them, capped at 100000 tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Bag-of-words representation: one list of (token id, count) pairs per document.
bow_corpus = [dictionary.doc2bow(doc) for doc in lemmed_lyr]

### Run Topic Classifier using BoW

In [None]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=15, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

### Run Topic Classifier using TF-IDF

In [None]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=15, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

### WordCloud

In [None]:
weights = tfidf[bow_corpus[12]]

# Get terms from the dictionary and pair with weights

weights = [(dictionary[pair[0]], pair[1]) for pair in weights]
weights[-35:]



In [None]:
# !pip install wordcloud
from wordcloud import WordCloud

# Build the term -> TF-IDF weight mapping that the cloud is drawn from
d = dict(weights)

# Configure the word cloud
cloud_options = dict(
    width = 1024,
    height = 720,
    max_words=2000,
    background_color="white",
    stopwords=stopwords.words("english"),
)
wc = WordCloud(**cloud_options)

# Render the cloud from the weights and display it without axes
wc.generate_from_frequencies(d)
plt.figure(figsize = (12, 8))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
d