In [18]:
import pandas as pd
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.corpus import stopwords
import gensim
from gensim.models import Word2Vec
#nltk.download('stopwords')

In [6]:
def remove_stop_words(text):
    word_list = text.split()
    without_stop_words = ' '.join(word for word in word_list if word not in stopwords.words('english'))
    return without_stop_words



In [7]:
genres = ['blues','country', 'electronic_complete', 'heavy-metal', 'hip-hop', 'indie', 'k-pop', 'pop',
               'punk', 'reggae', 'reggaeton_complete', 'rock', 'r-n-b', 'salsa_complete']

#genres = ['electronic_complete','reggaeton_complete','salsa_complete']

all_data = pd.DataFrame([])
for count,genre in enumerate(genres):
    try:
        data = pd.read_csv("data/{genre}_lyrics.csv".format(genre=genre))
    except:
        continue
    if count == 0:
        all_data = data
    else:
        all_data = pd.concat([all_data,data],axis=0)

In [8]:
df = all_data[~all_data['lyrics'].isnull()]
df['lyrics_n'] = df['lyrics'].apply(remove_stop_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lyrics_n'] = df['lyrics'].apply(remove_stop_words)


In [9]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='latin-1', ngram_range=(1, 2))
features = tfidf.fit_transform(df.lyrics).toarray()
labels = df.song_genre

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [10]:
X = df.lyrics_n
y = df.song_genre
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)


In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=genres))

accuracy 0.3998827323365582
                     precision    recall  f1-score   support

              blues       0.90      0.15      0.26       248
            country       0.29      0.86      0.44       281
electronic_complete       0.75      0.01      0.02       239
        heavy-metal       0.90      0.16      0.27       179
            hip-hop       0.56      0.80      0.66       268
              indie       0.24      0.22      0.23       282
              k-pop       0.34      0.86      0.49       173
                pop       0.26      0.35      0.30       288
               punk       0.23      0.03      0.05       234
             reggae       0.34      0.24      0.28       213
 reggaeton_complete       0.94      0.32      0.48       186
               rock       0.64      0.87      0.74       280
              r-n-b       0.21      0.24      0.22       261
     salsa_complete       0.91      0.36      0.52       279

           accuracy                           0.40     

In [12]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=genres))

accuracy 0.46672530049838756
                     precision    recall  f1-score   support

              blues       0.47      0.50      0.48       248
            country       0.52      0.70      0.60       281
electronic_complete       0.33      0.18      0.23       239
        heavy-metal       0.49      0.75      0.59       179
            hip-hop       0.54      0.82      0.65       268
              indie       0.23      0.11      0.14       282
              k-pop       0.35      0.98      0.52       173
                pop       0.32      0.11      0.16       288
               punk       0.28      0.21      0.24       234
             reggae       0.37      0.32      0.34       213
 reggaeton_complete       0.68      0.62      0.65       186
               rock       0.65      0.83      0.73       280
              r-n-b       0.24      0.17      0.20       261
     salsa_complete       0.79      0.48      0.60       279

           accuracy                           0.47    

In [13]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)



y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=genres))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy 0.4377015537965406
                     precision    recall  f1-score   support

              blues       0.50      0.42      0.46       248
            country       0.62      0.56      0.59       281
electronic_complete       0.30      0.27      0.28       239
        heavy-metal       0.64      0.54      0.58       179
            hip-hop       0.58      0.50      0.54       268
              indie       0.22      0.23      0.23       282
              k-pop       0.37      0.97      0.53       173
                pop       0.20      0.22      0.21       288
               punk       0.25      0.22      0.23       234
             reggae       0.30      0.32      0.31       213
 reggaeton_complete       0.82      0.49      0.61       186
               rock       0.78      0.77      0.77       280
              r-n-b       0.18      0.17      0.18       261
     salsa_complete       0.79      0.61      0.69       279

           accuracy                           0.44     

In [20]:
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.init_sims(replace=True)

  wv.init_sims(replace=True)
