In [1]:
import pandas as pd
from string import punctuation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import numpy as np
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vect = TfidfVectorizer()
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from nltk import ConfusionMatrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from gensim.models import Doc2Vec
lr = LogisticRegression()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
rfc = RandomForestClassifier(n_jobs=-1)
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import jaccard_similarity_score
import pickle


In [2]:
songs_dataset = pd.read_json('MasterSongList.json')

In [3]:
songs_dataset = songs_dataset[songs_dataset['lyrics_features'].apply(lambda x: len(x)) > 0]

In [4]:
translator_object = str.maketrans('', '', punctuation)

In [5]:
def clean_text(raw_text):
    stemmed_words = []
    raw_text = ' '.join(raw_text)
    raw_text = raw_text.lower()
    translator_object = str.maketrans('', '', punctuation)
    lyric = raw_text.translate(translator_object)
    splitted_lyrics = lyric.split()
    
    for word in splitted_lyrics:
        if word not in ENGLISH_STOP_WORDS:
            stemmed_words.append(stemmer.stem(word))
    return ' '.join(stemmed_words)

In [6]:
cleaned_lyrics = songs_dataset.loc[:, 'lyrics_features'].apply(clean_text)

In [7]:
df_lyrics = cleaned_lyrics.to_frame('cleaned_lyrics')

In [8]:
df_lyrics['moods'] = songs_dataset['moods']
df_lyrics.dropna(how='any', inplace = True)
df_lyrics.reset_index(drop = True, inplace = True)

In [9]:
mlb = MultiLabelBinarizer()
y_labels = mlb.fit_transform(df_lyrics['moods'])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df_lyrics['cleaned_lyrics'], y_labels, test_size=0.1, random_state=101) 

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
text_clf = Pipeline([('vect', TfidfVectorizer()),
                      ('clf', OneVsRestClassifier(LinearSVC(C=100, class_weight='balanced')))])

In [12]:
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.01862464183381089


In [13]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.28      0.30      0.29       195
          1       0.10      0.13      0.11       118
          2       0.11      0.17      0.13       118
          3       0.08      0.11      0.09        63
          4       0.13      0.19      0.16       151
          5       0.15      0.11      0.13        45
          6       0.19      0.24      0.21       162
          7       0.08      0.12      0.10        88
          8       0.14      0.16      0.15        91
          9       0.14      0.27      0.19       201
         10       0.23      0.34      0.28       187
         11       0.11      0.15      0.13        75
         12       0.14      0.17      0.16       196
         13       0.03      0.05      0.04        20
         14       0.10      0.19      0.13       122
         15       0.11      0.17      0.14       144
         16       0.20      0.28      0.24       267
         17       0.05      0.08      0.06   

In [14]:
print(jaccard_similarity_score(y_train[1], y_pred[1]))

0.84375


In [15]:
y_train[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [16]:
mlb.inverse_transform(y_pred)

[('raw', 'sad'),
 ('angsty', 'atmospheric', 'cocky'),
 ('warm',),
 ('nocturnal', 'seductive'),
 ('introspective', 'mellow'),
 ('rowdy', 'spacey'),
 ('angsty', 'campy', 'celebratory', 'earthy', 'energetic', 'introspective'),
 ('energetic', 'nocturnal', 'sad', 'sweet'),
 ('cocky', 'gloomy', 'raw', 'rowdy', 'trashy'),
 ('aggressive', 'mellow', 'rowdy', 'visceral'),
 ('earthy', 'nocturnal', 'sprightly', 'sweet', 'warm'),
 ('angsty', 'celebratory', 'lush'),
 ('funky', 'warm'),
 ('lush', 'sexual', 'trashy'),
 ('mellow', 'warm'),
 ('energetic', 'funky', 'mellow'),
 ('motivational', 'sexual'),
 ('classy',),
 ('earthy', 'motivational'),
 ('cold', 'energetic', 'motivational', 'visceral'),
 ('celebratory', 'cocky', 'lush', 'mellow', 'nocturnal', 'visceral'),
 ('motivational', 'visceral'),
 ('atmospheric',
  'gloomy',
  'happy',
  'mellow',
  'nocturnal',
  'sad',
  'sweet',
  'trippy'),
 ('funky', 'lush'),
 ('celebratory', 'introspective', 'seductive'),
 ('mellow',),
 ('atmospheric',
  'earthy',


In [17]:
pickle.dump(text_clf, open('trained_lyrics.pickle', 'wb'))