In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
import en_core_web_md
import pickle
from pyfiglet import Figlet




In [2]:
artists_list=[]
lyrics_df = pd.DataFrame()

In [3]:
filepathlist = ["Eminem.json", "TheKooks.json", "Mumford&Sons.json"]
artistlist = ["Eminem", "The Kooks", "Mumford & Sons"]

In [4]:
def custom_tokenizer(text):
    """
    converts a string into a text of tokens using spacy 
    """
    tokens = []
    for t in nlp(text):
        if not(len(t) < 2 or t.is_stop or t.like_num or 
               t.is_punct or t.is_oov or not t.is_alpha):
            tokens.append(t.lemma_)
    return tokens 

In [5]:
def read_lyrics_to_df(filename, artistname):
    '''
    Reads a previous scraped and saved json file with lyrics into a dataframe
    and appends artistname to artists_list for later use.
    '''
    df = pd.read_json(filename)
    df["artist"] = artistname
    if artistname not in artists_list:
        artists_list.append(artistname)
    df = pd.concat([lyrics_df, df], axis=0)
    return df

In [15]:
def create_model(filepathlist, artistlist):
    '''
    Creates a NB-model & Bag-of-words and saves them locally.
    '''
    lyrics_df = pd.DataFrame()
    for fp, a in zip(filepathlist, artistlist):
        print(f"{fp}, {a}")
        lyrics_df = pd.concat([lyrics_df, read_lyrics_to_df("../data/"+str(fp), str(a))])
        print(lyrics_df.shape)
    y = lyrics_df["artist"]
    X = lyrics_df["lyrics"]
    print("y:")
    print(y)
    print("x:")
    print(X)

    bow = CountVectorizer(tokenizer=custom_tokenizer,
                          ngram_range=(1, 3), 
                          min_df=0.01, 
                          max_df=0.99)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    X_train = bow.fit_transform(X_train)
    X_test = bow.transform(X_test)

    m = Pipeline([
        ('TfIdf', TfidfTransformer()),
        ('NB', MultinomialNB())
    ])
    m.fit(X_train, y_train)

    with open('../model/bow.p', 'wb') as f:
        pickle.dump(bow, f)
    with open('../model/model.p', 'wb') as f:
        pickle.dump(m, f)
    print(f"Test acc:{metrics.accuracy_score(y_test, m.predict(X_test))}")
    print(f"Train acc:{metrics.accuracy_score(y_train, m.predict(X_train))}")

    
#    print(figlet.renderText('Model successfully created'))
    return

In [16]:
nlp = en_core_web_md.load(disable=["parser", "textcat", "ner"])
create_model(filepathlist, artistlist)

Eminem.json, Eminem
(479, 3)
TheKooks.json, The Kooks
(793, 3)
Mumford&Sons.json, Mumford & Sons
(1121, 3)
y:
0              Eminem
1              Eminem
2              Eminem
3              Eminem
4              Eminem
            ...      
323    Mumford & Sons
324    Mumford & Sons
325    Mumford & Sons
326    Mumford & Sons
327    Mumford & Sons
Name: artist, Length: 1121, dtype: object
x:
0      Son, you know why you the greatest alive?\r\nW...
1      Nigga, I wear fur coats in the summer, nigga\r...
2      Yeah, I was born a misfit, grew up ten miles f...
3      Yeah huh\r\n(hip hop hip hop hip hop)\r\nTurn ...
4      You will not be able to stay home brotha\r\nYo...
                             ...                        
323    My generation's stuck in the mirror\r\n"Forget...
324    Weep for yourself, my man,\r\nYou'll never be ...
325    I know the time has numbered my days,\r\nAnd I...
326    Though I may speak some tongue of old\r\nOr ev...
327    Though I may speak some to

In [None]:
for fp, a in zip(filepathlist, artistlist):
    lyrics_df = (read_lyrics_to_df("../data/"+str(fp), str(a)))
    y = lyrics_df["artist"]
    X = lyrics_df["lyrics"]

In [None]:
lyrics_df

In [None]:
nlp = en_core_web_md.load(disable=["parser", "textcat", "ner"])


bow = CountVectorizer(tokenizer=custom_tokenizer,
                      ngram_range=(1, 3), 
                      min_df=0.01, 
                      max_df=0.99)
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train = bow.fit_transform(X_train)
X_test = bow.transform(X_test)

m = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('NB', MultinomialNB())
])
m.fit(X_train, y_train)

print(f"Test acc:{metrics.accuracy_score(y_test, m.predict(X_test))}")
print(f"Train acc:{metrics.accuracy_score(y_train, m.predict(X_train))}")
