In [165]:
# Every feature set carries "genre" (the prediction target) and "release_date"
# (the column the chronological split filters on) in addition to its features.
_target_and_split = ["genre", "release_date"]

# Lyrics-only feature sets: ordered token sequences, unordered vectors, or both.
features_o_lyrics = [*_target_and_split, "lyrics_ordered"]
features_u_lyrics = [*_target_and_split, "lyrics_unordered"]
features_a_lyrics = [*_target_and_split, "lyrics_ordered", "lyrics_unordered"]

# All tabular columns, spelled exactly as the column names appear in the CSV.
features_all = (
    ["release_date", "genre", "len"]
    + [
        "dating", "violence", "world/life", "night/time",
        "shake the audience", "family/gospel", "romantic", "communication",
        "obscene", "music", "movement/places", "light/visual perceptions",
        "family/spiritual", "like/girls", "sadness", "feelings",
    ]
    + [
        "danceability", "loudness", "acousticness",
        "instrumentalness", "valence", "energy",
    ]
    + ["topic", "age"]
)

# Reduced feature sets: five selected columns, and "danceability" alone.
features_top5 = [
    *_target_and_split,
    "danceability", "age", "acousticness", "instrumentalness", "energy",
]
features_top1 = [*_target_and_split, "danceability"]

In [121]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [122]:
#MODELS
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [123]:
def train_test_split_chronological(df_, year=2019, array_features=None):
    """Split a song table into train/test sets by release year.

    Rows whose ``release_date`` is strictly less than ``year`` form the
    training set; rows whose ``release_date`` equals ``year`` form the test
    set. Rows released after ``year`` end up in neither set.

    ``genre`` is label-encoded and used as the target. ``topic`` is
    label-encoded as well when the column is present.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Input table; must contain ``genre`` and ``release_date``. The frame is
        copied, so the caller's object is not modified.
    year : int, default 2019
        Release year that defines the test set.
    array_features : sequence of str, optional
        Columns to keep before splitting. Must include ``"genre"`` and
        ``"release_date"``. ``None`` keeps every column.

    Returns
    -------
    train_x, train_y, test_x, test_y : DataFrame, Series, DataFrame, Series
        Features (without ``genre`` and ``release_date``) and integer-encoded
        genre labels for each split.
    label_map : dict
        Maps each encoded genre value back to its original genre label.

    Raises
    ------
    KeyError
        If ``genre`` or ``release_date`` is missing from the input or from
        ``array_features``.
    """
    df = df_.copy()

    # Plain column assignment instead of ``df.loc[:, "topic"] = ...``: under
    # pandas >= 2 the ``.loc`` form writes into the existing object column in
    # place, so the encoded values would keep ``object`` dtype.
    if "topic" in df.columns:
        df["topic"] = LabelEncoder().fit_transform(df["topic"])

    le_genre = LabelEncoder()
    df["genre"] = le_genre.fit_transform(df["genre"])
    label_map = dict(zip(le_genre.transform(le_genre.classes_), le_genre.classes_))

    if array_features is not None:
        df = df[list(array_features)]

    # Fail early with an actionable message instead of a bare KeyError deep in
    # the split when the caller forgot the target or the split column.
    missing = [col for col in ("genre", "release_date") if col not in df.columns]
    if missing:
        raise KeyError(
            f"columns required for the chronological split are missing: {missing}"
        )

    is_test = df["release_date"] == year
    is_train = df["release_date"] < year

    # release_date is only the split key; it is not passed on as a feature.
    test_df = df[is_test].drop(columns="release_date")
    train_df = df[is_train].drop(columns="release_date")

    test_y = test_df["genre"].astype(int)
    test_x = test_df.drop(columns="genre")
    train_y = train_df["genre"].astype(int)
    train_x = train_df.drop(columns="genre")
    return train_x, train_y, test_x, test_y, label_map

In [124]:
# Read the raw song table from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="tcc_ceds_music.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [125]:
# Unordered representation: one dense TF-IDF vector per song, stored row by row
# in a single object column.
vectorizer = TfidfVectorizer()
vectorized_lyrics = vectorizer.fit_transform(df["lyrics"])
dense_lyrics = vectorized_lyrics.toarray()
df["lyrics_unordered"] = list(dense_lyrics)

# Ordered representation: each song becomes a sequence of integer token ids,
# padded at the end ("post") up to the length of the longest song.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["lyrics"])
sequences = tokenizer.texts_to_sequences(df["lyrics"])
num_words = len(tokenizer.word_index) + 1
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding="post")
df["lyrics_ordered"] = list(padded_sequences)
# Unused alternative that scales each token id by the vocabulary size:
#[[a/num_words for a in e] for e in padded_sequences]

# The raw text and the leftover CSV index column are no longer needed.
# NOTE: this rebinds `df`, so the cell cannot be re-run without reloading the CSV.
df = df.drop(columns=["lyrics", "Unnamed: 0"])

In [166]:
# Build the chronological split using the `features_top1` column set.
(
    train_x,
    train_y,
    test_x,
    test_y,
    label_map,
) = train_test_split_chronological(df, array_features=features_top1)

In [167]:
# Inspect the training features produced by the split above.
train_x

Unnamed: 0,danceability
0,0.357739
1,0.331745
2,0.456298
3,0.686992
4,0.291671
...,...
28247,0.796383
28248,0.839705
28249,0.468212
28250,0.826709


In [168]:
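# Optional step for the lyrics-vector experiments: expand the per-row
# "lyrics_unordered" arrays into one numeric column per array element.
# Uncomment both lines only when "lyrics_unordered" is among the selected features.
#train_x = pd.DataFrame(train_x["lyrics_unordered"].tolist())
#test_x = pd.DataFrame(test_x["lyrics_unordered"].tolist())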

In [169]:
# Fixed seed for every stochastic estimator so that re-running the notebook
# reproduces the same accuracies instead of a fresh random draw each time.
RANDOM_STATE = 42

# (display name, estimator) pairs evaluated by the training loop.
# Commented-out entries are kept as toggles for other experiment runs.
models = [
    #("Multinomial Naive Bayes", MultinomialNB()),
    ("Random Forest", RandomForestClassifier(random_state=RANDOM_STATE)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=RANDOM_STATE)),
    #("MLP Classifier", MLPClassifier(hidden_layer_sizes=(128,), random_state=RANDOM_STATE)),
    #("K Nearest neighbours",KNeighborsClassifier()),
    (
        "Ensemble (Voting)",
        VotingClassifier(
            estimators=[
                ("gb", GradientBoostingClassifier(random_state=RANDOM_STATE)),
                ("rf", RandomForestClassifier(random_state=RANDOM_STATE)),
                ("mlp", MLPClassifier(hidden_layer_sizes=(128,), random_state=RANDOM_STATE)),
            ]
        ),
    ),
]

In [170]:
# Fit every configured model on the training split and record, per model:
# (name, accuracy on the test split, up to five most important feature names).
model_stats = []
for name, model in models:
    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)
    accuracy = accuracy_score(test_y, pred_y)
    try:
        # Rank features from most to least important and keep the first five.
        ranked = np.argsort(model.feature_importances_)[::-1]
        top_5 = list(train_x.columns[ranked][:5])
    except AttributeError:
        # Estimators without `feature_importances_` (e.g. the voting ensemble)
        # get an empty list. Only the missing-attribute case is tolerated, so
        # real failures and KeyboardInterrupt are no longer swallowed.
        top_5 = []
    model_stats.append((name, accuracy, top_5))

In [171]:
# Show (model name, test accuracy, top features) for the run above.
model_stats

[('Random Forest', 0.1947565543071161, ['danceability']),
 ('Gradient Boosting', 0.16666666666666666, ['danceability']),
 ('Ensemble (Voting)', 0.16292134831460675, [])]

In [115]:
# Test accuracies per model, recorded by hand from an earlier run
# (presumably the one using the ordered-lyrics features; confirm before citing).
ordered_lyrics_run = dict(
    [
        ("Multinomial Naive Bayes", 0.13857677902621723),
        ("Random Forest", 0.20786516853932585),
        ("Gradient Boosting", 0.32771535580524347),
        ("MLP Classifier", 0.1348314606741573),
        ("K Nearest neighbours", 0.17415730337078653),
        ("Ensemble (Voting)", 0.2696629213483146),
    ]
)

In [114]:
# Test accuracies per model, recorded by hand from an earlier run
# (presumably the one using the full `features_all` set; confirm before citing).
all_run = dict(
    [
        ("Multinomial Naive Bayes", 0.2340823970037453),
        ("Random Forest", 0.548689138576779),
        ("Gradient Boosting", 0.5449438202247191),
        ("MLP Classifier", 0.49063670411985016),
        ("K Nearest neighbours", 0.32209737827715357),
        ("Ensemble (Voting)", 0.5449438202247191),
    ]
)

In [163]:
# Pasted `model_stats` result of an earlier run, kept as a literal for reference:
# (model name, test accuracy, five most important features).
# NOTE(review): the feature configuration that produced it is not recorded here;
# presumably `features_all` or `features_top5` - confirm.
[('Random Forest',
  0.5299625468164794,
  ['danceability', 'acousticness', 'age', 'instrumentalness', 'energy']),
 ('Gradient Boosting',
  0.5449438202247191,
  ['danceability', 'age', 'acousticness', 'instrumentalness', 'energy']),
 ('Ensemble (Voting)', 0.5617977528089888, [])]

[('Random Forest',
  0.5299625468164794,
  ['danceability', 'acousticness', 'age', 'instrumentalness', 'energy']),
 ('Gradient Boosting',
  0.5449438202247191,
  ['danceability', 'age', 'acousticness', 'instrumentalness', 'energy']),
 ('Ensemble (Voting)', 0.5617977528089888, [])]