# Model Building on Lyrics as Features
This is the code we ran to build a model that classifies songs based on their words alone, with no mined frequent patterns.
We wanted to compare its accuracy with that of our frequent-pattern model to see whether the mined patterns indeed made a difference.

In [None]:
import pyarc
import fim

In [None]:
import pandas as pd

In [None]:
import numpy as np
# Seed NumPy's global random number generator so that any code relying on
# np.random is reproducible between runs. The previous call,
# np.random.RandomState(42), only constructed a separate generator object and
# threw it away, which left the global generator unseeded.
np.random.seed(42)

In [None]:
# Load the raw lyrics table; the file is read as Latin-1 text.
tracks = pd.read_csv(
    "my_dataset.csv",
    encoding='latin1',
)

In [None]:
# Display the loaded table as a quick sanity check. Later cells rely on its
# 'word', 'count', 'track_id', 'title' and 'genre' columns.
tracks

Complete the same pre-processing steps we used for the frequent pattern model.

In [None]:
# Very common function words (pronouns, articles, prepositions, ...) that are
# removed from the lyrics before any modelling.
words = [
    'i', 'you', 'they', 'we', 'he', 'she', 'it', 'a', 'and', 'the', 'to',
    'me', 'not', 'in', 'of', 'on', 'about', 'your', 'that', 'do', 'am', 'for',
    'no', 'here', 'there', 'oh', 'my', 'is', 'are',
]

# Keep every row whose word is not in the stop list. A single vectorised
# membership test replaces the old loop, which scanned the whole table once per
# stop word, grew an index array by repeated concatenation and had to slice off
# an uninitialised np.empty placeholder. The mask selects rows by position, so
# it removes exactly the matching rows even if index labels are not unique.
tracks2 = tracks[~tracks['word'].isin(words)]

In [None]:
# Total number of occurrences of each remaining word across all tracks.
# Select the 'count' column explicitly: the earlier `.sum('count')` passed the
# string as the first positional argument of sum() (numeric_only), so it only
# produced the right column by accident.
word_totals = tracks2.groupby('word')['count'].sum()

# Words whose total count over the whole data set is exactly one.
singleton_words = word_totals[word_totals == 1].index

# Drop the rows for those words with one vectorised membership test. The old
# loop rescanned the unfiltered `tracks` table once per singleton word.
tracks3 = tracks2[~tracks2['word'].isin(singleton_words)]

In [None]:
# Build one row per song: its genre label and the list of words it contains.
# Songs are identified by the (track_id, title) pair. The groups are walked
# once; previously the same groupby was executed and materialised into a list
# twice, once for the words and once for the genres.
all_tracks = []
genres = []
for _, song_rows in tracks3.groupby(['track_id', 'title']):
    all_tracks.append(song_rows['word'].tolist())
    # Take the genre from the song's first row, as the original code did.
    genres.append(song_rows['genre'].tolist()[0])

lyrics = pd.DataFrame(genres, columns=["genre"])
lyrics["words"] = all_tracks

In [None]:
# Display the per-song table: a 'genre' column and a 'words' column holding
# each song's list of words.
lyrics

In [None]:
from mlxtend.preprocessing import TransactionEncoder

# Encode each song's word list with mlxtend's TransactionEncoder: learn the
# vocabulary from all songs, then transform those same songs.
te1 = TransactionEncoder()
song_words = lyrics['words']
te1.fit(song_words)
te_ary = te1.transform(song_words)

In [None]:
# Inspect the encoded array returned by TransactionEncoder.transform.
te_ary

In [None]:
# Wrap the encoded array in a DataFrame; columns keep their default integer
# labels.
df = pd.DataFrame(data=te_ary)

In [None]:
# Feature matrix (encoded words) and target labels (genre of each song).
X, y = df, lyrics['genre']

Split data and start building models.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the songs for testing. The split is stratified on genre and
# uses a fixed seed so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    train_size=0.8,
    random_state=42,
)

In [None]:
from sklearn.svm import SVC

# Baseline 1: support-vector classifier with library-default hyperparameters
# (only the seed is set). fit() returns the estimator itself.
svc = SVC(random_state=42).fit(X_train, y_train)
y_pred = svc.predict(X_test)

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Balanced accuracy of the most recent predictions on the held-out test set.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.linear_model import SGDClassifier

# Baseline 2: linear classifier trained with stochastic gradient descent,
# library-default hyperparameters apart from the seed.
sgd = SGDClassifier(random_state=42).fit(X_train, y_train)
y_pred = sgd.predict(X_test)

# Balanced accuracy on the held-out test set (shown as the cell output).
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Baseline 3: Gaussian naive Bayes with library-default settings.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Balanced accuracy on the held-out test set (shown as the cell output).
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

The models ended up not performing well at all. We tried hyperparameter tuning on our two best models, SGD and Gaussian NB, to see if that was the issue.

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

# Search space for the SGD classifier: 3 losses x 5 regularisation strengths
# x 3 penalties = 45 candidate settings.
params = dict(
    loss=["hinge", "log_loss", "modified_huber"],
    alpha=[0.0001, 0.001, 0.01, 0.1, 1.0],
    penalty=["l2", "l1", None],
)

# Exhaustive grid search using 3-fold cross-validation on the training split,
# scored by balanced accuracy.
model = SGDClassifier(max_iter=1000, random_state=42)
clf = GridSearchCV(
    model,
    param_grid=params,
    scoring="balanced_accuracy",
    cv=3,
    verbose=2,
)
clf.fit(X_train, y_train)

In [None]:
# Best cross-validated balanced accuracy found by the grid search above.
clf.best_score_

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
clf.best_params_

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Search space for Gaussian naive Bayes: 100 var_smoothing values spaced
# logarithmically from 1 down to 1e-9.
params = dict(var_smoothing=np.logspace(0, -9, num=100))

# Exhaustive grid search using 3-fold cross-validation on the training split,
# scored by balanced accuracy.
gnb = GaussianNB()
clf = GridSearchCV(
    gnb,
    param_grid=params,
    scoring="balanced_accuracy",
    cv=3,
    verbose=2,
)
clf.fit(X_train, y_train)

In [None]:
# Best cross-validated balanced accuracy found by the Gaussian NB grid search.
clf.best_score_

In [None]:
# var_smoothing value that achieved the best cross-validated score.
clf.best_params_

Even with hyperparameter tuning, the models were still very inaccurate in their predictions. This is a good sign that the frequent patterns we mined are actually good predictors of a song's genre.