In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import csv

In [2]:
# Shared label encoder; its fit_transform is applied column by column when the lyric frame is encoded later.
lenc = LabelEncoder()

In [3]:
# Load four columns of the lyrics file by position. header=None means no line is
# treated as a header, so column names are assigned by hand on the next line.
# NOTE(review): if the file starts with a header line it is read as a data row - confirm.
lyric_df = pd.read_csv('lyrics-data.csv',  usecols=[0,1,2,4], header = None, delimiter=",", quoting=csv.QUOTE_NONE, encoding='utf-8')
lyric_df.columns = ['alink', 'song', 'lyric', 'lang']
# TODO: for both CSV loads, commas inside the data clash with the "," delimiter and shift columns.
# quoting=csv.QUOTE_NONE disables quote handling, so quoted fields containing commas get split;
# that is presumably the cause - confirm against the raw files before changing it.

In [None]:
# Sanity check: (rows, columns) of the loaded lyrics frame.
lyric_df.shape

In [5]:
# Load five columns of the artists file by position, with the same parser options as
# the lyrics load (no header row, comma delimiter, quote handling disabled).
# NOTE(review): the same embedded-comma problem presumably applies here - confirm.
artist_df = pd.read_csv('artists-data.csv', usecols=[0,1,3,4,5],header = None, delimiter=",", quoting=csv.QUOTE_NONE, encoding='utf-8')
artist_df.columns = ['artist', 'songs', 'link', 'genre', 'genres']

In [None]:
# Sanity check: (rows, columns) of the loaded artists frame.
artist_df.shape

In [None]:
# Preview the first rows of both frames. `.head` has to be called: a bare `.head`
# only evaluates to the bound method. A cell displays just its last expression,
# so the first frame is printed explicitly.
print(artist_df.head())
lyric_df.head()

In [None]:
# just getting english lyrics and a temporary way of removing some of the dodgy data due to delimiter issue
# Keep only English lyrics; this doubles as a temporary filter for rows whose columns
# were shifted by the delimiter issue. .copy() makes the result an independent frame,
# so adding a column below does not raise SettingWithCopyWarning.
lyric_df = lyric_df.loc[lyric_df['lang'] == 'ENGLISH'].copy()

# Build a link -> main genre lookup. An artist link may appear on more than one row
# of artist_df, so keep only its first listing to get exactly one genre per link.
genre_by_link = (
    artist_df.dropna(subset=['link', 'genre'])
    .drop_duplicates(subset='link', keep='first')
    .set_index('link')['genre']
)

# Vectorised lookup: every lyric row gets its artist's genre as a plain string, or NaN
# when the artist link has no match. The earlier row loop str()-cast a whole Series,
# which stored the Series repr (several genres, or an empty-Series message) as the label.
lyric_df['genre'] = lyric_df['alink'].map(genre_by_link)

# Rows without a matching artist have no usable class label, so drop them.
lyric_df = lyric_df.dropna(subset=['genre'])

# Kept for any cell that still expects the plain list of genres, aligned with lyric_df rows.
lyrics_genres = lyric_df['genre'].tolist()


In [None]:
# now that there's genre column will encode to use as class/ifier
lyric_data_encoded = lyric_df.apply(lenc.fit_transform, axis=0)
lyric_data_encoded1 = lyric_data_encoded.loc[lyric_data_encoded['genre'] == 0]
lyric_data_encoded2 = lyric_data_encoded.loc[lyric_data_encoded['genre'] == 1]
lyric_data_encoded = pd.concat([lyric_data_encoded1, lyric_data_encoded2])
lyric_data_encoded.head
# because of genre extraction issues, there are more genres appearing than there should be, may filter out non standard, ignore etc 

In [10]:
# Convert the encoded frame to a plain NumPy array for the hand-written classifier.
encoded_lyric_data = lyric_data_encoded.to_numpy()

In [None]:
# assigning lyrics and genre to info and class for later use
lyric_info = encoded_lyric_data[:, :-1]
lyric_classes = encoded_lyric_data[:, -1]

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    lyric_info,
    lyric_classes,
    test_size=0.40,
    random_state=42,
)

In [13]:
def get_prior_prob(y, label):
  """Return the prior probability of `label`: the fraction of entries in `y` equal to it.

  Parameters:
    y: 1-D NumPy array of class labels.
    label: the class value to count.

  Returns:
    A float in [0, 1]; 0.0 when `y` is empty.
  """
  total = y.shape[0]
  if total == 0:
    # No samples at all: avoid dividing by zero.
    return 0.0
  actual = np.sum(y == label)

  # A probability is matching count over total count (the inverse ratio is >= 1).
  return actual / total


In [14]:
def conditional_prob(X_train, y_train, feature_col, feature_val, label, alpha=0.0):
  """Return P(feature == feature_val | class == label) estimated from the training data.

  Parameters:
    X_train: 2-D NumPy array of training features (rows are samples).
    y_train: 1-D NumPy array of class labels, aligned with the rows of X_train.
    feature_col: index of the feature column to inspect.
    feature_val: the feature value whose likelihood is wanted.
    label: the class to condition on.
    alpha: optional additive (Laplace) smoothing strength. The default 0.0 gives
      the plain relative frequency.

  Returns:
    A float in [0, 1]; 0.0 when there is nothing to estimate from (no rows for
    `label` and no smoothing).
  """
  X_filtered = X_train[y_train == label]
  num = np.sum(X_filtered[:, feature_col] == feature_val)
  denom = X_filtered.shape[0]

  if alpha:
    # Additive smoothing: pretend every distinct value of this feature was seen
    # `alpha` extra times within the class, so unseen values are not exactly zero.
    n_values = np.unique(X_train[:, feature_col]).size
    num = num + alpha
    denom = denom + alpha * n_values

  if denom == 0:
    # No training rows for this label: avoid 0/0 producing nan.
    return 0.0

  return num / denom


In [15]:
def predict(X_train, y_train, X_test):
  """Predict the class of one sample with a categorical naive Bayes rule.

  For every class in `y_train`, the per-feature conditional probabilities of the
  sample's values are multiplied together and scaled by the class prior; the class
  with the largest resulting score is returned.

  Parameters:
    X_train: 2-D NumPy array of training features (rows are samples).
    y_train: 1-D NumPy array of class labels, aligned with the rows of X_train.
    X_test: a single sample, indexable by feature position.

  Returns:
    The class label with the highest score, or None when `y_train` has no classes.
  """
  classes = np.unique(y_train)
  if classes.size == 0:
    # Nothing to choose from.
    return None
  features = X_train.shape[1]

  posterior_prob = []

  for label in classes:
    chance = 1.0
    for feature in range(features):
      cond = conditional_prob(X_train, y_train, feature, X_test[feature], label)
      chance = chance * cond
    prior = get_prior_prob(y_train, label)
    posterior_prob.append(chance * prior)

  # Choose only after every class has been scored; deciding inside the loop would
  # always pick the first class. Map the winning position back to its label so the
  # result is correct even when labels are not 0..k-1.
  most_likely = np.argmax(posterior_prob)

  return classes[most_likely]

In [16]:
def get_accuracy(X_train, y_train, X_test, y_test):
  """Return the fraction of test rows whose predicted class equals the true class.

  Each row of `X_test` is classified with `predict` against the training data, and
  the predictions are compared element-wise with `y_test`.
  """
  n_samples = X_test.shape[0]
  class_preds = np.array(
    [predict(X_train, y_train, X_test[row]) for row in range(n_samples)]
  )

  hits = np.sum(class_preds == y_test)
  return hits / class_preds.shape[0]


In [17]:
# Score the classifier on the held-out split; every test row is predicted one by one.
acc = get_accuracy(X_train, y_train, X_test, y_test)

In [None]:
# Show the accuracy as a fraction of correctly classified test rows.
print(acc)