In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import csv

In [2]:
lenc = LabelEncoder()

In [3]:
lyric_df = pd.read_csv('lyrics-data.csv',  usecols=[0,1,2,4], header = None, delimiter=",", quoting=csv.QUOTE_NONE, encoding='utf-8')
lyric_df.columns = ['alink', 'song', 'lyric', 'lang']
# for both, data has commas inside which is messing with delimiter, come back to this

In [4]:
lyric_df.shape

(210263, 4)

In [5]:
artist_df = pd.read_csv('artists-data.csv', usecols=[0,1,3,4,5],header = None, delimiter=",", quoting=csv.QUOTE_NONE, encoding='utf-8')
artist_df.columns = ['artist', 'songs', 'link', 'genre', 'genres']

In [6]:
artist_df.shape

(3243, 5)

In [7]:
artist_df.head
lyric_df.head

<bound method NDFrame.head of                    alink  ...                                               lang
0                  ALink  ...                                              Idiom
1        /10000-maniacs/  ...                                            ENGLISH
2        /10000-maniacs/  ...                                               baby
3        /10000-maniacs/  ...   I promise. Will the whole world be warm as th...
4        /10000-maniacs/  ...   ""O my mountain has coal veins and beds to di...
...                  ...  ...                                                ...
210258  /zeca-pagodinho/  ...                     iaiá. Você me jogou um feitiço
210259  /zeca-pagodinho/  ...                         um desejo a mais. Veja bem
210260  /zeca-pagodinho/  ...                                              palma
210261  /zeca-pagodinho/  ...   cadê a samba?. Está mangando na curimba. Está...
210262  /zeca-pagodinho/  ...                                                 "

In [8]:
# just getting english lyrics and a temporary way of removing dodgy data due to delimiter issue
lyric_df = lyric_df.loc[lyric_df['lang'] == 'ENGLISH']

# making a list of genres to match up with rows of df that can then add as a column
lyrics_genres = []
for index, row in lyric_df.iterrows():
  # below may be returning a series, currently just string casting may want to handle other way later
  main_genre = artist_df.loc[artist_df['link'] == row['alink']]['genre']
  lyrics_genres.append(str(main_genre))

# something not quite right getting multiple genres for some and none for others it seems
# print(lyrics_genres[250:260])

# adding genres to lyrics
lyric_df['genre'] = lyrics_genres


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [9]:
# now that there's genre column will encode to use as class/ifier
# print(lyric_df.loc(['genre']))
lyric_df.head
# lyric_df = lyric_df.loc[(lyric_df['genre'] == 'Rock') | (lyric_df['genre'] == 'Pop')]
lyric_data_encoded = lyric_df.apply(lenc.fit_transform, axis=0)
lyric_data_encoded1 = lyric_data_encoded.loc[lyric_data_encoded['genre'] == 0]
lyric_data_encoded2 = lyric_data_encoded.loc[lyric_data_encoded['genre'] == 1]
lyric_data_encoded = pd.concat([lyric_data_encoded1, lyric_data_encoded2])
lyric_data_encoded.head
# because of genre extraction issues, there are more genres appearing than there should be, may filter out non standard, ignore etc 

<bound method NDFrame.head of         alink   song  lyric  lang  genre
1           0   7330     16     0      0
8           0   8521     21     0      0
10          0    668      1     0      0
12          0    890      3     0      0
16          0   6934     15     0      0
...       ...    ...    ...   ...    ...
114663    244  11256   3770     0      1
114664    244  11466   3771     0      1
114671    244  12724   3772     0      1
114672    244  13158   3773     0      1
114673    244  13184   3774     0      1

[94 rows x 5 columns]>

In [10]:
encoded_lyric_data = lyric_data_encoded.values

In [11]:
# assigning lyrics and genre to info and class for later use
lyric_info = encoded_lyric_data[:, :-1]
# print(lyric_info.shape)
lyric_classes = encoded_lyric_data[:, -1]
lyric_classes
print(lyric_info.shape[1])

4


In [12]:
X_train,  X_test, y_train, y_test = train_test_split(
    lyric_info, lyric_classes, test_size=0.40, random_state=42
)
print(X_test)

[[  244    66  3750     0]
 [    0  6934    15     0]
 [  244  9435  3765     0]
 [  244  2260  3753     0]
 [    0  7330    16     0]
 [    0  8222    19     0]
 [  244  2260  3753     0]
 [  244 10860  3769     0]
 [    0   890     3     0]
 [  244  3626  3754     0]
 [  244  9389  3764     0]
 [    0 11226    28     0]
 [  244 11466  3771     0]
 [  244 12724  3772     0]
 [    0  2206     5     0]
 [    0  6934    15     0]
 [    0  4239    11     0]
 [    0  8521    21     0]
 [  244  6635  3759     0]
 [  244  6991  3762     0]
 [    0  2915     8     0]
 [  244  2260  3753     0]
 [  244  6635  3759     0]
 [    0  9254    23     0]
 [    0 10856    27     0]
 [    0  1724     4     0]
 [  244 10817  3768     0]
 [  244    66  3750     0]
 [    0  9370    24     0]
 [  244  6315  3758     0]
 [    0   722     2     0]
 [  244  1345  3751     0]
 [    0  1724     4     0]
 [  244  1487  3752     0]
 [  244  5616  3757     0]
 [    0  3244     9     0]
 [  244  3978  3755     0]
 

In [13]:
# for/using training data
def get_prior_prob(y, label):
  total = y.shape[0]
  actual = np.sum(y == label)

  return total / actual


In [14]:
# for/using training data
def conditional_prob(X_train, y_train, feature_col, feature_val, label):
  X_filtered = X_train[y_train == label]
  num = np.sum(X_filtered[:, feature_col] == feature_val)
  denom = X_filtered.shape[0]

  return num/denom


In [15]:
def predict(X_train, y_train, X_test):
  classes = np.unique(y_train)
  features = X_train.shape[1]
  # print(features)
  # prob per word to be certain class
  posterior_prob = []

  for label in classes:
    chance = 1.0
    for feature in range(features):
      cond = conditional_prob(X_train, y_train, feature, X_test[feature], label)
      chance = chance * cond 
    prior = get_prior_prob(y_train, label)
    posterior = chance * prior
    posterior_prob.append(posterior)

    most_likely = np.argmax(posterior_prob)

    return most_likely

In [16]:
def get_accuracy(X_train, y_train, X_test, y_test):
  preds = []
  for i in range (X_test.shape[0]):
    pred = predict(X_train, y_train, X_test[i])
    preds.append(pred)
  class_preds = np.array(preds)
  
  accuracy = np.sum(class_preds == y_test)/ class_preds.shape[0]

  return accuracy 


In [17]:
acc = get_accuracy(X_train, y_train, X_test, y_test)

In [18]:
print(acc)

0.47368421052631576
