In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import csv

In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [3]:
lenc = LabelEncoder()

In [4]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
from nltk.stem.wordnet import WordNetLemmatizer

In [5]:
snow = SnowballStemmer('english')
wnlem = WordNetLemmatizer()

In [6]:
class PreProcessor:
    def __init__(self, raw_list):
        self.raw_list = raw_list
        self.lowercased = None
        self.sent_tokenized = None
        self.word_tokenized = None
        self.cleaned_special_chars = None
        self.without_stopwords = None
        self.stemmed = None
        self.lemmed = None

    def lowercase(self):
        lowercased = []
        for text in self.raw_list:
            lowercased.append(text.lower())
        self.lowercased = lowercased
        return self.lowercased

    def tokenize_sentence(self):
        self.sent_tokenized = [sent_tokenize(sent) for sent in self.lowercased]
        return self.sent_tokenized
    
    def tokenize_word(self):
        self.word_tokenized = [word_tokenize(word) for word in self.lowercased]
        return self.word_tokenized

    def clean_special_chars(self):
        cleaned_text = []
        for sentence in self.sent_tokenized:
          cleaned_sent = []
          for word in sentence:
            clean = re.sub(r'[^\w\s]', "", word)
            if clean != "":
              cleaned_sent.append(clean)
          cleaned_text.append(cleaned_sent)
        self.cleaned_special_chars = cleaned_text
        return self.cleaned_special_chars


    def remove_stopwords(self):
        without_stopwords = []
        for sentences in self.cleaned_special_chars:
            no_stop = []
            for words in sentences:
              for word in words.split(" "):
                if word not in stopwords.words('english'):
                  no_stop.append(word)
            without_stopwords.append(" ".join(no_stop))
        self.without_stopwords = without_stopwords
        return self.without_stopwords
    
    def stem(self):
        stemmed_list = []
        for elem in [i for i in self.without_stopwords if type(i)!=list]:
          stemmed_list.append([snow.stem(word) for word in elem.split(" ")])
        self.stemmed = stemmed_list
        return self.stemmed
    
    def lemmatize(self):
        lemmed = []
        for item in [i for i in self.without_stopwords if type(i)!=list]:
          lemmed.append([wnlem.lemmatize(word) for word in item.split(" ")])
        self.lemmed = lemmed
        return self.lemmed

    def preprocess(self):
        self.lowercase()
        self.tokenize_sentence()
        self.tokenize_word()
        self.clean_special_chars()
        self.remove_stopwords()
        self.stem()
        self.lemmatize()


In [7]:
lyric_df = pd.read_csv('lyrics-data.csv',  usecols=[0,1,3,4], header = None, delimiter=",", quoting=csv.QUOTE_NONE, encoding='utf-8')
lyric_df.columns = ['alink', 'song', 'lyric', 'lang']
# for both, data has commas inside which is messing with delimiter, come back to this

In [None]:
lyric_df.shape

In [9]:
artist_df = pd.read_csv('artists-data.csv', usecols=[0,1,3,4,5],header = None, delimiter=",", quoting=csv.QUOTE_NONE, encoding='utf-8')
artist_df.columns = ['artist', 'songs', 'link', 'genre', 'genres']

In [None]:
artist_df.shape

In [11]:
artist_df.head
lyric_df.head

<bound method NDFrame.head of                    alink  ...                                               lang
0                  ALink  ...                                              Idiom
1        /10000-maniacs/  ...                                            ENGLISH
2        /10000-maniacs/  ...                                               baby
3        /10000-maniacs/  ...   I promise. Will the whole world be warm as th...
4        /10000-maniacs/  ...   ""O my mountain has coal veins and beds to di...
...                  ...  ...                                                ...
210258  /zeca-pagodinho/  ...                     iaiá. Você me jogou um feitiço
210259  /zeca-pagodinho/  ...                         um desejo a mais. Veja bem
210260  /zeca-pagodinho/  ...                                              palma
210261  /zeca-pagodinho/  ...   cadê a samba?. Está mangando na curimba. Está...
210262  /zeca-pagodinho/  ...                                                 "

In [None]:
# just getting english lyrics and a temporary way of removing some of the dodgy data due to delimiter issue
lyric_df = lyric_df.loc[lyric_df['lang'] == 'ENGLISH']

# making a list of genres to match up with rows of df that can then add as a column
lyrics_genres = []
for index, row in lyric_df.iterrows():
  # below may be returning a series, currently just string casting may want to handle other way later
  main_genre = artist_df.loc[artist_df['link'] == row['alink']]['genre']
  lyrics_genres.append(str(main_genre))

# something not quite right getting multiple genres for some and none for others it seems

# adding genres to lyrics
lyric_df['genre'] = lyrics_genres


In [None]:
# lyric_df.head

In [25]:
lyric_text = lyric_df['lyric']
lyric_list = lyric_text.to_list()
lyric_processor = PreProcessor(lyric_list)
lyric_processor.preprocess()

In [None]:
stemmed_lyrics = lyric_processor.stemmed
stemmed_joined = [" ".join(i) for i in stemmed_lyrics]
stemmed_series = pd.Series(stemmed_joined)
lyric_df.drop(['lyric'], axis=1)
lyric_df['lyric'] = stemmed_series
lyric_df.head

In [None]:
# now that there's genre column will encode to use as class/ifier
lyric_data_encoded = lyric_df.apply(lenc.fit_transform, axis=0)
lyric_data_encoded1 = lyric_data_encoded.loc[lyric_data_encoded['genre'] == 0]
lyric_data_encoded2 = lyric_data_encoded.loc[lyric_data_encoded['genre'] == 1]
lyric_data_encoded = pd.concat([lyric_data_encoded1, lyric_data_encoded2])
lyric_data_encoded.head
# because of genre extraction issues, there are more genres appearing than there should be, may filter out non standard, ignore etc 

In [40]:
encoded_lyric_data = lyric_data_encoded.values

In [64]:
# assigning lyrics and genre to info and class for later use
lyric_info = encoded_lyric_data[:, 2:4]
lyric_classes = encoded_lyric_data[:, -1]

In [83]:
X_train,  X_test, y_train, y_test = train_test_split(
    lyric_info, lyric_classes, test_size=0.20, random_state=75
    )

In [84]:
def get_prior_prob(y, label):
  total = y.shape[0]
  actual = np.sum(y == label)

  return total / actual


In [80]:
def conditional_prob(X_train, y_train, feature_col, feature_val, label):
  X_filtered = X_train[y_train == label]
  num = np.sum(X_filtered[:, feature_col] == feature_val)
  denom = X_filtered.shape[0]

  return num/denom


In [68]:
def predict(X_train, y_train, X_test):
  classes = np.unique(y_train)
  features = X_train.shape[1]
  
  posterior_prob = []

  for label in classes:
    chance = 1.0
    for feature in range(features):
      cond = conditional_prob(X_train, y_train, feature, X_test[feature], label)
      chance = chance * cond 
    prior = get_prior_prob(y_train, label)
    posterior = chance * prior
    posterior_prob.append(posterior)

    most_likely = np.argmax(posterior_prob)

    return most_likely

In [69]:
def get_accuracy(X_train, y_train, X_test, y_test):
  preds = []
  for i in range (X_test.shape[0]):
    pred = predict(X_train, y_train, X_test[i])
    preds.append(pred)
  class_preds = np.array(preds)
  
  accuracy = np.sum(class_preds == y_test)/ class_preds.shape[0]

  return accuracy 


In [85]:
acc = get_accuracy(X_train, y_train, X_test, y_test)

In [None]:
print(acc)