In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import csv

In [3]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Shared label encoder instance; it is re-fitted every time fit_transform is
# called on it later in the notebook, so it only remembers the last column fitted.
lenc = LabelEncoder()

In [5]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
from nltk.stem.wordnet import WordNetLemmatizer

In [6]:
# Module-level stemmer and lemmatizer instances; PreProcessor.stem and
# PreProcessor.lemmatize look these names up as globals.
snow = SnowballStemmer('english')
wnlem = WordNetLemmatizer()

In [7]:
class PreProcessor:
    """Run a list of raw text documents through a fixed text-cleaning pipeline.

    Every stage stores its result on the instance and also returns it. Each
    stage reads the attribute written by an earlier stage, so the stages must
    be run in order; ``preprocess()`` does exactly that. Calling a stage before
    its input attribute has been populated fails because that attribute is
    still ``None``.
    """

    def __init__(self, raw_list):
        """Store the raw documents; every stage output starts as ``None``.

        Args:
            raw_list: iterable of strings, one per document.
        """
        self.raw_list = raw_list
        # One lowercased string per document.
        self.lowercased = None
        # One list of sentence strings per document.
        self.sent_tokenized = None
        # One list of word tokens per document (not consumed by later stages).
        self.word_tokenized = None
        # Sentences with punctuation stripped; sentences left empty are dropped.
        self.cleaned_special_chars = None
        # One space-joined string per document with stopwords removed.
        self.without_stopwords = None
        # One list of stemmed tokens per document.
        self.stemmed = None
        # One list of lemmatized tokens per document.
        self.lemmed = None

    def lowercase(self):
        """Lowercase every raw document and return the resulting list."""
        self.lowercased = [text.lower() for text in self.raw_list]
        return self.lowercased

    def tokenize_sentence(self):
        """Split each lowercased document into a list of sentences."""
        self.sent_tokenized = [sent_tokenize(doc) for doc in self.lowercased]
        return self.sent_tokenized

    def tokenize_word(self):
        """Split each lowercased document into a list of word tokens."""
        self.word_tokenized = [word_tokenize(doc) for doc in self.lowercased]
        return self.word_tokenized

    def clean_special_chars(self):
        """Strip every non-word, non-whitespace character from each sentence.

        Sentences that become empty after stripping are dropped. Returns one
        list of cleaned sentence strings per document.
        """
        cleaned_text = []
        for sentences in self.sent_tokenized:
            stripped = (re.sub(r'[^\w\s]', "", sentence) for sentence in sentences)
            cleaned_text.append([sentence for sentence in stripped if sentence != ""])
        self.cleaned_special_chars = cleaned_text
        return self.cleaned_special_chars

    def remove_stopwords(self):
        """Drop English stopwords and re-join each document into one string.

        Returns one space-joined string per document.
        """
        # Build the stopword set once per call: asking the corpus reader for
        # the list on every token is needlessly slow, and a set gives O(1)
        # membership tests instead of a linear scan.
        stop_set = set(stopwords.words('english'))
        without_stopwords = []
        for sentences in self.cleaned_special_chars:
            kept = [
                word
                for sentence in sentences
                for word in sentence.split(" ")
                if word not in stop_set
            ]
            without_stopwords.append(" ".join(kept))
        self.without_stopwords = without_stopwords
        return self.without_stopwords

    def stem(self):
        """Stem every space-separated token of each stopword-free document."""
        self.stemmed = [
            [snow.stem(word) for word in doc.split(" ")]
            for doc in self.without_stopwords
            if not isinstance(doc, list)  # skip any entry that is not a joined string
        ]
        return self.stemmed

    def lemmatize(self):
        """Lemmatize every space-separated token of each stopword-free document."""
        self.lemmed = [
            [wnlem.lemmatize(word) for word in doc.split(" ")]
            for doc in self.without_stopwords
            if not isinstance(doc, list)  # skip any entry that is not a joined string
        ]
        return self.lemmed

    def preprocess(self):
        """Run every stage in dependency order; results are left on the instance."""
        self.lowercase()
        self.tokenize_sentence()
        self.tokenize_word()
        self.clean_special_chars()
        self.remove_stopwords()
        self.stem()
        self.lemmatize()


In [8]:
# Read the first five columns of the headerless training CSV. Quote handling is
# disabled (QUOTE_NONE), so quote characters are kept as ordinary text.
lyric_df = pd.read_csv(
    'traincommasnewlinesremoved.csv',
    usecols=range(0, 5),
    header=None,
    delimiter=",",
    quoting=csv.QUOTE_NONE,
    encoding='utf-8',
)
lyric_df.columns = ['artist', 'song', 'genre', 'lang', 'lyrics']

In [None]:
# Only the last expression of a cell is displayed, so print the shape explicitly.
print(lyric_df.shape)
# head must be called; the bare attribute only shows the bound method's repr.
lyric_df.head()

In [10]:
# Start standardising the number of columns: find the longest lyric (Pink Floyd -
# The Wall, it seems) so every other lyric can be zero-padded to the same length.
lyric_df['lyrics'] = lyric_df['lyrics'].fillna("")
lyrics = lyric_df['lyrics'].tolist()
# Despite its name, max_lyric_len is the token list of the longest lyric, not a number.
max_lyric_len = max((text.split(" ") for text in lyrics), key=len)

In [11]:
# Number of token columns needed: the token count of the longest lyric
# (max_lyric_len holds that lyric's token list, so its len() is the width).
word_col_num = len(max_lyric_len)

In [None]:
def _pad_lyric_tokens(lyric_text, width, pad_value=0):
  """Split one lyric on single spaces and right-pad the tokens to `width` entries.

  A lyric with more than `width` tokens is returned unpadded and untruncated.
  """
  tokens = lyric_text.split(" ")
  return tokens + [pad_value] * (width - len(tokens))


# Padding every row uses too much RAM, so only the first MAX_PADDED_ROWS lyrics
# are processed. This single pass yields the same list as looping over rows
# [0:100000] and then [100000:200000] separately.
MAX_PADDED_ROWS = 200000
padded_split_lyrics = [
  _pad_lyric_tokens(lyric_text, word_col_num)
  for lyric_text in lyric_df['lyrics'].iloc[:MAX_PADDED_ROWS]
]

In [72]:
# lyric_df2 = pd.read_csv('testcommasnewlinesremoved.csv', usecols=[0,2,3,4],header = None, delimiter=",", quoting=csv.QUOTE_NONE, encoding='utf-8')
# lyric_df2.columns = ['Song', 'Artist', 'Genre', 'Lyrics']
# The dataset already ships as separate train and test files; we may initially combine them and re-split with sklearn's train_test_split.
# Only the first (train) file is used for now, since the test file is formatted slightly differently.

In [73]:
# Cast every column to str so all values share one type before label encoding.
# NOTE(review): any remaining missing values presumably become the literal
# string 'nan' here - confirm that is acceptable for artist/song/genre.
lyric_df = lyric_df.astype(str)

In [None]:
# lyric_text = lyric_df['lyrics']
# lyric_list = lyric_text.to_list()
# lyric_processor = PreProcessor(lyric_list)
# lyric_processor.preprocess()
# may return to preprocessing, atm may not be that useful

In [None]:
# stemmed_lyrics = lyric_processor.stemmed
# stemmed_joined = [" ".join(i) for i in stemmed_lyrics]
# stemmed_series = pd.Series(stemmed_joined)
# lyric_df.drop(['lyrics'], axis=1)
# lyric_df['lyrics'] = stemmed_series

In [90]:
# Now that there is a genre column, label-encode it to use as the class label.
# Order matters: the first line encodes every column of the still-unencoded frame
# (re-fitting the shared encoder per column), then 'genre' is overwritten in place.
# The final fit is on 'genre', so afterwards lenc only holds the genre classes.
lyric_data_encoded = lyric_df.apply(lenc.fit_transform, axis=0)
lyric_df['genre'] = lenc.fit_transform(lyric_df['genre'])
lyric_df.genre.values


array([ 8, 15, 15, ..., 13, 10, 13])

In [91]:
# Underlying array of the fully label-encoded frame.
encoded_lyric_data = lyric_data_encoded.values
# Not used by the cells below; left out of the model for now.

In [122]:
# Features and class labels for later use. Column 4 is 'lyrics' and column 2 is
# 'genre'. The 4:5 slice keeps lyric_info two-dimensional (a one-column frame);
# the plan is for the features to become every column past the metadata.
lyric_info = lyric_df.iloc[:, 4:5]
lyric_classes = lyric_df.iloc[:, 2]

In [123]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train,  X_test, y_train, y_test = train_test_split(
    lyric_info, lyric_classes, test_size=0.20, random_state=75
    )

In [None]:
# Display the training features as a sanity check on the split.
X_train

In [33]:
def get_prior_prob(y, label):
  """Return the prior probability P(label): the fraction of entries of y equal to label.

  Args:
    y: array-like of class labels exposing `.shape` and elementwise `==`.
    label: the class whose prior is wanted.

  Returns:
    A value in [0, 1]; 0.0 when y is empty or the label never occurs.
  """
  total = y.shape[0]
  if total == 0:
    return 0.0
  matches = np.sum(y == label)

  # The prior is matches / total; the inverse ratio is not a probability.
  return matches / total


In [34]:
def conditional_prob(X_train, y_train, feature_col, feature_val, label):
  """Return P(feature == feature_val | class == label) estimated from the training data.

  Args:
    X_train: 2-D array-like of features (NumPy array or pandas DataFrame).
    y_train: 1-D array-like of class labels aligned row-for-row with X_train.
    feature_col: positional index of the feature column.
    feature_val: value the feature is tested against.
    label: class to condition on.

  Returns:
    The fraction of rows of class `label` whose feature equals `feature_val`;
    0.0 when no training row has that class.
  """
  # Coerce to plain arrays so positional [:, col] indexing also works when the
  # caller passes pandas objects (e.g. straight from train_test_split).
  X_arr = np.asarray(X_train)
  y_arr = np.asarray(y_train)

  X_filtered = X_arr[y_arr == label]
  denom = X_filtered.shape[0]
  if denom == 0:
    return 0.0
  num = np.sum(X_filtered[:, feature_col] == feature_val)

  return num / denom


In [35]:
def predict(X_train, y_train, X_test):
  """Predict the class of a single sample with a naive Bayes classifier.

  For every class the posterior is the class prior multiplied by the
  per-feature conditional probabilities of the sample's values.

  Args:
    X_train: 2-D array-like of training features.
    y_train: 1-D array-like of training labels aligned with X_train.
    X_test: 1-D array-like holding one sample's feature values.

  Returns:
    The class label (an element of np.unique(y_train)) with the highest
    posterior. For labels encoded as 0..k-1 this equals the argmax index.

  Raises:
    ValueError: if y_train is empty, since there is no class to choose from.
  """
  X_train = np.asarray(X_train)
  y_train = np.asarray(y_train)
  X_test = np.asarray(X_test)

  classes = np.unique(y_train)
  features = X_train.shape[1]

  posterior_prob = []
  for label in classes:
    chance = 1.0
    for feature in range(features):
      chance *= conditional_prob(X_train, y_train, feature, X_test[feature], label)
    posterior_prob.append(chance * get_prior_prob(y_train, label))

  # Choose only after every class has been scored; deciding inside the loop
  # would always return the first class.
  return classes[np.argmax(posterior_prob)]

In [127]:
def get_accuracy(X_train, y_train, X_test, y_test):
  """Return the fraction of test samples whose predicted class matches y_test.

  Args:
    X_train: 2-D array-like of training features.
    y_train: 1-D array-like of training labels.
    X_test: 2-D array-like of test features, one sample per row.
    y_test: 1-D array-like of true labels aligned with X_test.

  Returns:
    Accuracy in [0, 1], or NaN when X_test has no rows.
  """
  # Coerce to arrays so X_test[i] is row i by position; on a DataFrame,
  # X_test[i] would be a column lookup by label instead.
  X_train = np.asarray(X_train)
  y_train = np.asarray(y_train)
  X_test = np.asarray(X_test)
  y_test = np.asarray(y_test)

  n_samples = X_test.shape[0]
  if n_samples == 0:
    return float("nan")  # accuracy is undefined without any test samples

  class_preds = np.array(
    [predict(X_train, y_train, X_test[i]) for i in range(n_samples)]
  )

  return np.sum(class_preds == y_test) / n_samples


In [None]:
# Score the classifier on the held-out split; predict() rescans the training
# data for every test row, so this cell can take a long time.
acc = get_accuracy(X_train, y_train, X_test, y_test)

In [None]:
# Show the held-out accuracy computed in the previous cell.
print(acc)