In [9]:
import pandas as pd

def clean(sentence):
    return sentence.strip()

def remove_tags(sentence):
    regex = compile(r'<\/?[\w-]+>')
    return regex.sub('', sentence)

def remove_dash(sentence):
    regex = compile(r'-')
    return regex.sub('', sentence)

def get_word2idx(tokenized_sentences):
    word2idx ={}
    for sentence in tokenized_sentences:
        for word in sentence:
            if word not in word2idx:
                word2idx[word] = len(word2idx) + 1
    word2idx['UNK'] = len(word2idx)
    return word2idx

def get_index(word, word2idx):
    return word2idx[word] if word in word2idx else word2idx['UNK']

def get_embeddings(word2vec, word2idx, dim=300):
    embeddings = zeros((len(word2idx), dim))
    for (word, i) in word2idx.items():
        if word != 'UNK' and word in word2vec.index2word:
            embeddings[i] = word2vec.word_vec(word)
        else:
            pass
    return embeddings

def get_training_data(tokenized_sentences, classes, classes2, word2idx, maxlen=1000, class_num=4, class_num_two=4):
    train_x = array([[get_index(word, word2idx) for word in sentence] for sentence in tokenized_sentences])
    train_y = [cls for cls in classes]
    train_z = [cls for cls in classes2]

    train_x = pad_sequences(train_x, maxlen=maxlen, dtype='int32', padding='post', truncating='pre', value=0.)
    train_y = array([[1 if i == cls else 0 for i in range(0, class_num)] for cls in train_y])
    train_z = array([[1 if i == cls else 0 for i in range(0, class_num_two)] for cls in train_z])

    return (train_x, train_y, train_z)

def save_object(obj, filename):
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

def load_object(filename):
    r = {}
    with open(filename, 'rb') as f:
        r = pickle.load(f)
    return r

In [10]:
from sklearn.model_selection import train_test_split
mood = []
genre = []


def get_corpus(path):
    global mood, genre
    corpus = read_csv(path)
    corpus = corpus[['sentence', 'mood', 'genre']]
    corpus['sentence'] = corpus['sentence'].apply(clean).apply(remove_tags)
    corpus['tokenized_sentence'] = corpus['sentence'].apply(word_tokenize, engine='deepcut')
    corpus['genre'] = corpus['genre'].apply(remove_dash)
    mood += list(corpus.mood.unique())
    genre += list(corpus.genre.unique())
    return corpus

corpus = get_corpus('NLP-Corpus_yak.csv')
train_corpus = corpus.copy()
test_corpus = corpus.sample(n=7,replace=True).copy()


mood = sorted(list(set(mood)), reverse=True)
genre = sorted(list(set(genre)), reverse=True)
mood2idx = dict(zip(mood, [i for i in range(len(mood))]))
genre2idx = dict(zip(genre, [i for i in range(len(genre))]))

def pos_process(corpus):
    global mood2idx, genre2idx
    corpus['genre'] = corpus['genre'].apply(lambda x: genre2idx[x])
    corpus['mood'] = corpus['mood'].apply(lambda x: mood2idx[x])
    return corpus

train_corpus = pos_process(train_corpus)
test_corpus = pos_process(test_corpus)
corpus = pos_process(corpus)

print(mood, genre, corpus)

NameError: name 'read_csv' is not defined