<a href="https://colab.research.google.com/github/rbqpark/tinger/blob/main/RomanceLyricsAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Uses a Random Forest model to classify song lyrics as romance/non-romance**

In [None]:
# import NumPy, Pandas, and standard-library helpers (re, hashlib)
import numpy as np
import pandas as pd
import re
import hashlib
 
# download NLTK data packages (tagger, WordNet, stopwords, punkt) - these are cached locally on your machine
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# import NLTK text-processing utilities
from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer     # parsing/stemmer
from nltk.tag import pos_tag            # parts-of-speech tagging
from nltk.corpus import wordnet         # sentiment scores
from nltk.stem import WordNetLemmatizer # stem and context
from nltk.corpus import stopwords       # stopwords
from nltk.util import ngrams            # ngram iterator

# import word2vec
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec

# import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Import English dataset - the first 100 rows are used for training, the rest are unlabelled
link = 'https://drive.google.com/file/d/1xfCgqZg0LTq9IKIEP5Ckm3B0Lid5EYLX/view?usp=sharing'
# NOTE: `id` shadows the builtin; the name is kept so later cells that may read it still work.
id = '1xfCgqZg0LTq9IKIEP5Ckm3B0Lid5EYLX'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('Filename.csv')
# Parse the CSV once and derive the training slice from the full frame.
english_playlist = pd.read_csv('Filename.csv')[['Song Title', 'Lyrics', 'Romance']]
english_playlist_train = english_playlist.head(100)

# Import Mandarin dataset - the first 100 rows are used for training, the rest are unlabelled
link = 'https://drive.google.com/file/d/1U3m4k6HvKgS2S48S7Zwt8qZH5oxdpnC4/view?usp=sharing'
id = '1U3m4k6HvKgS2S48S7Zwt8qZH5oxdpnC4'
downloaded = drive.CreateFile({'id': id})
# The download reuses 'Filename.csv', overwriting the English file read above.
downloaded.GetContentFile('Filename.csv')
# Bind the full frame to `mandarin_playlist` first: the rename below and
# `full_frames` further down both read that name (it was previously never
# assigned, which raised NameError).
mandarin_playlist = pd.read_csv('Filename.csv')[['Song Title', 'Translated Lyrics ', 'Romance']]
# The source column name carries a trailing space; normalise it to match the English frame.
mandarin_playlist = mandarin_playlist.rename(columns={'Translated Lyrics ': 'Lyrics'})
mandarin_playlist_train = mandarin_playlist.head(100)

# Join the two training slices into one labelled dataset
frames = [english_playlist_train, mandarin_playlist_train]
data = pd.concat(frames, ignore_index=True)
data = data.dropna()
data = data.astype({'Romance': 'int64'})


# Join the two complete playlists.
# NOTE(review): dropna() removes every row with a missing value in any of the
# three columns, so rows whose 'Romance' label is empty would be dropped here
# too - confirm that is intended for the unlabelled songs.
full_frames = [english_playlist, mandarin_playlist]
full_data = pd.concat(full_frames, ignore_index=True)
full_data = full_data.dropna()
full_data = full_data.astype({'Romance': 'int64'})


In [None]:
ps = PorterStemmer()
wnl = WordNetLemmatizer()
eng_stopwords = set(stopwords.words("english"))


def lyrics_cleaner(lyrics, lemmatize=True, stem=False):
    """Normalise raw lyrics into a single space-separated string of tokens.

    Steps: strip a fixed set of punctuation characters, lower-case, split on
    whitespace, drop English stopwords, then either lemmatize (WordNet) or
    stem (Porter) each remaining token according to the flags.

    Args:
        lyrics: Raw lyrics text.
        lemmatize: Lemmatize each token with ``wnl`` (default).
        stem: Stem each token with ``ps`` instead of lemmatizing.

    Returns:
        The cleaned tokens joined by single spaces.

    Raises:
        RuntimeError: If both ``lemmatize`` and ``stem`` are requested.

    Note:
        The flags used to be ignored: every token was lemmatized *and* then
        stemmed, i.e. exactly the combination this function rejects. They are
        now honoured, so the default call lemmatizes only.
    """
    if lemmatize and stem:
        raise RuntimeError("May not pass both lemmatize and stem flags")

    # Remove punctuation (colon, double quote, period, !, ?, backslash, hyphen)
    lyrics = re.sub(r'[:"".!?\\-]', '', lyrics)

    # Tokenize into lower-case words and drop stopwords
    words = [w for w in lyrics.lower().split() if w not in eng_stopwords]

    # Apply at most one normalisation, as the flags promise
    if lemmatize:
        words = [wnl.lemmatize(w) for w in words]
    elif stem:
        words = [ps.stem(w) for w in words]

    # Join the tokens back into one sentence
    return ' '.join(words)

In [None]:
def get_vectorizer(ngram, max_features):
    """Build a word-level CountVectorizer that cleans text with lyrics_cleaner.

    Args:
        ngram: Upper bound of the n-gram range; the lower bound is fixed at 1.
        max_features: Passed through as the vectorizer's ``max_features``.

    Returns:
        An unfitted ``CountVectorizer``.
    """
    options = {
        "ngram_range": (1, ngram),
        "analyzer": "word",
        "preprocessor": lyrics_cleaner,
        "max_features": max_features,
    }
    return CountVectorizer(**options)

# Model training
def _to_dense(features):
    """Return *features* as a NumPy array, densifying anything that is not one."""
    if isinstance(features, np.ndarray):
        return features
    return features.toarray()


def train_predict_sentiment(lyrics, vectorizer, y=None, ngram=1, max_features=1000, model_random_state=0):
    """Fit a random forest on vectorized lyrics and report hold-out accuracy.

    The data is split 80/20 (fixed ``random_state=0``), the vectorizer is fit
    on the training part only, and a 50-tree ``RandomForestClassifier`` is
    trained on the resulting features. Training/validation accuracy and the
    validation confusion matrix are printed.

    Args:
        lyrics: Collection of raw lyric strings.
        vectorizer: Object exposing ``fit_transform`` and ``transform`` that
            returns either a NumPy array or something with ``.toarray()``.
        y: Labels aligned with ``lyrics``. When omitted, the notebook-level
            ``data["Romance"]`` is looked up at call time (previously it was
            captured once when the function was defined and could go stale).
        ngram: Unused; accepted for backward compatibility.
        max_features: Unused; accepted for backward compatibility.
        model_random_state: Seed for the random forest.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    if y is None:
        y = data["Romance"]

    print("Creating the model!\n")

    # train / test split
    X_train, X_test, y_train, y_test = train_test_split(lyrics, y, random_state=0, test_size=.2)

    # Learn the vocabulary / features on the training split only, then apply
    # the same transformation to the held-out split.
    train_bag = _to_dense(vectorizer.fit_transform(X_train))
    test_bag = _to_dense(vectorizer.transform(X_test))

    print("Training the random forest classifier!\n")
    # Initialize a Random Forest classifier with 50 trees
    forest = RandomForestClassifier(n_estimators=50, random_state=model_random_state)

    # Fit the forest to the training set, using the vectorized lyrics as
    # features and the romance labels as the target variable
    forest = forest.fit(train_bag, y_train)

    # predict
    train_predictions = forest.predict(train_bag)
    test_predictions = forest.predict(test_bag)

    # validation
    train_acc = metrics.accuracy_score(y_train, train_predictions)
    valid_acc = metrics.accuracy_score(y_test, test_predictions)

    print(" The training accuracy is: ", train_acc, "\n", "The validation accuracy is: ", valid_acc)
    print()
    print('CONFUSION MATRIX:')
    print('         Predicted')
    print('          neg pos')
    print(' Actual')
    # Pin the label order so the matrix is always 2x2, even when the hold-out
    # split happens to contain a single class (otherwise c[1] would not exist).
    # Assumes labels are encoded as 0 (neg) / 1 (pos), as the printout implies.
    c = confusion_matrix(y_test, test_predictions, labels=[0, 1])
    print('     neg  ', c[0])
    print('     pos  ', c[1])

    return forest

# Return the n most important features of a fitted forest
def top_features(forest, vectorizer, n):
    """Return the names of the ``n`` features the forest ranks as most important.

    Args:
        forest: Fitted model exposing ``feature_importances_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            (newer scikit-learn) or ``get_feature_names()`` (older releases).
        n: Number of feature names to return.

    Returns:
        A list of at most ``n`` feature names, most important first.
    """
    # Report the actual count rather than a hard-coded "TEN"
    print('\nTOP {} IMPORTANT FEATURES:'.format(n))

    # Fetch the vocabulary once; the older accessor was removed from
    # scikit-learn, so prefer the newer one when it exists.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(map(str, vectorizer.get_feature_names_out()))
    else:
        feature_names = list(map(str, vectorizer.get_feature_names()))

    # Indices of features sorted by descending importance
    ranked = np.argsort(forest.feature_importances_)[::-1]

    return [feature_names[ind] for ind in ranked[:n]]

# Print out whether the prediction is accurate
def check_prediction(model, vectorizer, review, expected):
    """Classify one text and print it with the verdict and a correctness tag.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted transformer exposing ``transform``.
        review: The text to classify.
        expected: The label the prediction is compared against.
    """
    features = vectorizer.transform([review])
    predicted = model.predict(features)[0]

    # Thumbs up for a truthy prediction, thumbs down otherwise
    if predicted:
        verdict = "\n👍"
    else:
        verdict = "\n👎"

    # ANSI green when the prediction matches, red when it does not
    if predicted == expected:
        outcome = "\x1b[92mcorrect\x1b[0m"
    else:
        outcome = "\x1b[31mincorrect\x1b[0m"

    print("{} ⟶ {} {}".format(review, verdict, outcome))

In [None]:
# Bag-of-words baseline: unigram counts with the vocabulary capped at 100 terms.
vectorizer = get_vectorizer(1, 100)
# NOTE: train_predict_sentiment does not read max_features; the cap that
# actually applies is the one given to get_vectorizer above.
forest_model = train_predict_sentiment(data["Lyrics"], vectorizer, max_features=100)
# Names of the ten vocabulary terms with the highest forest importance.
top_10 = top_features(forest_model, vectorizer, 10)
print(top_10)

Creating the model!

Training the random forest classifier!

 The training accuracy is:  1.0 
 The validation accuracy is:  0.7

CONFUSION MATRIX:
         Predicted
          neg pos
 Actual
     neg   [11  9]
     pos   [ 3 17]

TOP TEN IMPORTANT FEATURES:
['love', 'like', 'face', 'peopl', 'world', 'go', 'way', 'want', 'light', 'miss']


In [None]:
# Vectorize every song in full_data with the already-fitted bag-of-words
# vectorizer (transform only - the vocabulary is not re-learned here).
data_vectorized = vectorizer.transform(full_data['Lyrics'])
# Densify unless the vectorizer already returned a NumPy array.
if not isinstance(data_vectorized, np.ndarray):
        data_vectorized = data_vectorized.toarray()
full_predictions = forest_model.predict(data_vectorized)
# Display the predictions indexed by song title (second positional argument is the index).
pd.DataFrame(full_predictions, full_data['Song Title'])

Unnamed: 0_level_0,0
Song Title,Unnamed: 1_level_1
We Are Young,0
Save Your Tears,1
Blinding Lights,1
Rain on me,0
Intentions,1
...,...
水调歌头,0
淒美地,1
你是如此难以忘记,1
致姍姍來遲的你,0


In [None]:
romantic_song = "I found a love for me Oh darling, just dive right in and follow my lead Well, I found a girl, beautiful and sweet Oh, I never knew you were the someone waiting for me 'Cause we were just kids when we fell in love Not knowing what it was I will not give you up this time But darling, just kiss me slow, your heart is all I own And in your eyes, you're holding mine Baby, I'm dancing in the dark with you between my arms Barefoot on the grass, listening to our favourite song When you said you looked a mess, I whispered underneath my breath But you heard it, darling, you look perfect tonight Well I found a woman, stronger than anyone I know She shares my dreams, I hope that someday I'll share her home I found a love, to carry more than just my secrets To carry love, to carry children of our own We are still kids, but we're so in love Fighting against all odds I know we'll be alright this time Darling, just hold my hand Be my girl, I'll be your man I see my future in your eyes Baby, I'm dancing in the dark, with you between my arms Barefoot on the grass, listening to our favorite song When I saw you in that dress, looking so beautiful I don't deserve this, darling, you look perfect tonight Baby, I'm dancing in the dark, with you between my arms Barefoot on the grass, listening to our favorite song I have faith in what I see Now I know I have met an angel in person And she looks perfect I don't deserve this You look perfect tonight"
# Spot check: the bag-of-words forest is expected to label this sample as romance (1).
check_prediction(forest_model, vectorizer, romantic_song, 1)

I found a love for me Oh darling, just dive right in and follow my lead Well, I found a girl, beautiful and sweet Oh, I never knew you were the someone waiting for me 'Cause we were just kids when we fell in love Not knowing what it was I will not give you up this time But darling, just kiss me slow, your heart is all I own And in your eyes, you're holding mine Baby, I'm dancing in the dark with you between my arms Barefoot on the grass, listening to our favourite song When you said you looked a mess, I whispered underneath my breath But you heard it, darling, you look perfect tonight Well I found a woman, stronger than anyone I know She shares my dreams, I hope that someday I'll share her home I found a love, to carry more than just my secrets To carry love, to carry children of our own We are still kids, but we're so in love Fighting against all odds I know we'll be alright this time Darling, just hold my hand Be my girl, I'll be your man I see my future in your eyes Baby, I'm dancin

In [None]:
non_romantic_song = "Feeling my way through the darkness Guided by a beating heart I can't tell where the journey will end But I know where to start They tell me I'm too young to understand They say I'm caught up in a dream Well life will pass me by if I don't open up my eyes Well that's fine by me So wake me up when it's all over When I'm wiser and I'm older All this time I was finding myself And I didn't know I was lost So wake me up when it's all over When I'm wiser and I'm older All this time I was finding myself And I didn't know I was lost I tried carrying the weight of the world But I only have two hands Hope I get the chance to travel the world But I don't have any plans Wish that I could stay forever this young Not afraid to close my eyes Life's a game made for everyone And love is a prize So wake me up when it's all over When I'm wiser and I'm older All this time I was finding myself And I didn't know I was lost So wake me up when it's all over When I'm wiser and I'm older All this time I was finding myself And I didn't know I was lost I didn't know I was lost I didn't know I was lost I didn't know I was lost I didn't know, I didn't know, I didn't know"
# Spot check: the bag-of-words forest is expected to label this sample as non-romance (0).
check_prediction(forest_model, vectorizer, non_romantic_song, 0)

Feeling my way through the darkness Guided by a beating heart I can't tell where the journey will end But I know where to start They tell me I'm too young to understand They say I'm caught up in a dream Well life will pass me by if I don't open up my eyes Well that's fine by me So wake me up when it's all over When I'm wiser and I'm older All this time I was finding myself And I didn't know I was lost So wake me up when it's all over When I'm wiser and I'm older All this time I was finding myself And I didn't know I was lost I tried carrying the weight of the world But I only have two hands Hope I get the chance to travel the world But I don't have any plans Wish that I could stay forever this young Not afraid to close my eyes Life's a game made for everyone And love is a prize So wake me up when it's all over When I'm wiser and I'm older All this time I was finding myself And I didn't know I was lost So wake me up when it's all over When I'm wiser and I'm older All this time I was fin

In [None]:
# Train Word2Vec on the labelled lyrics, each tokenized with gensim's
# simple_preprocess, using a single worker thread.
# NOTE(review): `size=` is the gensim 3.x keyword for the vector dimension;
# gensim 4 renamed it to `vector_size` - confirm the pinned gensim version.
# NOTE(review): the embeddings are learned from all of data['Lyrics'], which
# includes the rows train_predict_sentiment later holds out for validation.
w2v_model = Word2Vec(sentences=[utils.simple_preprocess(lyrics) for lyrics in data['Lyrics']], size=100, workers=1)

In [None]:
def get_avg_feature_vecs(reviews, model):
    """Average the word vectors of each text into one feature row per text.

    Each text is tokenized with ``utils.simple_preprocess``; tokens missing
    from the model's vocabulary are skipped and the remaining word vectors
    are averaged.

    Args:
        reviews: Sized collection of raw text strings.
        model: Trained Word2Vec model; only ``model.wv`` is used.

    Returns:
        A 2-D array with one row per text and ``model.wv.vector_size``
        columns. A text with no in-vocabulary token yields an all-zero row
        (previously this produced a NaN of the wrong shape and the final
        concatenation failed). An empty ``reviews`` yields a ``(0, dim)`` array.
    """
    # Membership and lookup go through the keyed vectors directly, which
    # avoids the version-specific `index2word` attribute.
    keyed_vectors = model.wv
    dim = keyed_vectors.vector_size

    review_feature_vecs = []
    for counter, review in enumerate(reviews, start=1):

        # Print a status message every 5000th review
        if counter % 5000 == 0:
            print("Review %d of %d" % (counter, len(reviews)))

        # Collect the vector of every token the model knows
        word_vecs = [keyed_vectors[word]
                     for word in utils.simple_preprocess(review)
                     if word in keyed_vectors]

        if word_vecs:
            # Average the word vectors
            review_feature_vecs.append(np.mean(word_vecs, axis=0))
        else:
            # No known token: fall back to a neutral zero vector
            review_feature_vecs.append(np.zeros(dim, dtype=np.float32))

    if not review_feature_vecs:
        return np.empty((0, dim), dtype=np.float32)
    return np.vstack(review_feature_vecs)

# Wrap the averaging function so it can be used wherever a
# fit_transform/transform-style vectorizer is expected.
w2v_vectorizer = FunctionTransformer(lambda x: get_avg_feature_vecs(x, w2v_model))

In [None]:
# Same training routine, but with averaged Word2Vec vectors as features.
# NOTE: max_features is not read by train_predict_sentiment, so it has no effect here.
w2v_forest_model = train_predict_sentiment(data['Lyrics'], w2v_vectorizer, max_features=100)

Creating the model!

Training the random forest classifier!

 The training accuracy is:  1.0 
 The validation accuracy is:  0.525

CONFUSION MATRIX:
         Predicted
          neg pos
 Actual
     neg   [11  9]
     pos   [10 10]


In [None]:
# The Word2Vec forest must be fed Word2Vec features: use w2v_vectorizer, not
# the bag-of-words vectorizer. Both produce 100 columns, so the mismatch ran
# without error but scored the song on unrelated features.
check_prediction(w2v_forest_model, w2v_vectorizer, non_romantic_song, 0)

Feeling my way through the darkness Guided by a beating heart I can't tell where the journey will end But I know where to start They tell me I'm too young to understand They say I'm caught up in a dream Well life will pass me by if I don't open up my eyes Well that's fine by me So wake me up when it's all over When I'm wiser and I'm older All this time I was finding myself And I didn't know I was lost So wake me up when it's all over When I'm wiser and I'm older All this time I was finding myself And I didn't know I was lost I tried carrying the weight of the world But I only have two hands Hope I get the chance to travel the world But I don't have any plans Wish that I could stay forever this young Not afraid to close my eyes Life's a game made for everyone And love is a prize So wake me up when it's all over When I'm wiser and I'm older All this time I was finding myself And I didn't know I was lost So wake me up when it's all over When I'm wiser and I'm older All this time I was fin

In [None]:
# The Word2Vec forest must be fed Word2Vec features: use w2v_vectorizer, not
# the bag-of-words vectorizer. Both produce 100 columns, so the mismatch ran
# without error but scored the song on unrelated features.
check_prediction(w2v_forest_model, w2v_vectorizer, romantic_song, 1)

I found a love for me Oh darling, just dive right in and follow my lead Well, I found a girl, beautiful and sweet Oh, I never knew you were the someone waiting for me 'Cause we were just kids when we fell in love Not knowing what it was I will not give you up this time But darling, just kiss me slow, your heart is all I own And in your eyes, you're holding mine Baby, I'm dancing in the dark with you between my arms Barefoot on the grass, listening to our favourite song When you said you looked a mess, I whispered underneath my breath But you heard it, darling, you look perfect tonight Well I found a woman, stronger than anyone I know She shares my dreams, I hope that someday I'll share her home I found a love, to carry more than just my secrets To carry love, to carry children of our own We are still kids, but we're so in love Fighting against all odds I know we'll be alright this time Darling, just hold my hand Be my girl, I'll be your man I see my future in your eyes Baby, I'm dancin