In [1]:
import pandas as pd 
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Unicode, Regex, json for text digestion
import unicodedata
import re
import json

import datetime
# Time formatting
from time import strftime

from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

# nltk: natural language toolkit -> tokenization, stopwords
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer as stemmer
# nltk.download('stopwords')
import nltk.sentiment
# Build one SentimentIntensityAnalyzer up front and bind it to `sia`.
# NOTE(review): presumably requires NLTK's VADER lexicon to be downloaded
# beforehand — confirm on a fresh environment.
sia = nltk.sentiment.SentimentIntensityAnalyzer()

# Silence every warning for the rest of the session to keep cell output readable.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used in this notebook.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import draft_prepare as p

In [3]:
# Load the raw song table from a CSV; the path is relative and hard-coded.
df = pd.read_csv('songs_0526.csv')
# Display the (rows, columns) shape of the raw frame.
df.shape

(23762, 5)

### Preparing the data:

In [4]:
# Run the project's cleaning step with empty extra_words / exclude_words lists.
# NOTE(review): `df` is rebound to the result, so re-running this cell feeds the
# already-cleaned frame back into clean_df — confirm that is harmless.
df = p.clean_df(df, extra_words = [], exclude_words = [])

In [5]:
# Apply the project's model-preparation step (rebinding `df` once more),
# then preview the first rows of the result.
df = p.model_clean(df)
df.head()

Unnamed: 0.1,Unnamed: 0,title,artist,date,lyrics,clean,stemmed,lemmatized,character_count,word_count,sentiment,decade
0,2,#1,Nelly,2001-10-20,uh uh uh got bring attention dirty better watc...,lyricsuh uh uh gotta bring attention dirty tha...,lyricsuh uh uh gotta bring attent dirti that b...,lyricsuh uh uh gotta bring attention dirty tha...,2090,391,0.9901,2000
1,4,#9 Dream,John Lennon,1974-12-21,long ago dream dream know yes know seemed real...,dream lyricsverse long ago dream dream know ye...,dream lyricsvers long ago dream dream know ye ...,dream lyricsverse long ago dream dream know ye...,864,147,0.9313,1970
2,5,#Beautiful,Mariah Carey Featuring Miguel,2013-05-25,ah ah beautiful ah ah beautiful hop back bike ...,beautiful lyricsintro mariah carey ah ah youre...,beauti lyricsintro mariah carey ah ah your bea...,beautiful lyricsintro mariah carey ah ah youre...,967,178,0.9981,2010
3,6,#SELFIE,The Chainsmokers,2014-03-15,jason table kept seeing look girl think make j...,selfie lyricsverse jason table kept seeing loo...,selfi lyricsvers jason tabl kept see look girl...,selfie lyricsverse jason table kept seeing loo...,970,184,0.8228,2010
4,7,#thatPOWER,will.i.am Featuring Justin Bieber,2013-04-06,oh alive alive alive oh fly fly fly oh alive a...,thatpower lyricsinstrumental break prechorus j...,thatpow lyricsinstrument break prechoru justin...,thatpower lyricsinstrumental break prechorus j...,1554,277,0.9978,2010


In [39]:
# Which five songs have the lowest sentiment scores?
df.sort_values('sentiment').head()

Unnamed: 0.1,Unnamed: 0,title,artist,date,lyrics,clean,stemmed,lemmatized,character_count,word_count,sentiment,decade
15813,18638,Red Opps,21 Savage,2017-01-07,yeah yeah yeah pull roll window pull roll wind...,red opps lyricsintro yeah yeah yeah pull roll ...,red opp lyricsintro yeah yeah yeah pull roll w...,red opps lyricsintro yeah yeah yeah pull roll ...,1919,359,-0.9999,2010
19942,23589,The War Song,Culture Club,1984-10-06,war war stupid people stupid love mean nothing...,war song lyricschorus war war stupid people st...,war song lyricschoru war war stupid peopl stup...,war song lyricschorus war war stupid people st...,1638,297,-0.9999,1980
1453,1724,BBO (Bad Bitches Only),Migos Featuring 21 Savage,2018-02-10,buddah bless beat ap iced tennis chain iced wh...,bbo bad bitches lyricsintro buddah bless beat ...,bbo bad bitch lyricsintro buddah bless beat ch...,bbo bad bitch lyricsintro buddah bless beat ch...,2728,512,-0.9999,2010
12666,14957,Make It Nasty,Tyga,2012-03-10,make nasty make nasty drop drop bitch make nas...,make nasty lyricschorus make nasty make nasty ...,make nasti lyricschoru make nasti make nasti d...,make nasty lyricschorus make nasty make nasty ...,1663,313,-0.9998,2010
14866,17521,Outta Your Mind,Lil Jon Featuring LMFAO,2010-07-17,yeaaaaah everybody club right standin around n...,outta mind lyricsintro lil jon yeaaaaah everyb...,outta mind lyricsintro lil jon yeaaaaah everyb...,outta mind lyricsintro lil jon yeaaaaah everyb...,2384,459,-0.9998,2010


In [38]:
# Which five songs have the highest sentiment scores?
df.sort_values('sentiment', ascending=False).head()

Unnamed: 0.1,Unnamed: 0,title,artist,date,lyrics,clean,stemmed,lemmatized,character_count,word_count,sentiment,decade
6973,8204,Got To Love Somebody,Sister Sledge,1980-01-19,got love somebody today got love somebody got ...,got love somebody lyricschorus ive got love so...,got love somebodi lyricschoru ive got love som...,got love somebody lyricschorus ive got love so...,2824,535,0.9999,1980
12321,14528,Love Me Good,Michael W. Smith,1998-04-25,sometimes feel like world one big gigantic mer...,love good lyricssometimes feel like world one ...,love good lyricssometim feel like world one bi...,love good lyricssometimes feel like world one ...,1368,264,0.9999,1990
12485,14722,Lovely Day,Bill Withers,1977-12-10,wake morning love sunlight hurt eye something ...,lovely day lyricsverse wake morning love sunli...,love day lyricsvers wake morn love sunlight hu...,lovely day lyricsverse wake morning love sunli...,1317,276,0.9999,1970
4354,5148,Do I Do,Stevie Wonder,1982-05-29,see street whole body get weak standing crowd ...,lyricsverse see street whole body gets weak yo...,lyricsvers see street whole bodi get weak your...,lyricsverse see street whole body get weak you...,2090,397,0.9999,1980
8368,9851,I Can Love You,Mary J. Blige,1997-07-26,love love love better love love love better si...,love lyricschorus mary j blige love love love ...,love lyricschoru mari j blige love love love l...,love lyricschorus mary j blige love love love ...,1640,313,0.9999,1990


## Topic Modeling

#### Latent Dirichlet Allocation:

In [6]:
# Corpus for topic modeling: the `lemmatized` text column.
# NOTE(review): the previewed rows of this column begin with fused scrape
# artifacts (e.g. "lyricsverse", "lyricsintro mariah carey") that the `lyrics`
# column does not show; they may end up as vocabulary terms — confirm this is
# the intended model input.
lemma_lyrics = df['lemmatized']
# Number of documents in the corpus.
lemma_lyrics.shape[0]

22210

In [7]:
# Topic-model settings.
# NOTE(review): n_samples is assigned but not used by any active line in this cell.
n_samples = 10_000
n_features = 2_500
n_topics = 10

# Vectorize the full lemmatized corpus; the sub-sampling line below is commented out.
dataset = lemma_lyrics
# data_samples = dataset.data[:n_samples]

# Raw term counts for LDA, with the vocabulary capped at n_features terms.
# max_df = 1.0 and min_df = 1 leave document-frequency pruning switched off.
tf_vectorizer  = CountVectorizer(max_df = 1.0, min_df = 1, 
                                 max_features = n_features)
tf = tf_vectorizer.fit_transform(dataset)
# Construct the LDA estimator. It is only created here, not fit; the bare
# `lda` expression below just echoes its repr as the cell output.
lda = LatentDirichletAllocation(n_components = n_topics, max_iter = 10, random_state = 42)

lda

LatentDirichletAllocation(random_state=42)

### Other methods using Scikit-Learn

In [8]:
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_hold = train_test_split(df, test_size=0.3, random_state=42)
# Row counts of the training and hold-out partitions.
len(X_train), len(X_hold)

(15547, 6663)

In [9]:
# Peek at a single row of the training partition.
X_train.head(1)

Unnamed: 0.1,Unnamed: 0,title,artist,date,lyrics,clean,stemmed,lemmatized,character_count,word_count,sentiment,decade
119,153,(The Lights Went Out In) Massachusetts,Bee Gees,1967-11-11,feel going back massachusetts something tellin...,lights went massachusetts lyricsfeel im goin b...,light went massachusett lyricsfeel im goin bac...,light went massachusetts lyricsfeel im goin ba...,332,49,0.0,1960


In [10]:
def tokenize(text):
    tokens = [word for word in nltk.word_tokenize(text) if (len(word) > 3 and len(word.strip('Xx/'))>2)]
#     stems = [stemmer.stem(letter) for letter in tokens]
    return tokens

In [11]:
# TF-IDF features learned from the training split only, using the custom tokenizer.
# This rebinds `tf_vectorizer` and `tf`, replacing the count-based objects
# created in the LDA setup cell.
tf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words='english',
    max_df=.75,
    min_df=50,
    max_features=2_500,
)
tf = tf_vectorizer.fit_transform(X_train['lemmatized'])

In [12]:
# Preview the TF-IDF matrix as a dense array. The explicit `.toarray()` is used
# instead of the `.A` shorthand, which newer SciPy versions deprecate/remove for
# sparse arrays; the displayed result is the same.
# NOTE(review): this densifies the whole matrix just for a preview — consider
# `tf.shape` or `tf[:5].toarray()` if memory becomes a concern.
tf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Sentiment