In [1]:
import pandas as pd 
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Unicode, Regex, json for text digestion
import unicodedata
import re
import json

import datetime
# Time formatting
from time import strftime

from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

# nltk: natural language toolkit -> tokenization, stopwords
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer as stemmer
# nltk.download('stopwords')
import nltk.sentiment
sia = nltk.sentiment.SentimentIntensityAnalyzer()

# Quieeet!!! Y'all can't stop me now...
import warnings
warnings.filterwarnings('ignore')

In [2]:
import draft_prepare as p

In [3]:
df = pd.read_csv('songs_0526.csv')
df.shape

(23762, 5)

### Preparing the data:

In [4]:
# df = p.clean_df(df, extra_words = [], exclude_words = [])

In [5]:
df = p.model_clean(df)
df.head()

Unnamed: 0.1,Unnamed: 0,title,artist,date,lyrics,decade
0,2,#1,Nelly,2001-10-20,uh uh uh got bring attention dirty better watc...,2000
1,4,#9 Dream,John Lennon,1974-12-21,long ago dream dream know yes know seemed real...,1970
2,5,#Beautiful,Mariah Carey Featuring Miguel,2013-05-25,ah ah beautiful ah ah beautiful hop back bike ...,2010
3,6,#SELFIE,The Chainsmokers,2014-03-15,jason table kept seeing look girl think make j...,2010
4,7,#thatPOWER,will.i.am Featuring Justin Bieber,2013-04-06,oh alive alive alive oh fly fly fly oh alive a...,2010


In [36]:
# # What song has the lowest sentiment?
# (df.sort_values(by = ['sentiment'], ascending = True).head(5))

In [None]:
# # What song has the highest sentiment?
# df.sort_values(by = ['sentiment'], ascending = False).head(5)

In [None]:
# # Average sentiment per decade?
# df.groupby(['decade'])['sentiment'].mean()

## Topic Modeling

#### Latent Dirichlet Allocation:

In [18]:
# Create an instance
cv = CountVectorizer(max_df = .95, min_df = 2, stop_words = 'english')

# Fit and transform the lemmatized lyrics data
cv_fit = cv.fit_transform(df.lyrics)

print('\nShape of the sparse matrix\n')
cv_fit


Shape of the sparse matrix



<22210x24547 sparse matrix of type '<class 'numpy.int64'>'
	with 1263109 stored elements in Compressed Sparse Row format>

In [37]:
# Create the instance for LDA
lda = LatentDirichletAllocation(n_components = 20, random_state = 42)

# Fit the vectorizer with the LDA
lda.fit(cv_fit)

print('Number of topics:', len(lda.components_))
print('Number of columns of the LDA fit', len(lda.components_[0]))

Number of topics: 20
Number of columns of the LDA fit 24547


In [38]:
feature = cv.get_feature_names()

print('Length of feature names:', len(feature))

Length of feature names: 24547


In [39]:
for ind, topic in enumerate(lda.components_):
    print('Top 50 words in topic {}'.format(ind))
    print('-'*25)
    top_50 = topic.argsort()[-50:]
    print([feature[i] for i in top_50], '\n\n')

Top 50 words in topic 0
-------------------------
['understand', 'hurt', 'treat', 'man', 'good', 'yeah', 'mean', 'di', 'hand', 'bad', 'talk', 'look', 'inside', 'believe', 'try', 'boy', 'heart', 'hold', 'oh', 'friend', 'care', 'mind', 'time', 'feeling', 'lie', 'true', 'right', 'girl', 'come', 'better', 'fool', 'baby', 'touch', 'thing', 'real', 'let', 'somebody', 'make', 'think', 'way', 'really', 'got', 'like', 'say', 'love', 'feel', 'tell', 'need', 'want', 'know'] 


Top 50 words in topic 1
-------------------------
['easy', 'live', 'dream', 'word', 'start', 'feeling', 'goodbye', 'darling', 'wait', 'eye', 'need', 'kiss', 'lose', 'away', 'arm', 'come', 'think', 'hurt', 'believe', 'fall', 'hard', 'long', 'mind', 'thing', 'want', 'change', 'loving', 'break', 'true', 'feel', 'hold', 'like', 'day', 'oh', 'try', 'forever', 'baby', 'life', 'stay', 'right', 'got', 'say', 'going', 'let', 'way', 'know', 'make', 'time', 'heart', 'love'] 


Top 50 words in topic 2
-------------------------
['loving

In [40]:
import copy
# Final df transforming cv_fit
df_final = lda.transform(cv_fit)

# Make copy to save original df 
df_new = copy.deepcopy(df)

In [41]:
print('\nChecking the probability distribution of one text data belonging to the topic.\n')

print('Few words from 1st row:', df.lyrics[0][:88], '\n')

print('Probability distribution:', df_final[0])



Checking the probability distribution of one text data belonging to the topic.

Few words from 1st row: uh uh uh got bring attention dirty better watch talkin bout runnin mouth like know gon f 

Probability distribution: [1.68918923e-04 1.68918923e-04 1.36420460e-01 2.90691474e-01
 1.68918922e-04 3.55899462e-02 1.68918922e-04 2.81193071e-01
 1.68918923e-04 1.68918923e-04 1.68918923e-04 1.68918923e-04
 5.21967981e-02 2.22907729e-02 6.18119775e-02 1.68918920e-04
 1.17778473e-01 1.68918924e-04 1.68918923e-04 1.68918924e-04]


In [34]:
prob = df_final[0][df_final[0].argmax()].round(2)

print('Document belong to the topic', df_final[0].argmax(), 'with the probability of', prob)

Document belong to the topic 3 with the probability of 0.39


In [35]:
df['topic'] = df_final.argmax(axis = 1)

df_new.head()

Unnamed: 0.1,Unnamed: 0,title,artist,date,lyrics,decade,topic
0,2,#1,Nelly,2001-10-20,uh uh uh got bring attention dirty better watc...,2000,3
1,4,#9 Dream,John Lennon,1974-12-21,long ago dream dream know yes know seemed real...,1970,2
2,5,#Beautiful,Mariah Carey Featuring Miguel,2013-05-25,ah ah beautiful ah ah beautiful hop back bike ...,2010,2
3,6,#SELFIE,The Chainsmokers,2014-03-15,jason table kept seeing look girl think make j...,2010,3
4,7,#thatPOWER,will.i.am Featuring Justin Bieber,2013-04-06,oh alive alive alive oh fly fly fly oh alive a...,2010,1


In [17]:
# creating a dictionary with key as topic numbers and value as topic names
topic_label = {0:, 1:, 2:, 3:, 4:, }

# mapping the dictionary with the dataframe to get the labels.
df_new['topic_name'] = df_new['topic'].map(topic_label)

# head of the dataframe
df_new.head(3)

SyntaxError: invalid syntax (2791015135.py, line 2)

### Other methods using Scikit-Learn

### Sentiment