# Felipe Veloso Inferencia de tópicos con EM

## Ejercicio 1: Preparar el ambiente de trabajo

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import glob
import os

warnings.simplefilter('ignore')
plt.rcParams['figure.figsize'] = (15,12)
plt.style.use('seaborn')

In [2]:
# !ls dump/
file_list = glob.glob(os.getcwd() + '/dump/*.csv')

In [20]:
parse_csv_to_df = lambda x: pd.read_csv(x,index_col=None, header=0).drop(columns='Unnamed: 0')
df_lyrics = pd.concat([parse_csv_to_df(i) for i in file_list])
df_lyrics = ['artist','style','song','lyrics']

In [3]:
append_csv = []
for i in file_list:
    append_csv.append(
        pd.read_csv(i, index_col=None, header=0).drop(columns='Unnamed: 0'))

In [8]:
df = pd.concat(append_csv)
df.columns = ['artist','style','song','lyrics']

In [9]:
df.head()

Unnamed: 0,artist,style,song,lyrics
0,Public Enemy,hiphop,You're Gonna Get Yours,"(Flavor Flav) \n Oh-oh Chuck, they out to get ..."
1,Public Enemy,hiphop,Sophisticated Bitch,"That woman in the corner, cold playin' the rol..."
2,Public Enemy,hiphop,Miuzi Weighs A Ton,"Yo Chuck, run a power move on them \n (Yeah) \..."
3,Public Enemy,hiphop,Timebomb,"(Intro - Flavor Flav) \n Hey, Chuck, we got so..."
4,Public Enemy,hiphop,Too Much Posse,(Intro - Flavor Flav) \n What do you got to sa...


## Ejercicio 2: Matriz de ocurrencias

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
# instanciamos un objeto
count_vectorizer=CountVectorizer(stop_words='english', max_features=5000)
# Implementamos los pasos fit y transform
count_vectorizer_fit = count_vectorizer.fit_transform(df.lyrics)
# Extraemos tokens (palabras)
words = count_vectorizer.get_feature_names()
# extraemos frecuencia
words_freq = count_vectorizer_fit.toarray().sum(axis=0)

df_words = pd.DataFrame({'Word':words, 'Frec':words_freq})

top5000 = df_words.sort_values(by='Frec', ascending=False)

In [32]:
top5000

Unnamed: 0,Word,Frec
2483,like,19629
1241,don,17398
2362,know,14962
1847,got,14171
2299,just,13978
2565,love,11268
4971,yeah,11071
2519,ll,10028
2975,oh,9879
656,cause,8356


## Ejercicio 3: Entrenamiento del Modelo

In [34]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

search_params = {'n_components': [5,10,15],
                'learning_decay': [0.5,0,7]}

cv_lda_model = GridSearchCV(LatentDirichletAllocation(learning_method='online'), param_grid=search_params).fit(count_vectorizer_fit)

In [45]:
best_lda = cv_lda_model.best_estimator_
best_lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.5,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [42]:
fit_best_lda = cv_lda_model.transform(count_vectorizer_fit)

In [39]:
np.round(fit_best_lda[:10],3)

array([[0.096, 0.001, 0.087, 0.242, 0.573],
       [0.072, 0.001, 0.006, 0.333, 0.588],
       [0.156, 0.097, 0.054, 0.066, 0.628],
       [0.052, 0.261, 0.001, 0.113, 0.573],
       [0.092, 0.067, 0.002, 0.231, 0.607],
       [0.253, 0.068, 0.001, 0.434, 0.244],
       [0.08 , 0.076, 0.007, 0.145, 0.691],
       [0.156, 0.214, 0.05 , 0.066, 0.514],
       [0.038, 0.089, 0.002, 0.212, 0.659],
       [0.274, 0.088, 0.001, 0.15 , 0.488]])

In [50]:
for topic_id, topic_name in enumerate(best_lda.components_):
    print(f"Topico:  {topic_id+1}")
    concant_words_in_topic = [count_vectorizer.get_feature_names()[i] for i in topic_name.argsort()[:-15 -1 : -1]]
    print("  ".join(concant_words_in_topic))

Topico:  1
life  god  world  eyes  death  dead  die  light  blood  time  soul  black  hell  man  sun
Topico:  2
yeah  oh  hey  come  baby  got  let  rock  ba  la  like  stop  ha  mos  right
Topico:  3
ah  da  ma  em  song  jump  mama  moment  lyrics  page  future  pos  uh  able  hah
Topico:  4
love  don  know  just  ll  ve  like  oh  got  time  want  say  baby  way  make
Topico:  5
like  got  don  shit  ain  yo  man  cause  know  just  fuck  nigga  em  niggas  ya


In [54]:
fit_best_lda = best_lda.transform(count_vectorizer_fit)
topics_for_each_doc = pd.DataFrame(np.round(fit_best_lda,3),
                                  index = df.index)
topics_for_each_doc.columns = list(map(lambda x: f"Topico {x}", range(1, best_lda.n_components + 1)))

In [53]:
topics_for_each_doc.head()

Unnamed: 0,0,1,2,3,4
0,0.096,0.001,0.087,0.242,0.573
1,0.072,0.001,0.006,0.333,0.588
2,0.156,0.097,0.054,0.066,0.628
3,0.052,0.261,0.001,0.113,0.573
4,0.092,0.067,0.002,0.231,0.607


In [56]:
topics_for_each_doc.head()

Unnamed: 0,Topico 1,Topico 2,Topico 3,Topico 4,Topico 5
0,0.096,0.001,0.087,0.242,0.573
1,0.072,0.001,0.006,0.333,0.588
2,0.156,0.097,0.054,0.066,0.628
3,0.052,0.261,0.001,0.113,0.573
4,0.092,0.067,0.002,0.231,0.607


In [58]:
concatenated_df = pd.concat([df, topics_for_each_doc], axis=1)

In [60]:
concatenated_df['highest_topic'] = np.argmax(topics_for_each_doc.values, axis=1) + 1

In [64]:
concatenated_df.head(10)

Unnamed: 0,artist,style,song,lyrics,Topico 1,Topico 2,Topico 3,Topico 4,Topico 5,highest_topic
0,Public Enemy,hiphop,You're Gonna Get Yours,"(Flavor Flav) \n Oh-oh Chuck, they out to get ...",0.096,0.001,0.087,0.242,0.573,5
1,Public Enemy,hiphop,Sophisticated Bitch,"That woman in the corner, cold playin' the rol...",0.072,0.001,0.006,0.333,0.588,5
2,Public Enemy,hiphop,Miuzi Weighs A Ton,"Yo Chuck, run a power move on them \n (Yeah) \...",0.156,0.097,0.054,0.066,0.628,5
3,Public Enemy,hiphop,Timebomb,"(Intro - Flavor Flav) \n Hey, Chuck, we got so...",0.052,0.261,0.001,0.113,0.573,5
4,Public Enemy,hiphop,Too Much Posse,(Intro - Flavor Flav) \n What do you got to sa...,0.092,0.067,0.002,0.231,0.607,5
5,Public Enemy,hiphop,Rightstarter (Message To A Black Man),"Mind over matter, mouth in motion \n Can't def...",0.253,0.068,0.001,0.434,0.244,4
6,Public Enemy,hiphop,Public Enemy No. 1,"(Intro: Flavor Flav) \n Yo Chuck, bust a move ...",0.08,0.076,0.007,0.145,0.691,5
7,Public Enemy,hiphop,M.P.E.,I'm cold gettin' busy while I'm shakin' you do...,0.156,0.214,0.05,0.066,0.514,5
8,Public Enemy,hiphop,Yo! Bum Rush The Show,"I am taking no prisoners, taking no shorts \n ...",0.038,0.089,0.002,0.212,0.659,5
9,Public Enemy,hiphop,Raise The Roof,"Raise your hands, so we can \n Raise the roof,...",0.274,0.088,0.001,0.15,0.488,5


In [72]:
corr = concatenated_df.iloc['Topico 1','Topico 2','Topico 3','Topico 4','Topico 5']

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

In [70]:
corr.corr

NameError: name 'corr' is not defined