In [55]:
"""Manejo de Informacion"""

import pandas as pd
from pandas import json_normalize
import requests
import json

"""Tiempo"""

from datetime import datetime
from datetime import timezone

"""Textos"""

import re 
from unidecode import unidecode
import nltk
from nltk.probability import FreqDist

"""Visualizaciones"""

import matplotlib.pyplot as plt
import seaborn as sns

"""ML"""

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Globally silence pandas' chained-assignment (SettingWithCopy) warning.
pd.set_option("mode.chained_assignment", None)

# Datos

In [8]:
# Load both exports with the same pipeline: keep only the song name and its
# lyrics, drop rows with missing values, and renumber the index from 0.
df1, df2 = (
    pd.read_csv(csv_path, usecols=["name", "lyrics"]).dropna().reset_index(drop=True)
    for csv_path in ("top_david_spotify.csv", "top_javier_spotify.csv")
)

# Limpiar datos

In [50]:
def CSV_transform(df):
    """
    Clean the ``lyrics`` column of a songs DataFrame.

    Despite its name, this function returns a DataFrame (not a dictionary).
    The cleaning steps are applied in this order:

    1. Drop rows whose ``lyrics`` value is exactly the string ``"error"``
       (songs for which no lyrics are available).
    2. Replace carriage returns and line feeds with a single space.
    3. Remove the characters ``: ; , . ¡ ! ¿ ? ( ) - "`` and the digits 0-9.
    4. Lower-case the text.
    5. Transliterate the text with ``unidecode``.
    6. Collapse every run of whitespace into one space and strip both ends.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``lyrics``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned ``lyrics``. Index labels are kept as
        they were in ``df`` (no reset), so the labels of dropped rows are
        simply absent.
    """
    # Work on an explicit copy so the column assignment below never writes
    # into a view of the caller's frame (and never needs the warning silenced).
    cleaned = df[df["lyrics"] != "error"].copy()

    cleaned["lyrics"] = (
        cleaned["lyrics"]
        .str.replace(r"[\r\n]", " ", regex=True)
        # regex=True must be explicit: since pandas 2.0 the default is a
        # literal match, which would leave all punctuation in place.
        .str.replace(r'[:;,.¡!¿?()\-"0-9]', "", regex=True)
        .str.lower()
        # Transliterate, then normalise whitespace in a single pass.
        .apply(lambda text: " ".join(unidecode(text).split()))
    )

    return cleaned

# Naive Bayes Classifier para encontrar idioma

Entrenar un algoritmo de clasificación para distinguir entre 4 lenguas: español, inglés, francés y portugués. Se utilizará el algoritmo visto en clase, pero con la implementación de Sklearn.

[El conjunto de datos etiquetado](https://www.kaggle.com/datasets/basilb2s/language-detection) fue extraído de Kaggle para facilitar el etiquetado. Contiene textos en los siguientes 17 idiomas:

1) English
2) Malayalam
3) Hindi
4) Tamil
5) Kannada
6) French
7) Spanish
8) Portuguese
9) Italian
10) Russian
11) Sweedish
12) Dutch
13) Arabic
14) Turkish
15) German
16) Danish
17) Greek

__NOTA__: Dado que la longitud de las canciones no es tan extensa, no se aplicará ningún tipo de stemming. Tampoco considero necesario aplicar la técnica de los bigramas. 



In [56]:
# Language-detection dataset (see the Kaggle link in the section text above).
lenguajes = pd.read_csv(filepath_or_buffer="Language Detection.csv")

In [58]:
vectorizer = CountVectorizer()
# Preview only the first few texts; dumping the whole corpus as a Python list
# floods the cell output and bloats the saved notebook.
lenguajes["Text"].head(10).to_list()

[' Nature, in the broadest sense, is the natural, physical, material world or universe.',
 '"Nature" can refer to the phenomena of the physical world, and also to life in general.',
 'The study of nature is a large, if not the only, part of science.',
 'Although humans are part of nature, human activity is often understood as a separate category from other natural phenomena.',
 '[1] The word nature is borrowed from the Old French nature and is derived from the Latin word natura, or "essential qualities, innate disposition", and in ancient times, literally meant "birth".',
 '[2] In ancient philosophy, natura is mostly used as the Latin translation of the Greek word physis (φύσις), which originally related to the intrinsic characteristics that plants, animals, and other features of the world develop of their own accord.',
 '[3][4] \nThe concept of nature as a whole, the physical universe, is one of several expansions of the original notion;[1] it began with certain core applications of t

### Conjuntos de datos

---

In [51]:
# Display the cleaned version of df2 (df2 itself is left untouched).
df2.pipe(CSV_transform)

Unnamed: 0,name,lyrics
0,Mai,paroles de la chanson mai par videoclub [matth...
1,Roi,t'en trouvera d'autres des mecs comme moi il y...
2,Take It Or Leave It,paroles de la chanson take it or leave it par ...
3,The Modern Age,up on a hill here's where we begin this little...
4,Procura,procura seducirme muy despacio y no reparo tod...
5,The End Has No End,paroles de la chanson the end has no end par t...
6,Bonita - Remix,dayme & el high this is the remix ella se tard...
7,Ruby,let it never be said that romance is dead cos'...
8,Stop The World I Wanna Get Off With You,open sesame we've places to go we've people to...
9,summer depression,depression teenage suicide why girls emotional...


In [48]:
# CSV_transform does not add a "tokens" column, so build the set of unique
# tokens for the song at index label 1 directly from its cleaned lyrics.
set(nltk.word_tokenize(CSV_transform(df2).lyrics[1]))

{'a',
 'abimes',
 'aime',
 'ame',
 'au',
 'aura',
 'avec',
 'bas',
 'beige',
 'bouche',
 'bouches',
 'boucles',
 'bout',
 'brunes',
 'ces',
 'cherche',
 'coeur',
 'comme',
 'corps',
 "d'autres",
 'dans',
 'de',
 'des',
 'deteste',
 'disparates',
 'divague',
 'ecarlate',
 'ecorchant',
 'en',
 'ensevelis',
 'errant',
 'es',
 'et',
 'femme',
 'file',
 'filles',
 'fixe',
 'fleurs',
 'fumee',
 'fuyant',
 'garcon',
 'gars',
 'gens',
 'gout',
 'gresillent',
 'iconique',
 'il',
 "j'erre",
 "j'me",
 "j'parcours",
 'je',
 'jours',
 "l'aube",
 "l'esprit",
 'la',
 'laisse',
 'larmes',
 'le',
 'les',
 'leur',
 'leurs',
 'levres',
 'lointains',
 'lyrique',
 'ma',
 'mais',
 'me',
 'mecs',
 'mes',
 'mievre',
 'moi',
 'mon',
 "n'aime",
 "n'est",
 'neige',
 'nuages',
 'nuit',
 'nymphe',
 'ombre',
 'ou',
 'parcours',
 'peau',
 'pensees',
 'plane',
 'plein',
 'pleut',
 'pluie',
 'point',
 'pour',
 'quand',
 'que',
 'qui',
 'regardes',
 'resilles',
 'reste',
 'reveille',
 'reves',
 'rires',
 'roi',
 'rose'

In [47]:
# Show the cleaned lyrics of the song stored under index label 5.
df2.pipe(CSV_transform).loc[5, "lyrics"]

"paroles de la chanson the end has no end par the strokes one by one ticking time bombs won it's not the secret of the government that's keeping you dumb oh it's the other way around wait what's that sound one by one baby here they come he wants it easy he want it relaxed said i can do alot of things but i can't do that two steps forward then three steps back alright won't you take a walk outside oh no can't you find some other guy oh no what's that sound oh no keeping down the underground oh no the end has no end the end has no end the end has no end the end has no end he want it easy he want it relaxed said i can do alot of things but i can't do that two steps forward then three steps back it won't be easy won't you take a walk outside oh no can't you find another guy oh no what's that sound oh no keeping down the underground oh no the end has no end the end has no end the end has no end the end has no end the end has no end the end has no end the end has no end the end has no end"

In [None]:
# Tokenize every song's lyrics into a list of tokens.
# NOTE(review): `songs` is not defined in the visible cells — confirm it is
# created before this cell runs.
songs["tokens"] = songs["lyrics"].apply(nltk.word_tokenize)

---

In [26]:
# Inline cleaning of df1, mirroring CSV_transform except that digits are kept.
# Copy after filtering so the column assignment never targets a view.
df1 = df1[df1["lyrics"] != "error"].copy()

df1["lyrics"] = (
    df1["lyrics"]
    .str.replace(r"[\r\n]", " ", regex=True)
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would leave all punctuation in place.
    .str.replace(r'[:;,.¡!¿?()\-"]', "", regex=True)
    .str.lower()
    # Transliterate with unidecode, then collapse repeated whitespace.
    .apply(lambda text: " ".join(unidecode(text).split()))
)