# EXISTING: Data Analysis
- Will be making 3 data frames: Male Rapper Lyrics, Female Rapper Lyrics, and All Lyrics


In [1]:
# import libraries
import pandas as pd
import os, json
from glob import glob

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Trying out DataFrames with Small Dataset
- I used the `Lyrics_2Pac.json` to do this inital exploration
- The following code chunks deal with creating this initial dataframe `pac_lyrics_df` with the 2Pac information
- It includes finding out what types of object get formed in the process

In [2]:
# Trying out doing a data frame with one of the male artists: "2Pac"
m_path = './rap_lyrics/male_lyrics/'    # path with the male artists
# Pass the JSON into a dictionary
with open(m_path+'Lyrics_2Pac.json') as json_file:
    data = json.load(json_file)
# confirming that it is a dictionary type object   
type(data)    # It is!
# print out the keys to see where the song lyrics may be held
data.keys()   # It should be in 'songs'

dict

dict_keys(['alternate_names', 'api_path', 'description', 'facebook_name', 'followers_count', 'header_image_url', 'id', 'image_url', 'instagram_name', 'is_meme_verified', 'is_verified', 'name', 'translation_artist', 'twitter_name', 'url', 'current_user_metadata', 'description_annotation', 'user', 'songs'])

In [3]:
# build a data frame from dictionary using pd.DataFrame.from_dict
pac_lyrics_df = pd.DataFrame.from_dict(data['songs'])
# data frame should have the columns 'artist', 'title' (for song title), and 'lyrics'
pac_lyrics_df = pac_lyrics_df[['artist', 'title', 'lyrics']]
# did it populate correctly?
pac_lyrics_df.head() # it did!

Unnamed: 0,artist,title,lyrics
0,2Pac,16 on Death Row,Death Row\nThat's where mothafuckas is endin' ...
1,2Pac,1995 Police Station Testimony,"Woman – Sir, will you raise your right hand, p..."
2,2Pac,1 for April,2 me your name alone is poetry\nI barely know ...
3,2Pac,1st impression,Just when I thought I'd seen it all\nour paths...
4,2Pac,1st Impressions: 4 Irene,Just when I thought I'd seen it all\nour paths...


## Now let's try it with the full male artist directory
- The following code chunk populates the official male lyrics dataframe (`mlyrics_df`) with the data from the 10 males artists:
    - J.Cole, Jay-Z, Kanye West, The Notorious B.I.G., Kendrick Lamar, Lil Wayne, Snoop Dogg, Nas, Drake, 2Pac
    - This code uses the code from the previous chunks

In [77]:
# empty df for male lyrics with the column titles: artist, title, lyrics
mlyrics_df = pd.DataFrame(columns=['artist', 'title', 'lyrics'])

# for loop to populate mlyrics_df
for filename in [file for file in os.listdir(m_path) if file.endswith('.json')]:
    # prints out a list of the filenames in the directory
    print(filename)
    # read in each filename and load it
    with open(m_path + filename) as json_file:
        data = json.load(json_file)
        # populate a temp_df with necessary info
        temp_df = pd.DataFrame.from_dict(data['songs'])
        temp_df = temp_df[['artist', 'title', 'lyrics']]
        # concatenate temp_df to mlyrics_df
        mlyrics_df = pd.concat([mlyrics_df,temp_df])
        # change indexing from 0-99 for each artist to 0-999 for whole dataset
        mlyrics_df = mlyrics_df.reset_index(drop=True)
mlyrics_df

# making sure the data and the data frame are the correct types
type(data)
type(mlyrics_df)
# these are the right data types!

Lyrics_J.Cole.json
Lyrics_JAYZ.json
Lyrics_KanyeWest.json
Lyrics_TheNotoriousB.I.G..json
Lyrics_KendrickLamar.json
Lyrics_LilWayne.json
Lyrics_SnoopDogg.json
Lyrics_Nas.json
Lyrics_Drake.json
Lyrics_2Pac.json


Unnamed: 0,artist,title,lyrics
0,J. Cole,03' Adolescence,La la la\nLa la la la la\nLa la la\nLa la la l...
1,J. Cole,102.1 Jamz Freestyle,For all y’all boys cheap talking\nKeep walking...
2,J. Cole,102 Jamz Freestyle,For all y’all boys cheap talking\nKeep walking...
3,J. Cole,1985,"1985, I arrived\n33 years, damn, I'm grateful ..."
4,J. Cole,2012,"Yes, straight out the Ville and I'm blessed\nN..."
...,...,...,...
995,2Pac,Flex,"Flex, flex flex\nFlex, flex flex\n\nSlippin' t..."
996,2Pac,Forever And Today,U say that u'll love me forever but what about...
997,2Pac,For Mrs. Hawkins (In Memory of Yusef Hawkins),This poem is addressed 2 Mrs. Hawkins\nwho los...
998,2Pac,Fortune & Fame,"And my niggas say, we want the fame!\nC'mon\n\..."


dict

pandas.core.frame.DataFrame

In [22]:
# 10 artists, 1000 values, 979 unique titles, 993 unique lyrics
    # so there are some duplicates with the titles and the lyrics
mlyrics_df.describe()
# Lil Wayne is the most common artist...why?...all artists have the same number of songs...
# Anything is the most common song title

Unnamed: 0,artist,title,lyrics
count,1000,1000,1000.0
unique,10,979,993.0
top,Lil Wayne,Anything,
freq,100,3,4.0


In [23]:
# 1000 for each thing column. Good!
mlyrics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  1000 non-null   object
 1   title   1000 non-null   object
 2   lyrics  1000 non-null   object
dtypes: object(3)
memory usage: 23.6+ KB


In [24]:
# Every artist has their 100 lyrics as expected
mlyrics_df.artist.value_counts()

Lil Wayne               100
Nas                     100
JAY-Z                   100
Snoop Dogg              100
Kendrick Lamar          100
Kanye West              100
The Notorious B.I.G.    100
J. Cole                 100
2Pac                    100
Drake                   100
Name: artist, dtype: int64

**Observations**:
- There are some duplicates with the titles (**979** vs **1000**) / lyrics (**993** vs **1000**)
- Each artist had `0-99` index and to make things less confusing when calling specific rows, I changed the indexing with `mlyrics_df = mlyrics_df.reset_index(drop=True)`
- All of the value counts for each column checked out (**1000** entries for all)
- Every male artist has **100 lyrics/titles** as expected
- I think I would start cleaning at the song title level to get the duplicates through sampling the data.

## Now let's try it with the full female directory
- The following code chunk populates the official female lyrics dataframe (`flyrics_df`) with the data from the following 10 female artists:
    - Rico Nasty, Missy Elliott, Lil Kim, Cardi B, Remy Ma, Rapsody, Trina, Nicki Minaj, Queen Latifah
    - *Note.* There are less rows in this data frame than the `mlyrics_df` because not all artists reached the 100 songs specified in the data collection code (e.g. Remy Ma [86], Cardi B [76]) 
        - This was expected because Cardi B is a fairly new artist and Remy Ma was incarcerated for a long time and is just now getting back to music.

In [21]:
f_path = './rap_lyrics/female_lyrics/'

# empty df for male lyrics with the column titles: artist, title, lyrics
flyrics_df = pd.DataFrame(columns=['artist', 'title', 'lyrics'])

# for loop to populate mlyrics_df
for filename in [file for file in os.listdir(f_path) if file.endswith('.json')]:
    # prints out a list of the filenames in the directory
    print(filename)
    # read in each filename and load it
    with open(f_path + filename) as json_file:
        data = json.load(json_file)
        # populate a temp_df with necessary info
        temp_df = pd.DataFrame.from_dict(data['songs'])
        temp_df = temp_df[['artist', 'title', 'lyrics']]
        # concatenate temp_df to mlyrics_df
        flyrics_df = pd.concat([flyrics_df,temp_df])
        # change indexing from 0-99 for each artist to 0-999 for whole dataset
        flyrics_df = flyrics_df.reset_index(drop=True)
flyrics_df

# making sure the data and the data frame are the correct types
type(data)
type(flyrics_df)
# these are the right data types!

Lyrics_RicoNasty.json
Lyrics_MissyElliott.json
Lyrics_MeganTheeStallion.json
Lyrics_LilKim.json
Lyrics_CardiB.json
Lyrics_RemyMa.json
Lyrics_Rapsody.json
Lyrics_Trina.json
Lyrics_NickiMinaj.json
Lyrics_QueenLatifah.json


Unnamed: 0,artist,title,lyrics
0,Rico Nasty,10Fo,"Smoov, what's good, baby? (Woo)\nWake up F1LTH..."
1,Rico Nasty,Animal,"I'm a bear, you a mother fuckin' reindeer\nWhe..."
2,Rico Nasty,Ar-15,Pointing red lasers on you\nDo you need a head...
3,Rico Nasty,Arenas,Yeah-yeah-yeah-yeah\n\nI can't wait till I sel...
4,Rico Nasty,Back & Forth,CashMoneyAP\n\nI said I'm back in this bitch\n...
...,...,...,...
957,Queen Latifah,The World,"The world, oh, oh, oh, the world\nThe world, o..."
958,Queen Latifah,Trav’lin’ Light,I'm trav'lin' light\nBecause my man has gone\n...
959,Queen Latifah,Turn You On,Did I make you hot? Tell me\nI didn't mean to ...
960,Queen Latifah,U.N.I.T.Y.,"Uh, U.N.I.T.Y., U.N.I.T.Y. that's a unity\nU.N..."


dict

pandas.core.frame.DataFrame

In [9]:
# 10 artists, 962 values, 950 unique song titles, 952 unique song lyrics
    # there are dupicates here
# Nicki Minaj is the most common artist
# Crazy is the most common song title...must look into this...
flyrics_df.describe()

Unnamed: 0,artist,title,lyrics
count,962,962,962.0
unique,10,950,952.0
top,Trina,Crazy,
freq,100,3,11.0


In [10]:
# 962 values in each column as expected!
    # REMEMBER: Cardi B (76), Remy Ma (86)
flyrics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 962 entries, 0 to 99
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  962 non-null    object
 1   title   962 non-null    object
 2   lyrics  962 non-null    object
dtypes: object(3)
memory usage: 30.1+ KB


In [11]:
# all values are as expected 
flyrics_df.artist.value_counts()

Trina                  100
Megan Thee Stallion    100
Missy Elliott          100
Queen Latifah          100
Lil’ Kim               100
Rapsody                100
Rico Nasty             100
Nicki Minaj            100
Remy Ma                 86
Cardi B                 76
Name: artist, dtype: int64

## There are some issues...let's clean
- I would like to take out any rows that have titles that have Skit or Interlude in them
    - Will explore the data frames and see what problems are 
    - I'm seeing some duplicates that I will need to delete
    - Also some newline characters (definitely delete) and name headers that show that they are other artists featured in the work but not dictated in the song titles that I may/may not remove
        - My initial reaction is that I may not need to but my conclusions will have to be based upon the overall songs rather than the artists.

### CLEANING `mlyrics_df` 

In [67]:
mlyrics_df.head()
mlyrics_df.tail()

Unnamed: 0,artist,title,lyrics
0,J. Cole,03' Adolescence,La la la\nLa la la la la\nLa la la\nLa la la l...
1,J. Cole,102.1 Jamz Freestyle,For all y’all boys cheap talking\nKeep walking...
2,J. Cole,102 Jamz Freestyle,For all y’all boys cheap talking\nKeep walking...
3,J. Cole,1985,"1985, I arrived\n33 years, damn, I'm grateful ..."
4,J. Cole,2012,"Yes, straight out the Ville and I'm blessed\nN..."


Unnamed: 0,artist,title,lyrics
995,2Pac,Flex,"Flex, flex flex\nFlex, flex flex\n\nSlippin' t..."
996,2Pac,Forever And Today,U say that u'll love me forever but what about...
997,2Pac,For Mrs. Hawkins (In Memory of Yusef Hawkins),This poem is addressed 2 Mrs. Hawkins\nwho los...
998,2Pac,Fortune & Fame,"And my niggas say, we want the fame!\nC'mon\n\..."
999,2Pac,Friends,"I want to be, yo, let me fuck that nigga down\..."


In [70]:
# There are some quotation marks that aren't needed
# newline characters should be removed

mlyrics_df.sample(10)

# rows with (Davey D) need thrown out from 2Pac; these are interviews
# Blackface Response from Drake needs thrown out...not a song
# A3pac rap from 2Pac needs thrown out just a link for lyrics
# DJ Khaled's Son by Kanye needs scrapped. Just a short snippet

Unnamed: 0,artist,title,lyrics
922,2Pac,Am I Next,They wanna bury me I'm worried\nI'm losin' my ...
522,Lil Wayne,Action,Go\nI'm in love with a beauty\nShe is a cutie\...
664,Snoop Dogg,Brake Fluid (Biiittch Pump Yo Brakes),Woman: Ya baby\nSnoop: Ya playa\nWoman: Ya bab...
431,Kendrick Lamar,Black Friday,Dick hard like rottweiler\nCan you handle it? ...
627,Snoop Dogg,Around The World,"Around the world, around the world\nAround the..."
328,The Notorious B.I.G.,Fuck Me (Interlude),(Jodeci song playing in the background and sou...
310,The Notorious B.I.G.,Biggie Smalls Got That Hype Shit,"""Well, who is 'Biggie Smalls'?""\n\nNotorious B..."
898,Drake,Extra Special,Yeah. Ho!\nIt's Drake. Uh Uh. Ho!\nDo that dan...
481,Kendrick Lamar,For the Girlfriends,Abazaba\nYou know we get in the studio and do ...
543,Lil Wayne,A Message to the DJs,Yo yo public service anouncment man\nIt's your...


In [36]:
# Let's check out what's up with Anything since it was the most frequent
mlyrics_df[mlyrics_df.title=='Anything']
# JAY-Z, Kanye and Lil Wayne all have a song called 'Anything'

Unnamed: 0,artist,title,lyrics
132,JAY-Z,Anything,"Uh huh yea, yeah\nDuro!\nYou gotta let it bump..."
226,Kanye West,Anything,I mean wow. You know? Man\n\nLookin' out my lo...
546,Lil Wayne,Anything,"I'd risk everything\nFor one kiss, everything\..."


In [53]:
# So there are some song titles with weird titles I saw through sampling
mlyrics_df[mlyrics_df.title=='.']
mlyrics_df[mlyrics_df.title=='E']

Unnamed: 0,artist,title,lyrics
200,Kanye West,.,.
800,Drake,.,.


Unnamed: 0,artist,title,lyrics
199,JAY-Z,E,E


In [78]:
# drop 904, 982, 908, 840, 294, 1, 200, 800, 199, 187, 297, 797, 175, 64, 328 
mlyrics_df = mlyrics_df.drop([904,982,908,840,294,1,200,800,199,187,297,797,175,64,328])
mlyrics_df

Unnamed: 0,artist,title,lyrics
0,J. Cole,03' Adolescence,La la la\nLa la la la la\nLa la la\nLa la la l...
2,J. Cole,102 Jamz Freestyle,For all y’all boys cheap talking\nKeep walking...
3,J. Cole,1985,"1985, I arrived\n33 years, damn, I'm grateful ..."
4,J. Cole,2012,"Yes, straight out the Ville and I'm blessed\nN..."
5,J. Cole,2Face,"Hey, I got a dollar and a dream\nIt's all a ni..."
...,...,...,...
995,2Pac,Flex,"Flex, flex flex\nFlex, flex flex\n\nSlippin' t..."
996,2Pac,Forever And Today,U say that u'll love me forever but what about...
997,2Pac,For Mrs. Hawkins (In Memory of Yusef Hawkins),This poem is addressed 2 Mrs. Hawkins\nwho los...
998,2Pac,Fortune & Fame,"And my niggas say, we want the fame!\nC'mon\n\..."


In [83]:
# sanity check for duplicats in the 'lyrics' column
boolean = mlyrics_df.duplicated(subset=['lyrics']).any()
boolean # there are duplicates as expected

# drop_duplicates on the lyrics column
mlyrics_df = mlyrics_df.drop_duplicates(subset=['lyrics']) 
mlyrics_df

False

Unnamed: 0,artist,title,lyrics
0,J. Cole,03' Adolescence,La la la\nLa la la la la\nLa la la\nLa la la l...
2,J. Cole,102 Jamz Freestyle,For all y’all boys cheap talking\nKeep walking...
3,J. Cole,1985,"1985, I arrived\n33 years, damn, I'm grateful ..."
4,J. Cole,2012,"Yes, straight out the Ville and I'm blessed\nN..."
5,J. Cole,2Face,"Hey, I got a dollar and a dream\nIt's all a ni..."
...,...,...,...
995,2Pac,Flex,"Flex, flex flex\nFlex, flex flex\n\nSlippin' t..."
996,2Pac,Forever And Today,U say that u'll love me forever but what about...
997,2Pac,For Mrs. Hawkins (In Memory of Yusef Hawkins),This poem is addressed 2 Mrs. Hawkins\nwho los...
998,2Pac,Fortune & Fame,"And my niggas say, we want the fame!\nC'mon\n\..."


In [84]:
# 10 artists, 982 values, 963 unique titles, 982 unique lyrics
mlyrics_df.describe()
# duplicates and specific rows have been dropped

Unnamed: 0,artist,title,lyrics
count,982,982,982.0
unique,10,963,982.0
top,Lil Wayne,Anything,
freq,100,3,1.0


In [85]:
# let's check the distribution of data for each artist now
mlyrics_df.artist.value_counts()

# Lil Wayne is only artist with 100
# Nas, Snoop Dogg, Kendrick Lamar, The Notorious B.I.G. with 99
# J. Cole with 98
# JAY-Z, Kanye West, 2Pac, Drake with 97

Lil Wayne               100
Nas                      99
Snoop Dogg               99
Kendrick Lamar           99
The Notorious B.I.G.     99
J. Cole                  98
JAY-Z                    97
Kanye West               97
2Pac                     97
Drake                    97
Name: artist, dtype: int64

*I am hesitant to drop duplicates for song titles because there could be repeat title names for different artists and I would be dropping out this data unnecessarily. I think I will have to go through each artist individually and note which ones to drop in particular.*
>this might be the best way to go because there are some non-lyrical things in there that I will need to filter out as well.

**Observations with cleaning `mlyrics_df`**:
- Through sampling the dataframe, I was able to find several problems
    - certain rows were not actual songs 
        - lyrics were empty, contained a link, contained just a random character (e.g. '.'), there were also interviews or short snippets of songs
        - I verified snippet/interview from looking up the 'song' on Genius.com
    - there were duplicate lyrics (similar titles)
        - These rows have been dropped 
- `mlyrics_df` now contains the following data set up:
    - 10 artists, 982 values, 963 unique titles, 982 unique lyrics
    - **Lil Wayne** is only artist with 100
    - **Nas, Snoop Dogg, Kendrick Lamar, The Notorious B.I.G.** with 99  
    - **J. Cole** with 98 **JAY-Z, Kanye West, 2Pac, Drake** with 97
- The data is a little imbalanced but nothing to call home about.
- I still need to take out punctuation (except dashes because they can be crucial to the lyrical content) and newline characters

### CLEANING `flyrics_df` 

In [87]:
flyrics_df.head()
flyrics_df.tail()

Unnamed: 0,artist,title,lyrics
0,Rico Nasty,10Fo,"Smoov, what's good, baby? (Woo)\nWake up F1LTH..."
1,Rico Nasty,Animal,"I'm a bear, you a mother fuckin' reindeer\nWhe..."
2,Rico Nasty,Ar-15,Pointing red lasers on you\nDo you need a head...
3,Rico Nasty,Arenas,Yeah-yeah-yeah-yeah\n\nI can't wait till I sel...
4,Rico Nasty,Back & Forth,CashMoneyAP\n\nI said I'm back in this bitch\n...


Unnamed: 0,artist,title,lyrics
957,Queen Latifah,The World,"The world, oh, oh, oh, the world\nThe world, o..."
958,Queen Latifah,Trav’lin’ Light,I'm trav'lin' light\nBecause my man has gone\n...
959,Queen Latifah,Turn You On,Did I make you hot? Tell me\nI didn't mean to ...
960,Queen Latifah,U.N.I.T.Y.,"Uh, U.N.I.T.Y., U.N.I.T.Y. that's a unity\nU.N..."
961,Queen Latifah,Walk the Dinosaur (From ”Ice Age: Dawn of the ...,Boom boom acka-lacka lacka boom\nBoom boom ack...


In [112]:
# there are some section headers with curly brackets '{}'
# there is also puncuation that I won't need
# some clean versions made it through
# freestyles have a lot of parenthetical data like section headers etc.
flyrics_df.sample(10)

Unnamed: 0,artist,title,lyrics
364,Lil’ Kim,I’m Dat Bitch,Where my real bitches at?\nOnly Queen Bee can ...
200,Megan Thee Stallion,-,Lyrics already made
305,Lil’ Kim,Aunt Dot,My Aunt Dot\nLeft a Glock and some blood on my...
330,Lil’ Kim,Do What You Like,"QB, ya shit, shit is crazy, yo\nCan't fuck wit..."
935,Queen Latifah,No/Yes,Its been too long since you kiss\nMe do you wa...
229,Megan Thee Stallion,Freak Nasty,"Liljumadethisbeat\nFreak, freak, freak nasty (..."
665,Trina,Ain’t gggggggggg,"Niggas ain't shit, but hoes and tricks\nLick t..."
21,Rico Nasty,Dennis Rodman,Oh my God\nYeah-yeah-yeah\n\nTen different col...
97,Rico Nasty,Smack a Bitch,Yeah!\nYeah! (Rico)\nKenny Beats\nSugar Trap\n...
1,Rico Nasty,Animal,"I'm a bear, you a mother fuckin' reindeer\nWhe..."


In [115]:
flyrics_df[flyrics_df.title.str.contains('Clean')]

Unnamed: 0,artist,title,lyrics
414,Cardi B,Clean it Up,:\nMu-mu-murda\nBardi\nWoohoo\n:\nI thought al...
461,Cardi B,Thru Your Phone (Clean),Look\nI just want to break up all your ____\nC...
469,Cardi B,WAP (Clean),****** in this house\nThere's some ****** in t...
838,Nicki Minaj,Fly (Super Clean Edit),"I came to win, to fight, to conquer, to thrive..."


In [116]:
# drop 38, 303, 500, 532, 807, 779, 794, 424, 768, 59, 461, 638, 
# 369, 418, 401, 424, 367, 200, 249, 368, 514, 744, 119, 128, 700, 461, 469, 838 

flyrics_df = flyrics_df.drop([38,303,500,532,807,779,794,424,768,59,461,638,369,418,
                              401,424,367,200,249,368,514,744,119,128,700,461,469,838])
flyrics_df

Unnamed: 0,artist,title,lyrics
0,Rico Nasty,10Fo,"Smoov, what's good, baby? (Woo)\nWake up F1LTH..."
1,Rico Nasty,Animal,"I'm a bear, you a mother fuckin' reindeer\nWhe..."
2,Rico Nasty,Ar-15,Pointing red lasers on you\nDo you need a head...
3,Rico Nasty,Arenas,Yeah-yeah-yeah-yeah\n\nI can't wait till I sel...
4,Rico Nasty,Back & Forth,CashMoneyAP\n\nI said I'm back in this bitch\n...
...,...,...,...
957,Queen Latifah,The World,"The world, oh, oh, oh, the world\nThe world, o..."
958,Queen Latifah,Trav’lin’ Light,I'm trav'lin' light\nBecause my man has gone\n...
959,Queen Latifah,Turn You On,Did I make you hot? Tell me\nI didn't mean to ...
960,Queen Latifah,U.N.I.T.Y.,"Uh, U.N.I.T.Y., U.N.I.T.Y. that's a unity\nU.N..."


In [117]:
# sanity check for duplicats in the 'lyrics' column
boolean = flyrics_df.duplicated(subset=['lyrics']).any()
boolean # there are duplicates as expected

# drop_duplicates on the lyrics column
flyrics_df = flyrics_df.drop_duplicates(subset=['lyrics']) 
flyrics_df

True

Unnamed: 0,artist,title,lyrics
0,Rico Nasty,10Fo,"Smoov, what's good, baby? (Woo)\nWake up F1LTH..."
1,Rico Nasty,Animal,"I'm a bear, you a mother fuckin' reindeer\nWhe..."
2,Rico Nasty,Ar-15,Pointing red lasers on you\nDo you need a head...
3,Rico Nasty,Arenas,Yeah-yeah-yeah-yeah\n\nI can't wait till I sel...
4,Rico Nasty,Back & Forth,CashMoneyAP\n\nI said I'm back in this bitch\n...
...,...,...,...
957,Queen Latifah,The World,"The world, oh, oh, oh, the world\nThe world, o..."
958,Queen Latifah,Trav’lin’ Light,I'm trav'lin' light\nBecause my man has gone\n...
959,Queen Latifah,Turn You On,Did I make you hot? Tell me\nI didn't mean to ...
960,Queen Latifah,U.N.I.T.Y.,"Uh, U.N.I.T.Y., U.N.I.T.Y. that's a unity\nU.N..."


In [118]:
# 10 artists, 927 values, 916 unique titles, 927 unique lyrics
flyrics_df.describe()
# duplicates and specific rows have been dropped

Unnamed: 0,artist,title,lyrics
count,927,927,927.0
unique,10,916,927.0
top,Queen Latifah,Crazy,
freq,99,3,1.0


In [119]:
# let's check the distribution of data for each artist now
flyrics_df.artist.value_counts()

# No artists with 100
# Queen Latifah with 99
# Trina, Megan Thee Stallion, Missy Elliott, Rico Nasty with 98
# Lil' Kim with 96
# Nicki Minaj with 95
# Rapsody with 93
# Remy Ma with 82
# Cardi B with 70

Queen Latifah          99
Trina                  98
Megan Thee Stallion    98
Missy Elliott          98
Rico Nasty             98
Lil’ Kim               96
Nicki Minaj            95
Rapsody                93
Remy Ma                82
Cardi B                70
Name: artist, dtype: int64

**Observations with cleaning `flyrics_df`**:
- Through sampling the data frame, I was able to find several problems
    - certain rows were not songs or were songs with featured artists
        - there were rows with no lyrics
        - there were songs that were skits
        - there were really short song snippets
        - I verified snippet/interview from looking up the 'song' on Genius.com
    - there were duplicate lyrics (similar titles)
        - These rows have been dropped 
- `flyrics_df` now contains the following data set up:
    - 10 artists, 927 values, 916 unique titles, 927 unique lyrics
    - No artists with 100
        - **Queen Latifah** with 99
        - **Trina, Megan Thee Stallion, Missy Elliott, Rico Nasty** with 98
        - **Lil' Kim** with 96
        - **Nicki Minaj** with 95
        - **Rapsody** with 93
        - **Remy Ma** with 82
        - **Cardi B** with 70
- The data is a little imbalanced but nothing to call home about. It was already imbalanced
    - it is especially imbalanced when comparing it to `mlyrics_df`
- I still need to take out punctuation (except dashes because they can be crucial to the lyrical content) and newline characters

*I acknowledge that all of the songs taken from these artists are not all (fully) Rap songs (Queen Latifah, Missy Elliott, Nicki Miniaj)*
>    There isn't much I can do about that at this point. I will just have to be careful about generalizing the data in certain ways. This just happens to be a limitation for the way I scraped the data in the first place

## Let's combine `mlyrics_df` and `flyrics_df`
- this is for analytical/ modeling purposes

In [120]:
# will run the following code when both mlyrics_df and flyrics_df have been cleaned
all_lyrics_df = pd.concat([mlyrics_df, flyrics_df])
all_lyrics_df

Unnamed: 0,artist,title,lyrics
0,J. Cole,03' Adolescence,La la la\nLa la la la la\nLa la la\nLa la la l...
2,J. Cole,102 Jamz Freestyle,For all y’all boys cheap talking\nKeep walking...
3,J. Cole,1985,"1985, I arrived\n33 years, damn, I'm grateful ..."
4,J. Cole,2012,"Yes, straight out the Ville and I'm blessed\nN..."
5,J. Cole,2Face,"Hey, I got a dollar and a dream\nIt's all a ni..."
...,...,...,...
957,Queen Latifah,The World,"The world, oh, oh, oh, the world\nThe world, o..."
958,Queen Latifah,Trav’lin’ Light,I'm trav'lin' light\nBecause my man has gone\n...
959,Queen Latifah,Turn You On,Did I make you hot? Tell me\nI didn't mean to ...
960,Queen Latifah,U.N.I.T.Y.,"Uh, U.N.I.T.Y., U.N.I.T.Y. that's a unity\nU.N..."


In [121]:
all_lyrics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1909 entries, 0 to 961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  1909 non-null   object
 1   title   1909 non-null   object
 2   lyrics  1909 non-null   object
dtypes: object(3)
memory usage: 59.7+ KB


In [122]:
all_lyrics_df.describe()

Unnamed: 0,artist,title,lyrics
count,1909,1909,1909.0
unique,20,1857,1907.0
top,Lil Wayne,Black Friday,
freq,100,3,2.0


In [123]:
# let's check the distribution of data for each artist now
all_lyrics_df.artist.value_counts()

Lil Wayne               100
The Notorious B.I.G.     99
Kendrick Lamar           99
Queen Latifah            99
Snoop Dogg               99
Nas                      99
Megan Thee Stallion      98
J. Cole                  98
Rico Nasty               98
Trina                    98
Missy Elliott            98
Drake                    97
Kanye West               97
2Pac                     97
JAY-Z                    97
Lil’ Kim                 96
Nicki Minaj              95
Rapsody                  93
Remy Ma                  82
Cardi B                  70
Name: artist, dtype: int64