In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 5.2 MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=d61e56da3cde7952e3541c15e85e29ed927e0f6776b01ed72fa80455216112d9
  Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [3]:
import numpy as np 
import pandas as pd
from langdetect import detect

# Genius Data Clensing

Lyrical data obtained from the web is rather dirty. Efforts are needed to clean the data


In [4]:
#NOTE: Change this to your directory as needed
DATA_DIR = "/content/drive/MyDrive/W266 Project/w266-finalproj/data"

In [5]:
# Load data
metal = pd.read_csv(f"{DATA_DIR}/genius_data/metal.csv")
rap = pd.read_csv(f"{DATA_DIR}/genius_data/rappers.csv")
rock = pd.read_csv(f"{DATA_DIR}/genius_data/rockers.csv")
jazz = pd.read_csv(f"{DATA_DIR}/genius_data/jazz.csv")
folk = pd.read_csv(f"{DATA_DIR}/genius_data/folk.csv")
pop = pd.read_csv(f"{DATA_DIR}/genius_data/pop.csv")
rb = pd.read_csv(f"{DATA_DIR}/genius_data/rb.csv")
soul = pd.read_csv(f"{DATA_DIR}/genius_data/soul.csv")

FileNotFoundError: ignored

In [None]:
# concat all dataset together into 1
frames = [metal, rap, rock, jazz, folk, pop, rb, soul]
genius_df = pd.concat(frames)
genius_df = genius_df.loc[:, ~genius_df.columns.str.contains('^Unnamed')]
genius_df.to_csv(f"{DATA_DIR}/01_raw/raw-genius-data.csv")
genius_df.head(5)

Unnamed: 0,artist,genre,title,lyrics
0,Iron Maiden,metal,The Number of the Beast,"Woe to you, o'er Earth and Sea\nFor the Devil ..."
1,Iron Maiden,metal,Fear of the Dark,I am a man who walks alone\nAnd when I'm walki...
2,Iron Maiden,metal,The Trooper,You'll take my life but I'll take yours too\nY...
3,Iron Maiden,metal,Hallowed Be Thy Name,"I'm waiting in my cold cell, when the bell beg..."
4,Iron Maiden,metal,Run to the Hills,White man came across the sea\nHe brought us p...


In [None]:
# data clensing procedure


def clean_genius(data):
    """
        Function to clean the input dataset
        -----------------------------------
        
        Parameters: 
            - data : {pd.DataFrame(), input data}
            
        Returns:
            - data : {pd.DataFrame(), cleaned output data}
    """
    genius_df = data.copy()
    #take out observations that have NaN for ‘genre’ or ‘lyrics’
    subset = genius_df[['genre','lyrics']]
    genius_df.dropna(subset=['genre', 'lyrics'], inplace=True)
    
    # replace new line with space
    # genius_df = genius_df.replace({'\n': ' '}, regex=True)
    
    #get word count
    genius_df['word_num'] = genius_df['lyrics'].str.split().str.len()
    
    #observe that there are lots of songs w/ 1 word
    genius_df.sort_values(by = "word_num").head(100)
    
    #remove entries where only 1 word in song
    genius_df['word_num'].astype('int32')
    genius_df = genius_df[genius_df.word_num != 1]
    
    #anything with rap genius comments is way to long and includes comments
    genius_df = genius_df[~genius_df['lyrics'].str.contains("RAP GENIUS")]
    
    #notice that 124/130 of the songs with "Lyrics are just some string saying how there are no lyrics"
    genius_df = genius_df[genius_df['word_num'] !=18]
    
    #anything less than 10 seems to be junk lyrics
    genius_df = genius_df[genius_df['word_num'] > 10]
    return genius_df

data = clean_genius(genius_df)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54421 entries, 0 to 4209
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   artist    54421 non-null  object
 1   genre     54421 non-null  object
 2   title     54421 non-null  object
 3   lyrics    54421 non-null  object
 4   word_num  54421 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.5+ MB


In [None]:
data.head()

Unnamed: 0,artist,genre,title,lyrics,word_num
0,Iron Maiden,metal,The Number of the Beast,"Woe to you, o'er Earth and Sea\nFor the Devil ...",303
1,Iron Maiden,metal,Fear of the Dark,I am a man who walks alone\nAnd when I'm walki...,358
2,Iron Maiden,metal,The Trooper,You'll take my life but I'll take yours too\nY...,242
3,Iron Maiden,metal,Hallowed Be Thy Name,"I'm waiting in my cold cell, when the bell beg...",261
4,Iron Maiden,metal,Run to the Hills,White man came across the sea\nHe brought us p...,199


In [None]:
# Get rid of all rows whose lyrics column is NaN
data = data[data['lyrics'].notnull()]
non_strings = 0
bad_indices = []
for index, value in data['lyrics'].items():
    if type(value) != str:
        non_strings = non_strings + 1
        bad_indices.append(index)
# print(data[type(data['lyrics']) != str].shape )
print(non_strings)
print(data.shape)
print(len(bad_indices))

0
(54421, 5)
0


In [None]:
def language_detector(string):
    global i
    try:
        res = detect(string)
    except:
        res = "undetectable"
    if i % 1000 == 0:
        print(i)
    i = i + 1
    return res

i = 0
data['language'] = data['lyrics'].apply(language_detector)
data

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000


Unnamed: 0,artist,genre,title,lyrics,word_num,language
0,Iron Maiden,metal,The Number of the Beast,"Woe to you, o'er Earth and Sea\nFor the Devil ...",303,en
1,Iron Maiden,metal,Fear of the Dark,I am a man who walks alone\nAnd when I'm walki...,358,en
2,Iron Maiden,metal,The Trooper,You'll take my life but I'll take yours too\nY...,242,en
3,Iron Maiden,metal,Hallowed Be Thy Name,"I'm waiting in my cold cell, when the bell beg...",261,en
4,Iron Maiden,metal,Run to the Hills,White man came across the sea\nHe brought us p...,199,en
...,...,...,...,...,...,...
4205,Donny Hathaway,soul,"The Ghetto - Live @ Troubadour, Hollywood, CA.","Whoo, yeah\nMm-hmm\nYes\nThis is the Ghetto\nS...",262,en
4206,Donny Hathaway,soul,What’s going on - live version,"Mother, mother\nThere's too many of you crying...",159,en
4207,Donny Hathaway,soul,Hey girl - live version,Hey girl\nI've been watching you\nThe rapid be...,161,en
4208,Donny Hathaway,soul,Make It Your Own,"Yesterday, you were mine\nNow, you're gone\nI ...",88,en


In [None]:
data = data[data['language'] == "en"]

In [None]:
data.to_csv(f"{DATA_DIR}/02_intermediate/language-processed-data.csv")