In [78]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

pd.options.display.max_rows = 999

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Nirvana'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [21]:
table = soup.find_all('table', class_ = 'wikitable sortable plainrowheaders')[0]

column_names = ['track_name', 'writers', 'original_release', 'producers', 'release_year']

songs = []
writers = []
original = []
producers = []
year = []

rows = table.find_all('tr')

# SONGS

for row in rows[1:]:    
    if row.th.string:
        songs.append(row.th.string.replace("\n","").replace('"','').strip())
        continue
        
    if row.th.a:
        songs.append(row.th.a.string.replace("\n","").replace('"','').strip())
        continue
        
    if row.th.contents[0].name == 'span':
        if row.th.span.string == None:
            songs.append(row.th.contents[1].replace("\n","").replace('"','').strip())
            continue
        else:
            songs.append(row.th.contents[0].string.replace("\n","").replace('"','').strip())
            continue
    songs.append(row.th.contents[0].string.replace("\n","").replace('"','').strip())

# WRITERS

for row in rows:
    if row.td and row.td.a:
        writers.append(', '.join([string for string in row.td.strings if string.strip() 
                                  and not 'note' in string 
                                  and not ' (' in string 
                                  and not 'Unleashed' in string 
                                  and not 'cover' in string]))
    if row.td and not row.td.a:
        if not row.td.span:
            writers.append(row.td.string.strip())
        else:
            writers.append(row.td.span.string.strip())

# ORIGINAL RELEASES 

for row in rows[1:]:
    data = row.find_all('td')[1]
    if 'Non-album single' in ''.join([string for string in data.strings]):
        original.append('Non-album single')
        continue
        
    if data.i and data.i.a:
        original.append(data.i.a.string)
        continue
    elif data.i:
        original.append(data.i.string)
        continue
    else:
        original.append('Unknown')

# PRODUCERS

for row in rows[1:]:
    
    data = row.find_all('td')[2]
    
    if '–' in str(data):
        producers.append('Unknown')
        continue
        
    if data.br:
        p = []
        for a in data.find_all('a'):
            p.append(a.string)
        producers.append(', '.join(p))
        continue
        
    if data.span and data.span.a:
        producers.append(data.span.a.string)
        continue
    
    if data.span:
        producers.append(data.span.string)
        continue
        
    if not data.string:
        producers.append('Unknown')
        continue

# RELEASE YEAR

for row in rows[1:]:
    data = row.find_all('td')[3]
    year.append((int(data.string[0:4]) if data.string else -1))

    
print(len(writers))
print(len(songs))
print(len(original))
print(len(producers))
print(len(year))

124
124
124
124
124


In [90]:
df = pd.DataFrame({
    'track_name': songs,
    'album_name' : original,
    'release_year': year,
    'producers' : producers,
    'writers' : writers
})

### Cleaning the data

In [91]:
df.drop(labels=[120,121,122, 123], axis=0, inplace=True)

In [92]:
df['release_year'] = df['release_year'].astype('int')

In [93]:
df['writers'] = df['writers'].apply(lambda x: x.rstrip(',  †\n'))

In [94]:
# Capializing every word in every track and album title
df.loc[:,['track_name', 'album_name']] = df.loc[:,['track_name', 'album_name']].apply(lambda x: x.str.title())

In [101]:
# Cleaning missing data format
df.loc[df['producers'] == 'Unknown'] = ''
df.loc[df['writers'] == 'Unknown'] = ''

In [102]:
df['writers'].value_counts()

writers
Kurt Cobain                                           54
                                                      33
Kurt Cobain, Dave Grohl, Krist Novoselic               7
Traditional                                            4
Dave Grohl                                             4
Kurt Cobain, Krist Novoselic                           3
Eugene Kelly, Frances McKee                            3
Curt Kirkwood                                          3
Greg Sage                                              2
Bob Ezrin, Kim Fowley, Paul Stanley                    1
John Cale, Sterling Morrison, Lou Reed, Moe Tucker     1
Robbie van Leeuwen                                     1
David Bowie                                            1
Johnny Hedlund                                         1
Kurt Cobain, Mark Lanegan                              1
Gerald Casale, Mark Mothersbaugh                       1
Name: count, dtype: int64

### Storing the data inside a .csv file

In [103]:
df.to_csv('nirvana_wiki.csv', index=False)