https://github.com/fivethirtyeight/data/blob/master/classic-rock/classic-rock-song-list.csv

#Step 1: Reading in the Data
 

##1a) Import All Libraries

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns

##1b) Load the CSV

###1b(i) Option 1: Rename the Column

In [8]:
column_names = ['song', 
                'artist', 
                'release', 
                'song_artist', 
                'first', 
                'year', 
                'playcount', 
                'fg',]

df = pd.read_csv('https://www.alvinang.sg/s/rock.csv', 
                 names=column_names, 
                 skiprows=1)

df.head()

#load the dataset and rename the columns

Unnamed: 0,song,artist,release,song_artist,first,year,playcount,fg
0,Caught Up in You,.38 Special,1982.0,Caught Up in You by .38 Special,1,1,82,82
1,Fantasy Girl,.38 Special,,Fantasy Girl by .38 Special,1,0,3,0
2,Hold On Loosely,.38 Special,1981.0,Hold On Loosely by .38 Special,1,1,85,85
3,Rockin' Into the Night,.38 Special,1980.0,Rockin' Into the Night by .38 Special,1,1,18,18
4,Art For Arts Sake,10cc,1975.0,Art For Arts Sake by 10cc,1,1,1,1


###1b)(ii) Option 2: Rename the Column

In [9]:
df = pd.read_csv('https://www.alvinang.sg/s/rock.csv')

rename_map = {
#Original column: renamed column
    'Song Clean':    'song', 
    'ARTIST CLEAN':  'artist', 
    'Release Year':  'release', 
    'COMBINED':      'song_artist', 
    'First?':        'first', 
    'Year?':         'year', 
    'PlayCount':     'playcount', 
    'F*G':           'fg'
}

df.rename(columns=rename_map, inplace=True)
df.head()

Unnamed: 0,song,artist,release,song_artist,first,year,playcount,fg
0,Caught Up in You,.38 Special,1982.0,Caught Up in You by .38 Special,1,1,82,82
1,Fantasy Girl,.38 Special,,Fantasy Girl by .38 Special,1,0,3,0
2,Hold On Loosely,.38 Special,1981.0,Hold On Loosely by .38 Special,1,1,85,85
3,Rockin' Into the Night,.38 Special,1980.0,Rockin' Into the Night by .38 Special,1,1,18,18
4,Art For Arts Sake,10cc,1975.0,Art For Arts Sake by 10cc,1,1,1,1




---



#Step 2: Check for NaNs

##2a) df.info()

In [10]:
df.info()
#'release' column has MANY NaNs --> due to 1653 non-null rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2230 entries, 0 to 2229
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   song         2230 non-null   object
 1   artist       2230 non-null   object
 2   release      1653 non-null   object
 3   song_artist  2230 non-null   object
 4   first        2230 non-null   int64 
 5   year         2230 non-null   int64 
 6   playcount    2230 non-null   int64 
 7   fg           2230 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 139.5+ KB


##2b) Total Up All NaNs

In [11]:
df.isnull().sum()
#'release' column has 577 rows of NaNs

song             0
artist           0
release        577
song_artist      0
first            0
year             0
playcount        0
fg               0
dtype: int64

##2c) Display all the Release NaN Rows

In [12]:
df[df['release'].isnull()]

Unnamed: 0,song,artist,release,song_artist,first,year,playcount,fg
1,Fantasy Girl,.38 Special,,Fantasy Girl by .38 Special,1,0,3,0
10,"Baby, Please Don't Go",AC/DC,,"Baby, Please Don't Go by AC/DC",1,0,1,0
13,CAN'T STOP ROCK'N'ROLL,AC/DC,,CAN'T STOP ROCK'N'ROLL by AC/DC,1,0,5,0
16,Girls Got Rhythm,AC/DC,,Girls Got Rhythm by AC/DC,1,0,24,0
24,Let's Get It Up,AC/DC,,Let's Get It Up by AC/DC,1,0,4,0
...,...,...,...,...,...,...,...,...
2216,"I'm Bad, I'm Nationwide",ZZ Top,,"I'm Bad, I'm Nationwide by ZZ Top",1,0,10,0
2218,Just Got Paid,ZZ Top,,Just Got Paid by ZZ Top,1,0,2,0
2221,My Head's In Mississippi,ZZ Top,,My Head's In Mississippi by ZZ Top,1,0,1,0
2222,Party On The Patio,ZZ Top,,Party On The Patio by ZZ Top,1,0,14,0


##2d) Replace all NaNs with Zeros

###2d(i) Option 1

In [13]:
df.loc[df['release'].isnull(), 'release'] = 0
df.head()

Unnamed: 0,song,artist,release,song_artist,first,year,playcount,fg
0,Caught Up in You,.38 Special,1982,Caught Up in You by .38 Special,1,1,82,82
1,Fantasy Girl,.38 Special,0,Fantasy Girl by .38 Special,1,0,3,0
2,Hold On Loosely,.38 Special,1981,Hold On Loosely by .38 Special,1,1,85,85
3,Rockin' Into the Night,.38 Special,1980,Rockin' Into the Night by .38 Special,1,1,18,18
4,Art For Arts Sake,10cc,1975,Art For Arts Sake by 10cc,1,1,1,1


###2d)(ii) Option 2: Alternative Way to Replace NaN with 0s

In [14]:
df['release'].fillna(0)

0       1982
1          0
2       1981
3       1980
4       1975
        ... 
2225       0
2226    1981
2227    1975
2228    1983
2229    1973
Name: release, Length: 2230, dtype: object

##2e) Check NaNs in Release Column

In [15]:
df['release'].isnull().sum()

#no more NaNs in Release Column!

0



---



#Step 3: Changing from String to Integer Type

##3a) df.info()

In [16]:
df.info()
#we want to change the Release column to 
#integer

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2230 entries, 0 to 2229
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   song         2230 non-null   object
 1   artist       2230 non-null   object
 2   release      2230 non-null   object
 3   song_artist  2230 non-null   object
 4   first        2230 non-null   int64 
 5   year         2230 non-null   int64 
 6   playcount    2230 non-null   int64 
 7   fg           2230 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 139.5+ KB


##3b) Attempting to change Release column fromn String to Integer

In [17]:
df['release'] = df['release'].map(lambda x: int(x))

#Value error: 'SONFACTS.COM'

ValueError: ignored

##3c) Check All Unique Values in Release Column

In [18]:
df.release.unique()

#there is Dirty Data 'SONGFACTS.COM'

array(['1982', 0, '1981', '1980', '1975', '2000', '2002', '1992', '1985',
       '1993', '1976', '1995', '1979', '1984', '1977', '1990', '1986',
       '1974', '2014', '1987', '1973', '2001', '1989', '1997', '1971',
       '1972', '1994', '1970', '1966', '1965', '1983', '1955', '1978',
       '1969', '1999', '1968', '1988', '1962', '2007', '1967', '1958',
       '1071', '1996', '1991', '2005', '2011', '2004', '2012', '2003',
       '1998', '2008', '1964', '2013', '2006', 'SONGFACTS.COM', '1963',
       '1961'], dtype=object)

##3d) Change SONGFACTS to 1972

In [19]:
df[df['release'] == "SONGFACTS.COM"]

Unnamed: 0,song,artist,release,song_artist,first,year,playcount,fg
1504,Bullfrog Blues,Rory Gallagher,SONGFACTS.COM,Bullfrog Blues by Rory Gallagher,1,1,1,1


In [20]:
df.loc[
       df['release'] == "SONGFACTS.COM", 
       'release']\
       =1972

In [21]:
df.iloc[1504:1505]
#Row 1504 Release has been changed to 1972

Unnamed: 0,song,artist,release,song_artist,first,year,playcount,fg
1504,Bullfrog Blues,Rory Gallagher,1972,Bullfrog Blues by Rory Gallagher,1,1,1,1


##3e) Retrying Conversion

In [22]:
df['release'] = df['release'].\
                map(lambda x: int(x))

In [23]:
df.info()

#Release has been changed to Integer!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2230 entries, 0 to 2229
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   song         2230 non-null   object
 1   artist       2230 non-null   object
 2   release      2230 non-null   int64 
 3   song_artist  2230 non-null   object
 4   first        2230 non-null   int64 
 5   year         2230 non-null   int64 
 6   playcount    2230 non-null   int64 
 7   fg           2230 non-null   int64 
dtypes: int64(5), object(3)
memory usage: 139.5+ KB




---



#Step 4: Adjusting the Minimum Release Year 

##4a) df.describe()

In [24]:
df['release'].describe()

#maximum year of release = 2014
#but Minimum year of release = 0?
#something is wrong???

count    2230.000000
mean     1466.215695
std       866.706564
min         0.000000
25%         0.000000
50%      1973.000000
75%      1981.000000
max      2014.000000
Name: release, dtype: float64

##4b) Adjusting the Min Release Year

In [25]:
min_release_year = df[df['release']>0]

#minimum release year allowed must be > 0

In [26]:
min_release_year['release'].describe()

#note that the earliest release year = 1071
#isn't it strange for songs to be releaseed 1071???

count    1653.000000
mean     1978.016334
std        24.184378
min      1071.000000
25%      1971.000000
50%      1977.000000
75%      1984.000000
max      2014.000000
Name: release, dtype: float64

##4c) Imposing a Cut Off for Release Year

In [27]:
min_release_year[df.release == 1071]

#row 547 has release year = 1071
#obviously this is wrong, its corrupted and should be replaced with something else

  """Entry point for launching an IPython kernel.


Unnamed: 0,song,artist,release,song_artist,first,year,playcount,fg
547,Levon,Elton John,1071,Levon by Elton John,1,1,8,8


In [28]:
above_1071 = df[df['release']>1071]

In [29]:
above_1071.release.describe()

#now minimum year of release is 1955

count    1652.000000
mean     1978.565375
std         9.308364
min      1955.000000
25%      1971.000000
50%      1977.000000
75%      1984.000000
max      2014.000000
Name: release, dtype: float64

In [30]:
above_1071.iloc[400:403]

#now Row 547 has been removed!

Unnamed: 0,song,artist,release,song_artist,first,year,playcount,fg
546,Honky Cat,Elton John,1972,Honky Cat by Elton John,1,1,2,2
548,Madman Across The Water,Elton John,1971,Madman Across The Water by Elton John,1,1,1,1
549,Rocket Man,Elton John,1972,Rocket Man by Elton John,1,1,68,68




---



#Step 5: Creating a Function to Validate if Release Year < 1970

##5a) Create the Function

In [60]:
def release_inspector(row):
    print('---------------------------------------------------------')
    print('Title:', row['song'],'|', 
          'Artiste:', row['artist'], '|', 
          'Release:', row['release'], '|', 
          'Is it < 1970?:',  
          row['release'] < 1970)
    return None

##5b) Apply the Function

In [61]:
df.sample(5).apply(release_inspector, axis = 1)

---------------------------------------------------------
Title: Those Shoes | Artiste: Eagles | Release: 1979 | Is it < 1970?: False
---------------------------------------------------------
Title: Atomic Punk | Artiste: Van Halen | Release: 0 | Is it < 1970?: True
---------------------------------------------------------
Title: Mista Bone | Artiste: Great White | Release: 0 | Is it < 1970?: True
---------------------------------------------------------
Title: You Better Run | Artiste: Pat Benatar | Release: 1980 | Is it < 1970?: False
---------------------------------------------------------
Title: Funk #49 | Artiste: Joe Walsh | Release: 2012 | Is it < 1970?: False


513     None
2117    None
730     None
1239    None
904     None
dtype: object



---



#THE END