In [4]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import linregress
import sys
import os

In [5]:
#read the data sets
csv_path = "Resources/data.csv"
music_df = pd.read_csv(csv_path)

In [6]:
csv_path = "Resources/data_w_genres.csv"
genre_df = pd.read_csv(csv_path)

In [7]:
#checking if a random artist is on both data sets
genre_df.loc[(genre_df["artists"] == "Louis Armstrong")]

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
15222,"['adult standards', 'dixieland', 'harlem renai...",Louis Armstrong,0.790156,0.547011,230558.994728,0.289108,0.088578,0.273495,-12.883371,0.098098,111.638654,0.531922,20.652021,0,1,569


In [8]:
#clean the "artists" column in order to merge the two data sets for further analysis
spec_chars = ["[","]",'"',"'"]
for char in spec_chars:
    music_df['artists'] = music_df['artists'].str.replace(char, '')


In [12]:
#merge cvs with genres and the main date set
merge_genre = music_df.merge(genre_df,on=["artists"],how = "left")

Unnamed: 0,valence_x,year,acousticness_x,artists,danceability_x,duration_ms_x,energy_x,explicit,id,instrumentalness_x,...,instrumentalness_y,liveness_y,loudness_y,speechiness_y,tempo_y,valence_y,popularity_y,key_y,mode_y,count
59271,0.896,1940,0.991,Tasos Xalkias-Klarino,0.271,192213,0.587,0,1N2FssxTSKJTu3EmRaFZu8,0.949,...,0.379177,0.143833,-7.362,0.038733,147.594667,0.637,0.0,11.0,1.0,6.0
86236,0.128,1990,0.846,Matt Monro,0.271,165493,0.123,0,4ZEvI1ujaR0jnGw7ifAkbj,6e-06,...,0.010588,0.20969,-12.7483,0.031885,100.85345,0.323635,34.15,0.0,1.0,40.0
65872,0.878,1974,0.634,"David Cassidy, The Partridge Family",0.742,167733,0.387,0,5CXfTjIX7bAlFxSlMBaJVU,0.0,...,,,,,,,,,,
87325,0.824,1995,0.275,Industria del Amor,0.64,212400,0.549,0,2FPS82xiS3IiE792Q7ktSP,0.0015,...,0.001565,0.222681,-10.838381,0.036924,122.837095,0.660143,43.761905,0.0,1.0,42.0
151376,0.663,1998,0.0917,Beastie Boys,0.833,165140,0.728,0,3Qd5Bv2bRJ7JbeVY8hidKV,0.565,...,0.149061,0.232422,-9.438254,0.209689,121.909947,0.646698,43.671958,1.0,1.0,189.0
59246,0.174,1940,0.983,Giorgos Meintanas,0.248,216960,0.232,0,1I1oDxkeLqFN35do8ZuPhx,0.886,...,0.52775,0.30425,-9.87025,0.0468,129.96825,0.62725,0.0,5.0,0.0,8.0
4284,0.952,1943,0.775,Lead Belly,0.566,101851,0.456,0,07iRjjJcOIAttGkwF6ZGqq,1.1e-05,...,0.127189,0.20447,-11.944948,0.16529,119.450294,0.734498,3.022157,4.0,1.0,677.0
31143,0.132,1983,0.000626,Metallica,0.224,254560,0.886,0,543qcqtmy9eCke65RWRVaQ,0.834,...,0.236783,0.324996,-6.287531,0.097809,124.929991,0.345296,40.346491,4.0,1.0,456.0
65531,0.619,1972,0.641,"Harold Melvin & The Blue Notes, Sharon Paige",0.552,219533,0.563,0,2hD8PVG48UDB0e8HvRGVrf,0.000451,...,,,,,,,,,,
107881,0.891,2016,0.762,Kevin Morby,0.826,375903,0.423,0,0prVUHQu2xr6pI5iEbKtxo,0.0689,...,0.23445,0.10015,-10.6875,0.0371,109.3555,0.688,62.0,1.0,1.0,4.0


In [13]:
# Create dataframe with only necesary columns
year_df = merge_genre[["year","name","artists","popularity_x","genres"]]

In [14]:
year_df.head(20)

Unnamed: 0,year,name,artists,popularity_x,genres
0,1921,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...","Sergei Rachmaninoff, James Levine, Berliner Ph...",4,
1,1921,Clancy Lowered the Boom,Dennis Day,5,[]
2,1921,Gati Bali,KHP Kridhamardawa Karaton Ngayogyakarta Hadini...,5,[]
3,1921,Danny Boy,Frank Parker,3,[]
4,1921,When Irish Eyes Are Smiling,Phil Regan,2,[]
5,1921,Gati Mardika,KHP Kridhamardawa Karaton Ngayogyakarta Hadini...,6,[]
6,1921,The Wearing of the Green,John McCormack,4,"['irish ballad', 'vintage classical singing']"
7,1921,"Morceaux de fantaisie, Op. 3: No. 2, Prélude i...",Sergei Rachmaninoff,2,"['classical', 'post-romantic era', 'russian ro..."
8,1921,La Mañanita - Remasterizado,Ignacio Corsini,0,"['tango', 'vintage tango']"
9,1921,Il Etait Syndiqué,Fortugé,0,['vintage chanson']


In [None]:
#group by & find the max

In [15]:
#groupby years
max_popularity = year_df.groupby(["year"])["popularity_x"].max()
max_popularity = max_popularity.reset_index()

popular_by_year = max_popularity.merge(year_df,on=["year","popularity_x"],how = "left")
popular_by_year


Unnamed: 0,year,popularity_x,name,artists,genres
0,1921,6,Gati Mardika,KHP Kridhamardawa Karaton Ngayogyakarta Hadini...,[]
1,1922,6,Carve,Uli,"['corrido', 'nuevo regional mexicano', 'region..."
2,1923,42,Weather Bird,"Louis Armstrong, Earl Hines",
3,1923,42,Nobody Knows You When You're Down and Out,Bessie Smith,"['blues', 'harlem renaissance', 'jazz blues', ..."
4,1924,13,Mississippi Mud (feat. Bix Beiderbecke),"Paul Whiteman & His Orchestra, Bix Beiderbecke",
...,...,...,...,...,...
109,2017,87,Believer,Imagine Dragons,"['modern rock', 'pop', 'rock']"
110,2018,89,Lucid Dreams,Juice WRLD,"['chicago rap', 'melodic rap']"
111,2018,89,lovely (with Khalid),"Billie Eilish, Khalid",
112,2019,94,Watermelon Sugar,Harry Styles,"['pop', 'post-teen pop']"


In [16]:
#use loc function to find the top ten in each decate
decade_1 = year_df.loc[(year_df["year"]>=1921)&(year_df["year"]<=1930)]
top_ten_1 = decade_1.sort_values("popularity_x",ascending = False)
top_ten_1 = top_ten_1.head(10)
top_ten_1

Unnamed: 0,year,name,artists,popularity_x,genres
1606,1930,Monster Faladoré,Joe Quartz,55,[]
1406,1929,Mack the Knife,Louis Armstrong,52,"['adult standards', 'dixieland', 'harlem renai..."
1206,1928,"Sing, Sing, Sing",Benny Goodman,51,"['adult standards', 'bebop', 'big band', 'jazz..."
806,1926,Ain't Misbehavin',Fats Waller,49,"['adult standards', 'bebop', 'dixieland', 'har..."
1607,1930,"Hungarian Rhapsody No. 2 in C-Sharp Minor, S. ...","Franz Liszt, Vladimir Horowitz",48,
1207,1928,"Dark Was the Night, Cold Was the Ground",Blind Willie Johnson,47,"['acoustic blues', 'appalachian folk', 'blues'..."
807,1926,Two Sleepy People,Fats Waller,45,"['adult standards', 'bebop', 'dixieland', 'har..."
222,1923,Nobody Knows You When You're Down and Out,Bessie Smith,42,"['blues', 'harlem renaissance', 'jazz blues', ..."
1208,1928,It's Nobody's Fault but Mine,Blind Willie Johnson,42,"['acoustic blues', 'appalachian folk', 'blues'..."
221,1923,Weather Bird,"Louis Armstrong, Earl Hines",42,


In [17]:
decade_2 = year_df.loc[(year_df["year"]>=1931)&(year_df["year"]<=1940)]
top_ten_2 = decade_2.sort_values("popularity_x",ascending = False)
top_ten_2 = top_ten_2.head(10)
top_ten_2

Unnamed: 0,year,name,artists,popularity_x,genres
2206,1933,All of Me (with Eddie Heywood & His Orchestra),"Billie Holiday, Eddie Heywood",64,
3406,1939,"It's Been a Long, Long Time",Harry James,60,"['adult standards', 'big band', 'jazz trumpet'..."
3407,1939,Moonlight Serenade,Glenn Miller,54,"['adult standards', 'big band', 'easy listenin..."
3006,1937,Cross Road Blues,Robert Johnson,54,"['acoustic blues', 'blues', 'blues rock', 'del..."
2208,1933,Tea for Two,Art Tatum,53,"['bebop', 'big band', 'contemporary post-bop',..."
3551,1939,In the Mood - Live,Glenn Miller,52,"['adult standards', 'big band', 'easy listenin..."
2209,1933,Gloomy Sunday (with Teddy Wilson & His Orchest...,"Billie Holiday, Teddy Wilson",52,
2210,1933,Summertime,Billie Holiday,52,"['adult standards', 'harlem renaissance', 'jaz..."
3409,1939,Where Did You Sleep Last Night,Lead Belly,52,"['acoustic blues', 'appalachian folk', 'blues'..."
3408,1939,In the Mood,Glenn Miller,51,"['adult standards', 'big band', 'easy listenin..."


In [18]:
decade_3 = year_df.loc[(year_df["year"]>=1941)&(year_df["year"]<=1950)]
top_ten_3 = decade_3.sort_values("popularity_x",ascending = False)
top_ten_3 = top_ten_3.head(10)
top_ten_3

Unnamed: 0,year,name,artists,popularity_x,genres
4006,1942,White Christmas,"Bing Crosby, Ken Darby Singers, John Scott Tro...",76,
4607,1945,It's Beginning To Look Like Christmas,Bing Crosby,71,"['adult standards', 'easy listening']"
5006,1947,Here Comes Santa Claus (Right Down Santa Claus...,Gene Autry,70,"['adult standards', 'cowboy western', 'oklahom..."
4406,1944,Have Yourself A Merry Little Christmas,Judy Garland,70,"['adult standards', 'hollywood', 'lounge', 'to..."
4606,1945,Mele Kalikimaka (Merry Christmas),"Bing Crosby, The Andrews Sisters",69,
5407,1949,Gymnopédie No. 1,"Erik Satie, Philippe Entremont",67,
4608,1945,I'll Be Home For Christmas,Bing Crosby,66,"['adult standards', 'easy listening']"
4609,1945,God Rest Ye Merry Gentlemen,Bing Crosby,64,"['adult standards', 'easy listening']"
5406,1949,Twelve Days Of Christmas,"Bing Crosby, The Andrews Sisters",64,
5206,1948,"Whatever Will Be, Will Be (Que Sera, Sera) (wi...","Doris Day, Frank DeVol & His Orchestra",62,


In [19]:
decade_4 = year_df.loc[(year_df["year"]>=1951)&(year_df["year"]<=1960)]
top_ten_4 = decade_4.sort_values("popularity_x",ascending = False)
top_ten_4 = top_ten_4.head(10)
top_ten_4

Unnamed: 0,year,name,artists,popularity_x,genres
7406,1959,Let It Snow! Let It Snow! Let It Snow!,Dean Martin,81,"['adult standards', 'easy listening', 'lounge'..."
7606,1960,At Last,Etta James,76,"['adult standards', 'jazz blues', 'soul', 'sou..."
7409,1959,Johnny B. Goode,Chuck Berry,75,"['blues rock', 'classic rock', 'rock', 'rock-a..."
7006,1957,Blue Christmas,Elvis Presley,74,"['rock-and-roll', 'rockabilly']"
7207,1958,Jailhouse Rock,Elvis Presley,73,"['rock-and-roll', 'rockabilly']"
7007,1957,Dream A Little Dream Of Me - Single Version,"Ella Fitzgerald, Louis Armstrong",72,
7411,1959,Rudolph The Red-Nosed Reindeer,Dean Martin,70,"['adult standards', 'easy listening', 'lounge'..."
7206,1958,Come Fly With Me - Remastered,Frank Sinatra,70,"['adult standards', 'easy listening', 'lounge']"
7607,1960,A Sunday Kind Of Love,Etta James,70,"['adult standards', 'jazz blues', 'soul', 'sou..."
7608,1960,Sleigh Ride,Ella Fitzgerald,69,"['adult standards', 'jazz blues', 'swing', 'vo..."


In [20]:
decade_5 = year_df.loc[(year_df["year"]>=1961)&(year_df["year"]<=1970)]
top_ten_5 = decade_5.sort_values("popularity_x",ascending = False)
top_ten_5 = top_ten_5.head(10)
top_ten_5

Unnamed: 0,year,name,artists,popularity_x,genres
8406,1964,Rockin' Around The Christmas Tree,Brenda Lee,85,"['adult standards', 'brill building pop']"
8206,1963,It's the Most Wonderful Time of the Year,Andy Williams,83,"['adult standards', 'brill building pop', 'eas..."
9407,1969,Here Comes The Sun - Remastered 2009,The Beatles,82,"['beatlesque', 'british invasion', 'classic ro..."
9406,1969,Fortunate Son,Creedence Clearwater Revival,81,"['album rock', 'classic rock', 'country rock',..."
9007,1967,Ain't No Mountain High Enough,"Marvin Gaye, Tammi Terrell",80,
8207,1963,Sleigh Ride,The Ronettes,80,"['brill building pop', 'classic girl group', '..."
8606,1965,My Girl,The Temptations,79,"['brill building pop', 'classic soul', 'funk',..."
9006,1967,Brown Eyed Girl,Van Morrison,79,"['classic rock', 'folk', 'folk rock', 'mellow ..."
9410,1969,Come Together - Remastered 2009,The Beatles,79,"['beatlesque', 'british invasion', 'classic ro..."
8806,1966,California Dreamin' - Single Version,The Mamas & The Papas,79,"['brill building pop', 'bubblegum pop', 'class..."


In [21]:
decade_6 = year_df.loc[(year_df["year"]>=1971)&(year_df["year"]<=1980)]
top_ten_6 = decade_6.sort_values("popularity_x",ascending = False)
top_ten_6 = top_ten_6.head(10)
top_ten_6

Unnamed: 0,year,name,artists,popularity_x,genres
11006,1977,Dreams - 2004 Remaster,Fleetwood Mac,89,"['album rock', 'classic rock', 'mellow gold', ..."
11606,1980,Back In Black,AC/DC,84,"['album rock', 'australian rock', 'hard rock',..."
11406,1979,Highway to Hell,AC/DC,84,"['album rock', 'australian rock', 'hard rock',..."
10806,1976,Hotel California - 2013 Remaster,Eagles,83,"['album rock', 'classic rock', 'country rock',..."
10406,1974,Sweet Home Alabama,Lynyrd Skynyrd,82,"['album rock', 'blues rock', 'classic rock', '..."
11007,1977,The Chain - 2004 Remaster,Fleetwood Mac,82,"['album rock', 'classic rock', 'mellow gold', ..."
10006,1972,"Rocket Man (I Think It's Going To Be A Long, L...",Elton John,82,"['glam rock', 'mellow gold', 'piano rock', 'so..."
11008,1977,Mr. Blue Sky,Electric Light Orchestra,81,"['album rock', 'art rock', 'beatlesque', 'bow ..."
10606,1975,Landslide,Fleetwood Mac,80,"['album rock', 'classic rock', 'mellow gold', ..."
11610,1980,You Shook Me All Night Long,AC/DC,80,"['album rock', 'australian rock', 'hard rock',..."


In [22]:
decade_7 = year_df.loc[(year_df["year"]>=1981)&(year_df["year"]<=1990)]
top_ten_7 = decade_7.sort_values("popularity_x",ascending = False)
top_ten_7 = top_ten_7.head(10)
top_ten_7

Unnamed: 0,year,name,artists,popularity_x,genres
12208,1983,Every Breath You Take,The Police,84,"['album rock', 'art rock', 'classic rock', 'da..."
12607,1985,Take on Me,a-ha,84,"['album rock', 'classic rock', 'folk rock', 'm..."
12007,1982,Billie Jean,Michael Jackson,83,"['pop', 'r&b', 'soul']"
13606,1990,Thunderstruck,AC/DC,83,"['album rock', 'australian rock', 'hard rock',..."
12806,1986,Livin' On A Prayer,Bon Jovi,83,"['glam metal', 'rock']"
12006,1982,Africa,TOTO,83,"['album rock', 'classic rock', 'mellow gold', ..."
12406,1984,Summer Of '69,Bryan Adams,82,"['album rock', 'canadian pop', 'canadian singe..."
12606,1985,Everybody Wants To Rule The World,Tears For Fears,82,"['dance rock', 'new romantic', 'new wave', 'ne..."
13006,1987,I Wanna Dance with Somebody (Who Loves Me),Whitney Houston,82,"['dance pop', 'pop', 'urban contemporary']"
12206,1983,Sweet Dreams (Are Made of This) - Remastered,"Eurythmics, Annie Lennox, Dave Stewart",81,


In [23]:
decade_8 = year_df.loc[(year_df["year"]>=1991)&(year_df["year"]<=2000)]
top_ten_8 = decade_8.sort_values("popularity_x",ascending = False)
top_ten_8 = top_ten_8.head(10)
top_ten_8

Unnamed: 0,year,name,artists,popularity_x,genres
14406,1994,All I Want for Christmas Is You,Mariah Carey,88,"['dance pop', 'pop', 'r&b', 'urban contemporary']"
15608,2000,In the End,Linkin Park,84,"['alternative metal', 'nu metal', 'post-grunge..."
15606,2000,Yellow,Coldplay,84,"['permanent wave', 'pop']"
14006,1992,Jingle Bell Rock,Bobby Helms,83,[]
13812,1991,Losing My Religion,R.E.M.,82,"['alternative rock', 'classic rock', 'permanen..."
14206,1993,Creep,Radiohead,82,"['alternative rock', 'art rock', 'melancholia'..."
14607,1995,Gangsta's Paradise,"Coolio, L.V.",82,
15410,1999,Californication,Red Hot Chili Peppers,81,"['alternative rock', 'funk metal', 'funk rock'..."
13808,1991,Under the Bridge,Red Hot Chili Peppers,81,"['alternative rock', 'funk metal', 'funk rock'..."
14410,1994,Zombie,The Cranberries,81,"['irish rock', 'pop rock']"


In [24]:
decade_9 = year_df.loc[(year_df["year"]>=2001)&(year_df["year"]<2010)]
top_ten_9 = decade_9.sort_values("popularity_x",ascending = False)
top_ten_9 = top_ten_9.head(10)
top_ten_9

Unnamed: 0,year,name,artists,popularity_x,genres
16006,2002,'Till I Collapse,"Eminem, Nate Dogg",84,
16009,2002,The Scientist,Coldplay,84,"['permanent wave', 'pop']"
17207,2008,I'm Yours,Jason Mraz,83,"['acoustic pop', 'neo mellow', 'pop', 'pop rock']"
16806,2006,Last Christmas - Remastered,Wham!,83,"['dance pop', 'dance rock', 'disco', 'europop'..."
16008,2002,Without Me,Eminem,83,"['detroit hip hop', 'hip hop', 'rap']"
16611,2005,Hips Don't Lie (feat. Wyclef Jean),"Shakira, Wyclef Jean",83,
16839,2006,Promiscuous,"Nelly Furtado, Timbaland",82,
16616,2005,Fix You,Coldplay,82,"['permanent wave', 'pop']"
16407,2004,Yeah! (feat. Lil Jon & Ludacris),"Usher, Lil Jon, Ludacris",82,
16206,2003,Numb,Linkin Park,82,"['alternative metal', 'nu metal', 'post-grunge..."


In [25]:
decade_10 = year_df.loc[(year_df["year"]>=2011)&(year_df["year"]<2020)]
top_ten_10 = decade_10.sort_values("popularity_x",ascending = False)
top_ten_10 = top_ten_10.head(10)
top_ten_10

Unnamed: 0,year,name,artists,popularity_x,genres
19407,2019,Watermelon Sugar,Harry Styles,94,"['pop', 'post-teen pop']"
19426,2019,Breaking Me,"Topic, A7S",92,
19413,2019,Someone You Loved,Lewis Capaldi,91,"['pop', 'uk pop']"
19406,2019,Sofia,Clairo,90,"['bedroom pop', 'boston indie']"
19443,2019,Memories,Maroon 5,89,"['pop', 'pop rock']"
19207,2018,Lucid Dreams,Juice WRLD,89,"['chicago rap', 'melodic rap']"
19210,2018,lovely (with Khalid),"Billie Eilish, Khalid",89,
19441,2019,Dance Monkey,Tones And I,89,['australian pop']
19417,2019,Golden,Harry Styles,89,"['pop', 'post-teen pop']"
19409,2019,Circles,Post Malone,89,"['dfw rap', 'melodic rap', 'rap']"
