In [None]:
import json
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline  

In [None]:
# Load the merged dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../final_merge.csv')

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# Column names available in the raw frame.
df.columns

In [None]:
# Preview the first rows of the raw frame.
# (Disabled experiment kept below: rename the first column to 'id'.)
#df.columns.values[0] = 'id'
df.head()

In [None]:
# Keep the songs that have a label in at least one of the three genre1_* columns.
# .copy() gives an independent frame that later cells can modify freely.
primary_genre_cols = ['genre1_cd1', 'genre1_cd2', 'genre1_cd2c']
df2 = df[df[primary_genre_cols].notna().any(axis=1)].copy()

In [None]:
# Number of songs per release year, as a two-column frame (year, counts).
df2_year = df2.groupby('year').size().rename('counts').reset_index()

First we check the number of songs per year in the dataset. As expected, the number of songs increases over the years, except for 2010; this is probably because 2010 was just ending when the dataset was created, so the songs from 2010 had not yet had time to reach their maximum popularity.

In [None]:
# Songs per year as a line chart. The first row of df2_year is skipped
# (presumably a placeholder year for songs with unknown release date — TODO confirm).
df2_year.iloc[1:].plot.line(x='year', y='counts')

As we can see, there are not many songs before the 1960s, so we drop these songs in order to carry out a meaningful analysis.

In [None]:
# Keep only the songs released after 1960 (the strict comparison also excludes
# 1960 itself); earlier years contain too few songs for a per-year analysis.
# .copy() detaches the result from the parent frame so the column assignments
# made in later cells do not raise SettingWithCopyWarning.
df2 = df2[df2.year > 1960].copy()

In [None]:
# Collect every distinct label (missing values included) found in the six genre columns.
genres_cols = ['genre1_cd2c', 'genre2_cd2c', 'genre1_cd2', 'genre2_cd2', 'genre1_cd1', 'genre2_cd1']
genres = set().union(*(df2[col_name].unique() for col_name in genres_cols))
print(genres)
print(len(genres))

We have 17 different genres (nan means unknown, and International is the same as World). To analyze meaningfully how the genres evolve over the years, the dataset must contain a minimum number of songs of each analyzed genre. In the following cells we first replace nan with Unknown and International with World.

In [None]:
# Normalise the genre labels in one pass: missing values become 'Unknown',
# and 'International' is merged into 'World'.
df2[genres_cols] = (
    df2[genres_cols]
    .fillna('Unknown')
    .replace('International', 'World')
)

In [None]:
# Recompute the set of labels after cleaning, then force the six genre columns to str.
genres_cols = ['genre1_cd2c', 'genre2_cd2c', 'genre1_cd2', 'genre2_cd2', 'genre1_cd1', 'genre2_cd1']
# The labels are gathered before the cast, so the set holds the values as stored.
genres = set().union(*(df2[col_name].unique() for col_name in genres_cols))
df2[genres_cols] = df2[genres_cols].astype(str)
print(genres)
print(len(genres))

In [None]:
# Preview the six cleaned genre-label columns.
df2[genres_cols].head()

For the moment we have 6 columns for the genres; we would like to see whether we can summarize them in one or two columns.
First we pivot the genre labels into one indicator column per genre and count the number of different genres each song has.

In [None]:
# One 0/1 indicator column per label: 1 when any of the six genre columns
# carries that label for the song, 0 otherwise.
for genre in list(genres):
    df2[genre] = df2[genres_cols].eq(genre).any(axis=1).astype(int)

# 'Unknown' is not a real genre: drop its indicator column and its set entry.
df2 = df2.drop(columns=['Unknown'])
genres.remove('Unknown')

In [None]:
# Number of distinct genres attached to each song: row-wise sum of the 0/1
# indicator columns. The columns are selected by name rather than as "the last
# 17 columns", so the count stays correct whatever the number or position of
# the indicator columns in the frame.
df2['nb_genre'] = df2[list(genres)].sum(axis=1)

In [None]:
# Distribution of the number of genres per song.
df2['nb_genre'].plot.hist()

So we see that the majority of the songs have 1 or 2 different genres; some also have 3 genres, and 4 genres is atypical. We can now drop the 6 columns containing the genre labels.

In [None]:
# The raw label columns are no longer needed now that the indicator columns exist.
df2 = df2.drop(genres_cols, axis=1)

Now, for each genre, we plot the number of songs per year.

In [None]:
# One panel per genre showing the number of songs per year; the same series
# are exported to counts.json.
n_cols = 3
# Ceiling division so that every genre gets a panel (rounding to the nearest
# integer yields one row too few whenever len(genres) % 3 == 1).
n_rows = (len(genres) + n_cols - 1) // n_cols
# squeeze=False keeps axarr two-dimensional even when there is a single row.
fig, axarr = plt.subplots(n_rows, n_cols, squeeze=False)
fig.set_size_inches(15, 20)
plt.subplots_adjust(hspace=.4)

all_data = {}
for i, genre in enumerate(genres):
    ax = axarr[i // n_cols, i % n_cols]
    data_genre = df2[df2[genre] == 1].groupby(['year']).size().reset_index(name='counts')
    data_genre.plot(x='year', y='counts', kind='line', title=genre, ax=ax)
    # .tolist() converts numpy scalars into plain Python values, which is
    # what the json module can serialise.
    all_data[genre] = {
        'years': data_genre['year'].astype(str).tolist(),
        'count': data_genre['counts'].tolist(),
    }

# Hide the panels left over when the grid has more cells than genres.
for ax in axarr.ravel()[len(genres):]:
    ax.set_visible(False)

# json.dump writes valid JSON (str(dict) would write a Python repr instead).
with open('counts.json', 'w') as out_file:
    json.dump(all_data, out_file)

These plots are useful to see the data we have in our hands. 

First, we observe that most of the music we have is rock, pop, pop_rock, electronic or metal. In contrast, World, Latin and blues are poorly represented. This can be explained either by a bias in the dataset or by some genres being more popular than others. Indeed, Latin music is under-represented although Latin culture is very important in the world. These observations could be made more precise by using only the total number of songs for each genre.

Second, these plots let us see some trends in the evolution of music. If we suppose the dataset is not too biased for the most represented genres, we can make some interesting observations. We can see that punk music suddenly appears in the middle of the 70's. Rock started in the 60's and has grown exponentially since then. Indeed, these plots are useful to tell when a genre appeared and how it has evolved since.

Now we want to look at how genres are connected, so let's construct a graph in which the nodes are the genres and an edge connects two genres when a song has both of them. The weight of an edge is the number of such songs.

In [None]:
# Genre co-occurrence matrix: entry (i, j) is the number of songs tagged with
# both genres[i] and genres[j]; the diagonal holds the song count of each genre.
genres = list(genres)  # fix an ordering so that matrix positions map to genre names
# (songs x genres) matrix of 0/1 indicators; its Gram matrix counts, for every
# pair of genres, the songs where both indicators are 1.
indicators = df2[genres].values
adj_mat_genres = indicators.T.dot(indicators).astype(float)

adj_mat_genres

In [None]:
# Wrap the co-occurrence matrix in a frame with one column per genre.
adj_df = pd.DataFrame(adj_mat_genres, columns=genres)

In [None]:
# Label each row with its genre name (rows follow the same order as the columns).
adj_df['genre'] = genres

In [None]:
# Node attributes: 'radius' is the square root of the genre's own song count
# (the diagonal of the matrix) and 'id' is the row position of the genre.
adj_df = adj_df.assign(
    radius=np.diag(adj_mat_genres) ** .5,
    id=range(len(adj_df)),
)

In [None]:
# One {'radius', 'id', 'genre'} record per genre (transposing makes to_dict key by row).
adj_df[['radius', 'id', 'genre']].T.to_dict().values()

In [None]:
import json
# Same per-genre records as in the previous cell, materialised as a list for display.
a = adj_df[['radius', 'id', 'genre']].T.to_dict().values()
list(a)

In [None]:
# Keep only the genre co-occurrence columns (drops 'genre', 'radius' and 'id').
# Selecting by name instead of "all but the last three columns" makes the cell
# idempotent: re-running it no longer strips genre columns.
adj_df = adj_df[list(genres)]

In [None]:
# Plain numpy view of the co-occurrence counts, indexed by genre position.
adj = np.asarray(adj_df)

In [None]:
# One edge per unordered pair of distinct genres; the stroke width is the
# number of shared songs divided by 1000.
n_nodes = len(adj)
edges = [
    {'source_id': src, 'target_id': dst, 'stroke_width': adj[src, dst]/1000}
    for src in range(n_nodes)
    for dst in range(src + 1, n_nodes)
]
edges

In [None]:
%%html
<!-- iframe is not a void element: it needs an explicit closing tag, and the attribute values are quoted. -->
<iframe src="http://www.cbinge.com/file/test.html" width="1000" height="1000"></iframe>

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Yearly average song hotness, overall and per genre, each smoothed with a
# small random forest fitted on the year. The per-genre results are plotted on
# one panel each, and everything is exported to hottness.json.
n_cols = 3
# Ceiling division so that every genre gets a panel (rounding to the nearest
# integer yields one row too few whenever len(genres) % 3 == 1).
n_rows = (len(genres) + n_cols - 1) // n_cols
# squeeze=False keeps axarr two-dimensional even when there is a single row.
fig, axarr = plt.subplots(n_rows, n_cols, squeeze=False)
fig.set_size_inches(15, 20)
plt.subplots_adjust(hspace=.4)
all_data = {}

# Years on which every fitted trend is evaluated. A frame with the same column
# name as the training data keeps scikit-learn's feature-name check quiet.
years_grid = pd.DataFrame({'year': np.arange(1960, 2011)})

# Compute the avg hotness by year for all the data.
# The target column is selected before .mean() so that non-numeric columns of
# df2 are never aggregated (this raises a TypeError with pandas >= 2.0).
hottness_avg = (
    df2[df2['song_hotttnesss'].notna()]
    .groupby('year')['song_hotttnesss']
    .mean()
    .reset_index()
)
# random_state makes the fitted trend reproducible across runs.
regr = RandomForestRegressor(n_estimators=10, n_jobs=-1, max_depth=5, random_state=0)
regr.fit(hottness_avg[['year']], hottness_avg['song_hotttnesss'])
hottness_predict = regr.predict(years_grid)
# .tolist() converts numpy scalars into plain Python values for the json module.
all_data['avg'] = {
    'years': hottness_avg['year'].astype(str).tolist(),
    'hottness': hottness_avg['song_hotttnesss'].tolist(),
    'predict': hottness_predict.tolist(),
}

# Compute it for each genre
for i, genre in enumerate(genres):
    ax = axarr[i // n_cols, i % n_cols]
    hottness = (
        df2[(df2[genre] == 1) & df2['song_hotttnesss'].notna()]
        .groupby('year')['song_hotttnesss']
        .mean()
        .reset_index()
    )
    # Years whose average hotness is not strictly positive are left out of the fit.
    hottness = hottness[hottness['song_hotttnesss'] > 0]
    if hottness.empty:
        # Nothing to fit or plot for this genre.
        ax.set_title(genre)
        continue
    regr = RandomForestRegressor(n_estimators=10, n_jobs=-1, max_depth=5, random_state=0)
    regr.fit(hottness[['year']], hottness['song_hotttnesss'])
    hottness.plot(x='year', y='song_hotttnesss', kind='scatter', title=genre, ax=ax, color='orange')
    hottness_predict = regr.predict(years_grid)
    ax.plot(years_grid['year'], hottness_predict)
    all_data[genre] = {
        'years': hottness['year'].astype(str).tolist(),
        'hottness': hottness['song_hotttnesss'].tolist(),
        'predict': hottness_predict.tolist(),
    }

# Hide the panels left over when the grid has more cells than genres.
for ax in axarr.ravel()[len(genres):]:
    ax.set_visible(False)

# json.dump writes valid JSON (str(dict) would write a Python repr instead).
with open('hottness.json', 'w') as out_file:
    json.dump(all_data, out_file)

# Milestone 3
### Analysis of the distribution of the different features for each genre
Look at the empirical probability function of the genre.
### Look at the influence of the year on these distributions
Is genre time invariant or not?
### Visualize the interesting results obtained
Visualization by using graph evolving with time.

In [None]:
# Export, for every genre, the summary statistics of the audio features as an
# HTML table in figures/<genre>.tab.
feature_cols = ['song_hotttnesss', 'duration', 'speechiness', 'acousticness', 'instrumentalness']
# Make sure the output directory exists before writing into it.
os.makedirs('figures', exist_ok=True)
for genre in genres:
    summary_html = df2.loc[df2[genre] == 1, feature_cols].describe().to_html()
    # The context manager closes the file even if the write fails.
    with open('figures/%s.tab' % genre, 'w') as out_file:
        out_file.write(summary_html)

In [None]:
# Columns of the working frame at this point of the notebook.
df2.columns

In [None]:
from sklearn.ensemble import RandomForestRegressor

# For every audio feature: compute its yearly average (overall and per genre),
# smooth it with a small random forest fitted on the year, print a short
# per-genre summary and export the series to <feature>.json.
for col in ['song_hotttnesss', 'duration', 'speechiness', 'acousticness', 'instrumentalness']:
    all_data = {}

    # Yearly average of the feature over all songs. The column is selected
    # before .mean() so that non-numeric columns of df2 are never aggregated
    # (this raises a TypeError with pandas >= 2.0).
    avg = df2[df2[col].notna()].groupby('year')[col].mean().reset_index()
    # random_state makes the fitted trend reproducible across runs.
    regr = RandomForestRegressor(n_estimators=10, n_jobs=-1, max_depth=5, random_state=0)
    # Fit on this feature's own years so that X and y always have the same length.
    regr.fit(avg[['year']], avg[col])
    predict = regr.predict(avg[['year']])
    # .tolist() converts numpy scalars into plain Python values for the json module.
    all_data['avg'] = {
        'years': avg['year'].astype(str).tolist(),
        col: avg[col].tolist(),
        'predict': predict.tolist(),
    }

    # Compute it for each genre
    print("%s: " % col)
    for genre in genres:
        datas = df2[(df2[genre] == 1) & df2[col].notna()]
        col_data = datas.groupby('year')[col].mean().reset_index()
        # Years whose average is not strictly positive are left out of the fit.
        col_data = col_data[col_data[col] > 0]
        if col_data.empty:
            # Nothing to fit for this genre/feature pair.
            print("\t %s: Nb songs = %d and no year with a positive average" % (genre, len(datas)))
            continue
        regr = RandomForestRegressor(n_estimators=10, n_jobs=-1, max_depth=5, random_state=0)
        regr.fit(col_data[['year']], col_data[col])
        # The genre trend is evaluated on the overall yearly grid of this
        # feature, i.e. the years stored in all_data['avg']['years'].
        predict = regr.predict(avg[['year']])
        all_data[genre] = {
            # Years of this genre's own averages, aligned with the values below.
            'years': col_data['year'].astype(str).tolist(),
            col: col_data[col].tolist(),
            'predict': predict.tolist(),
        }
        print("\t %s: Nb songs = %d and Avg Value is %f" % (genre, len(datas), col_data[col].mean()))

    # json.dump writes valid JSON (str(dict) would write a Python repr instead).
    with open('%s.json' % col, 'w') as out_file:
        json.dump(all_data, out_file)

In [None]:
# Number of songs with a known speechiness value.
int(df2['speechiness'].notna().sum())