# Feature analysis



In [None]:
import os
import sqlite3
from sqlite3 import Error

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

In [None]:
# Read the merged dataset and treat a missing song_hotttnesss as 0.0.
df = (
    pd.read_csv('../final_merge.csv')
    .assign(song_hotttnesss=lambda frame: frame['song_hotttnesss'].fillna(0.0))
)

Let's put the dataframe in a good form for the rest of the analysis.

First, we remove all the columns that are useless, or that we do not want to use to describe a song.

In [None]:
# Display the column labels of the loaded frame before choosing which ones to drop.
df.columns

In [None]:
# Columns that are not used to describe a song in the rest of the notebook.
columns_to_discard = [
    'Unnamed: 0',
    'artist_latitude',
    'artist_location',
    'artist_id',
    'artist_longitude',
    'song_id',
    'track_id',
    'artist_hotttnesss',
    'mode_x',
    'mode_y',
]
df_stage1 = df.drop(columns=columns_to_discard)

We will need a one-hot encoded matrix of the genres for the later analysis. As a first step, we collect all the genres that appear in the dataset.

In [None]:
# The six columns that carry a genre label for a song.
genres_cols = ['genre1_cd2c', 'genre2_cd2c', 'genre1_cd2', 'genre2_cd2', 'genre1_cd1', 'genre2_cd1']

# Union of the distinct values found in every genre column (missing values included).
genres = set().union(*(df_stage1[col_name].unique() for col_name in genres_cols))
print(genres)
print(len(genres))

We have 17 different genres (nan means unknown, and International is the same as World). To do a meaningful analysis of a genre over the years, a minimum number of songs of that genre must be in the dataset. In the following cells we first replace the nan values by Unknown and replace International by World.

In [None]:
# Normalise the genre labels: missing values become 'Unknown' and
# 'International' is merged into 'World'.
df_stage1[genres_cols] = (
    df_stage1[genres_cols]
    .fillna('Unknown')
    .replace('International', 'World')
)

Let's just check that the genre columns now have the expected form.

In [None]:
# Preview the first rows of the normalised genre columns.
df_stage1.loc[:, genres_cols].head()

For the moment we have 6 columns for the genres; we would like to summarize them in a more compact form.
First, we collect the distinct genre values found in these columns and count them; the next cell then pivots them into one indicator column per genre.

In [None]:
# Re-collect the genre labels after NaN / 'International' have been normalised.
genres_cols = ['genre1_cd2c', 'genre2_cd2c', 'genre1_cd2', 'genre2_cd2', 'genre1_cd1', 'genre2_cd1']

# Cast to str BEFORE collecting the labels: the set must contain exactly the
# values stored in the columns, otherwise a non-string label would be kept in
# `genres` under its original type and never match the (string) column later.
df_stage1[genres_cols] = df_stage1[genres_cols].astype(str)

genres = set()
for col_name in genres_cols:
    genres |= set(df_stage1[col_name].unique())
print(genres)
print(len(genres))

In [None]:
# 'Unknown' only marks a missing label, so it gets no indicator column.
# discard() (instead of remove()) keeps this cell re-runnable.
genres.discard('Unknown')

# One 0/1 indicator column per genre: 1 when any of the six genre columns
# carries that label. Iterating in sorted order makes the column order
# deterministic (plain set iteration order can change between kernel sessions).
for genre in sorted(genres, key=str):
    df_stage1[genre] = df_stage1[genres_cols].eq(genre).any(axis=1).astype('int64')

Let's make sure that we now have a one-hot encoded matrix at the end of the dataframe.

In [None]:
# The raw genre label columns are no longer needed once the indicator columns exist.
df_stage2 = df_stage1.drop(columns=genres_cols).copy()
df_stage2.head()

Now that the matrix is ready, let's see if we can spot something from a simple correlation between the data, especially between song_hotttnesss and the rest of the features.

In [None]:
def plot_corr(df, output_file_name):
    '''
        Plot the correlation matrix of a dataframe and save the figure to disk
        The plot will be triangular with negative values blue and positive values red
        Only numeric and boolean columns take part in the correlation
        Note: calling this function changes the global seaborn style (sns.set)
        Code taken from https://seaborn.pydata.org/examples/many_pairwise_correlations.html
        :param df: The dataframe
        :type df: DataFrame
        :param output_file_name: Path of the image file the figure is saved to
        :type output_file_name: str
    '''
    sns.set(style="white", font_scale=1.5)

    # Compute the correlation matrix.
    # Non-numeric columns are filtered out explicitly: older pandas dropped them
    # silently inside corr(), while pandas >= 2.0 raises on them.
    corr = df.select_dtypes(include=['number', 'bool']).corr()

    # Generate a mask for the upper triangle (diagonal included).
    # The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
    mask = np.triu(np.ones(corr.shape, dtype=bool))

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(16, 12))
    f.suptitle("Correlation heatmap")

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

    f.savefig(output_file_name)


A final remark about song year. Most of them are in fact 0:

In [None]:
# Number of songs whose year is recorded as 0.
(df_stage2['year'] == 0).sum()

Let's remove them for the correlation.

In [None]:
# Select the non-genre columns by name instead of a hard-coded positional slice,
# so the selection stays correct if the number of genre columns changes.
stage2_feature_cols = [col for col in df_stage2.columns if col not in genres]

# Songs with year == 0 are left out of the correlation.
plot_corr(df_stage2.loc[df_stage2['year'] != 0, stage2_feature_cols], "../datastory/figures/correlationComplete.png")

What do we see here ?

1) the Spotify and MSD values for duration are highly correlated, and the same holds for loudness and key, so let's keep the original ones

2) track_popularity and song_hotttnesss are moderately correlated, thus we will keep song_hotttnesss

3) the year from MSD and the album release from Spotify are not correlated at all, so let's keep the one from MSD again

4) The acousticness has negative correlation with loudness and energy (which seems logical, as classical music is generally not loud or energetic)

5) no feature looks highly, even moderately, correlated to the song_hotttnesss

Let's drop the useless columns again

In [None]:
# Drop one column of each duplicated pair, then strip the merge suffix from the kept one.
# NOTE(review): for tempo the `_y` column is kept whereas key and loudness keep `_x` - confirm this is intended.
df_stage3 = (
    df_stage2
    .drop(columns=['album_release', 'track_popularity', 'duration_ms', 'key_y', 'loudness_y', 'tempo_x'])
    .rename(columns={'key_x': 'key', 'loudness_x': 'loudness', 'tempo_y': 'tempo'})
)

In [None]:
# Select the non-genre columns by name instead of a hard-coded positional slice,
# so the selection stays correct if the number of genre columns changes.
stage3_feature_cols = [col for col in df_stage3.columns if col not in genres]

# Songs with year == 0 are left out of the correlation.
plot_corr(df_stage3.loc[df_stage3['year'] != 0, stage3_feature_cols], "../datastory/figures/correlationReduce.png")

And let's do the same thing for genre only.

In [None]:
# Genre indicator columns, picked by name (in frame order) rather than by a
# hard-coded "last 17 columns" slice.
stage3_genre_cols = [col for col in df_stage3.columns if col in genres]

plot_corr(df_stage3[['song_hotttnesss'] + stage3_genre_cols], "../datastory/figures/correlationGenre.png")

We now want to check whether we can train a regressor and export the feature importances. Here we use a random forest like in HW4, but a regressor this time.

In [None]:
# joblib was removed from sklearn.externals in scikit-learn 0.23; prefer the
# standalone package and fall back to the bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# True: fit the forest in this cell. False: load a previously saved model.
COMPUTE = True
# Fixed seed so that the fitted forest and its importances are reproducible.
RANDOM_STATE = 42

# The training frame is built outside the COMPUTE switch because the following
# cells read `train_set.columns` even when the model is loaded from disk.
stage4_genre_cols = [col for col in df_stage3.columns if col in genres]

df_stage4 = df_stage3.dropna(axis=0, how='any')
df_stage4 = df_stage4[df_stage4.year > 0]
# Keep only the songs that have at least one known genre.
df_stage4 = df_stage4[df_stage4[stage4_genre_cols].sum(axis=1) > 0]

train_set = df_stage4.drop(columns=['song_hotttnesss'])
train_label = df_stage4['song_hotttnesss']

if COMPUTE:

    regressor = RandomForestRegressor(n_jobs=-1, n_estimators=400, verbose=True,
                                      random_state=RANDOM_STATE)
    regressor.fit(train_set, train_label)

    # Uncomment to persist the fitted model for the COMPUTE = False path.
    #joblib.dump(regressor, 'completeRegressor.pkl')

else:

    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code - only load a model file that you produced yourself.
    regressor = joblib.load('completeRegressor.pkl')


Let's check the importance per feature, in order to extract some useful insights. 

Note that we sum the importances of all the genre columns to see what the genres mean for the random forest as a whole: the importances of several features can be added up to get the importance of that bag of features.

In [None]:
# (importance, column name) pairs, most important first.
ranked_importances = sorted(
    zip(regressor.feature_importances_, train_set.columns),
    reverse=True,
)

for importance, feature_name in ranked_importances:
    print(importance, " : ", feature_name)

# Accumulate the importance carried by the genre indicator columns.
genre_sum = 0.0
for importance, feature_name in ranked_importances:
    if feature_name not in genres:
        continue
    genre_sum += importance

genre_sum

Note that, with a summed importance of about 0.07, the genres taken together as one bag of features have roughly the same importance as the other features.

It looks like we have some really interesting features, but let's create a graph to visualize it.

In [None]:
# Number of individual features shown next to the aggregated 'genre' entry.
TOP_FEATURES = 12

ranked_importance_df = pd.DataFrame({
    'label': train_set.columns,
    'feature_importance': regressor.feature_importances_,
}).sort_values('feature_importance', ascending=False)

# The aggregated genre entry is added by column NAME. A positional assignment
# such as `.iloc[12] = [genre_sum, 'genre']` depends on the column order of the
# frame and writes the two values into the wrong columns when 'label' comes first.
genre_row = pd.DataFrame({'label': ['genre'], 'feature_importance': [genre_sum]})
feature_importance_df = pd.concat(
    [ranked_importance_df.iloc[:TOP_FEATURES], genre_row],
    ignore_index=True,
)

# Labels from the most to the least important, used as the bar order of the plot.
order = feature_importance_df.sort_values(by='feature_importance')['label'].values[::-1]

In [None]:
# Horizontal bar chart of the selected feature importances, saved for the data story.
sns.set(font_scale=2)
fig, ax = plt.subplots(figsize=(20, 20))

snsplot = sns.barplot(
    data=feature_importance_df,
    x='feature_importance',
    y='label',
    order=order,
    palette="Blues_d",
    ax=ax,
)

fig.savefig("../datastory/figures/RandomForest_feature_importance.png")

In [None]:
def plot_scatter(df, y_name, x_name, output_dir="figures/scatter_feature"):
    '''
        Scatter plot of one column against another with a red regression line
        The figure is saved as <output_dir>/scatter_<x_name>.png
        NOTE(review): the other figures of this notebook are written under
        ../datastory/figures - confirm that this relative location is intended
        :param df: The dataframe holding both columns
        :type df: DataFrame
        :param y_name: Name of the column plotted on the y axis
        :type y_name: str
        :param x_name: Name of the column plotted on the x axis
        :type x_name: str
        :param output_dir: Directory the image is written to, created if missing
        :type output_dir: str
    '''
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    sns_plot = sns.lmplot(x=x_name, y=y_name, data=df,
                          scatter_kws={"s": 2}, line_kws={'color': 'red'})
    sns_plot.savefig(os.path.join(output_dir, "scatter_" + x_name + ".png"))

In [None]:
# Songs without any missing value.
df_with_correct_feature = df_stage3.dropna(axis=0, how='any').copy()

# Songs with a known (non-zero) year.
df_with_correct_year = df_stage3[df_stage3.year > 0].copy()

# Songs with at least one known genre. The indicator columns are selected by
# name rather than by a hard-coded "last 17 columns" slice.
genre_indicator_cols = [col for col in df_stage3.columns if col in genres]
has_known_genre = df_stage3[genre_indicator_cols].sum(axis=1) > 0
df_with_correct_genre = df_stage3[has_known_genre].copy()

print("correct feature len: ", len(df_with_correct_feature))
print("correct year len: ", len(df_with_correct_year))
print("correct genre len: ", len(df_with_correct_genre))

In [None]:
# One scatter plot per non-genre feature. Genre indicator columns are skipped by
# name (instead of a hard-coded positional slice); year is plotted separately
# and song_hotttnesss is the y axis itself.
columns_without_scatter = set(genres) | {"year", "song_hotttnesss"}
for feature in df_with_correct_feature.columns:
    if feature not in columns_without_scatter:
        plot_scatter(df_with_correct_feature, "song_hotttnesss", feature)

In [None]:
# Hotness against release year, using only the songs whose year is known.
plot_scatter(df=df_with_correct_year, y_name="song_hotttnesss", x_name="year")