In [1]:
from tqdm import tqdm
from pandas.plotting import scatter_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv('./data/spotify_personal_kaggle.csv')

In [3]:
# Inspect the dtype of every column in the loaded frame
df.dtypes

Artist               object
Track Name           object
Like                float64
Danceability        float64
Energy              float64
Key                   int64
Loudness            float64
Mode                  int64
Speechiness         float64
Acousticness        float64
Instrumentalness    float64
Liveness            float64
Valence             float64
Tempo               float64
Duration_ms           int64
Year                  int64
Popularity            int64
Explicit              int64
dtype: object

In [4]:
# Split the dataset on the Like flag: rows with Like == 1 vs rows with Like == 0
liked_tracks = df.loc[df['Like'].eq(1)]
not_liked_tracks = df.loc[df['Like'].eq(0)]
# Report the row count of each subset
print(len(liked_tracks), len(not_liked_tracks))

2813 2813


In [5]:
# Descriptive statistics for the liked subset (Like == 1)
liked_tracks.describe()

Unnamed: 0,Like,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_ms,Year,Popularity,Explicit
count,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0
mean,1.0,0.59965,0.615434,5.196232,-8.551616,0.678635,0.134825,0.288913,0.086546,0.208527,0.507857,118.399158,245123.7,2006.656594,48.53786,0.298258
std,0.0,0.170855,0.232679,3.657728,5.444498,0.467084,0.143219,0.301718,0.245938,0.185288,0.242163,30.870459,96891.55,14.822207,19.053615,0.457575
min,1.0,0.0,0.00189,0.0,-39.219,0.0,0.0,6e-06,0.0,0.0185,0.0,0.0,28643.0,1956.0,0.0,0.0
25%,1.0,0.485,0.481,1.0,-9.965,0.0,0.0389,0.0374,0.0,0.0959,0.322,93.817,190514.0,2000.0,37.0,0.0
50%,1.0,0.608,0.647,5.0,-7.115,1.0,0.0638,0.178,2e-06,0.129,0.512,115.049,228293.0,2013.0,50.0,0.0
75%,1.0,0.728,0.792,8.0,-5.237,1.0,0.196,0.468,0.000795,0.262,0.696,140.088,278067.0,2018.0,63.0,1.0
max,1.0,0.985,0.999,11.0,4.638,1.0,0.946,0.996,0.996,0.991,0.989,244.091,1398240.0,2020.0,99.0,1.0


In [6]:
# Descriptive statistics for the not-liked subset (Like == 0)
not_liked_tracks.describe()

Unnamed: 0,Like,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_ms,Year,Popularity,Explicit
count,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0
mean,0.0,0.542208,0.483556,5.248489,-11.503354,0.70423,0.096626,0.495436,0.164734,0.205976,0.526585,116.722717,228292.8,1977.172769,31.855315,0.090295
std,0.0,0.180529,0.267048,3.504889,5.684588,0.456469,0.157697,0.377869,0.312558,0.176137,0.263368,30.459917,130757.2,26.21622,22.091641,0.286655
min,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5108.0,1921.0,0.0,0.0
25%,0.0,0.419,0.255,2.0,-14.823,0.0,0.0346,0.0938,0.0,0.0984,0.311,93.861,168320.0,1956.0,12.0,0.0
50%,0.0,0.553,0.479,5.0,-10.532,1.0,0.0446,0.501,0.000216,0.134,0.538,115.397,205840.0,1978.0,34.0,0.0
75%,0.0,0.679,0.698,8.0,-7.204,1.0,0.0744,0.89,0.0977,0.263,0.735,135.735,259613.0,1999.0,49.0,0.0
max,0.0,0.976,1.0,11.0,-0.826,1.0,0.967,0.996,0.999,0.987,0.996,214.817,4120258.0,2020.0,91.0,1.0


In [7]:
# Compute summary statistics for the whole dataset and keep them in
# `summary_stats`; the result is only assigned here, nothing is displayed.
summary_stats = df.describe()

In [8]:
# Columns analysed in the rest of the notebook (everything except
# 'Artist', 'Track Name' and the 'Like' label), one per line.
features = [
    'Danceability',
    'Energy',
    'Key',
    'Loudness',
    'Mode',
    'Speechiness',
    'Acousticness',
    'Instrumentalness',
    'Liveness',
    'Valence',
    'Tempo',
    'Duration_ms',
    'Year',
    'Popularity',
    'Explicit',
]

In [9]:
# For every feature, overlay the density histograms (with KDE curves) of the
# liked and not-liked subsets and save the figure as a PNG under ./eda/.
# `plots` maps each feature name to the path of its saved image.
plots = {}
for feature in tqdm(features):
    fig, ax = plt.subplots(figsize=(14, 6))
    pretty_name = feature.capitalize()

    # Liked tracks in blue
    sns.histplot(
        liked_tracks[feature],
        color='blue', kde=True, label='Liked',
        stat="density", bins=30, element="step",
        ax=ax,
    )

    # Not-liked tracks in red, drawn on the same axes
    sns.histplot(
        not_liked_tracks[feature],
        color='red', kde=True, label='Not Liked',
        stat="density", bins=30, element="step",
        ax=ax,
    )

    ax.set_title(f'Distribution of {pretty_name}')
    ax.set_xlabel(pretty_name)
    ax.set_ylabel('Density')
    ax.legend()

    plot_file = f"./eda/{feature}_distribution.png"
    fig.savefig(plot_file)
    plots[feature] = plot_file

    # Close each figure right away so figures do not pile up across iterations
    plt.close(fig)
    


100%|██████████| 15/15 [00:06<00:00,  2.39it/s]


In [10]:
# Pairwise relationships between all features, coloured by the Like flag
# (blue for Like == 1, red for Like == 0); the grid is saved to disk and closed.
pairplot_file = "./eda/pairplot_features.png"
sns.pairplot(
    df,
    vars=features,
    hue='Like',
    palette={1: "blue", 0: "red"},
)
plt.savefig(pairplot_file)
plt.close()

In [11]:
# Pairwise correlations between the analysed features
correlation_matrix = df[features].corr()

# Keep only entries whose absolute correlation exceeds 0.5; every other cell
# becomes NaN and is therefore left blank in the heatmap
top_correlations = correlation_matrix.where(correlation_matrix.abs() > 0.5)

# Draw the masked matrix as an annotated heatmap and write it to disk
correlation_plot_file = "./eda/top_features_correlations.png"
plt.figure(figsize=(14, 10))
sns.heatmap(
    top_correlations,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    cbar=True,
)
plt.title("Top Features Correlations")
plt.savefig(correlation_plot_file)
plt.close()

In [12]:
# Plot scatter matrix
scatter_matrix_file = "./eda/scatter_matrix.png"
# scatter_matrix builds its own figure from `figsize` and returns the grid of
# Axes. A separate plt.figure() call beforehand would only create an empty
# figure that is never closed and shows up as "<Figure ... with 0 Axes>".
scatter_axes = scatter_matrix(df[features], alpha=0.2, figsize=(15, 15), diagonal='kde')
scatter_fig = scatter_axes[0, 0].figure
# Title, save and close the figure that actually holds the scatter matrix
scatter_fig.suptitle('Scatter Matrix of All Features')
scatter_fig.savefig(scatter_matrix_file)
plt.close(scatter_fig)

<Figure size 1500x1500 with 0 Axes>

In [13]:
# Function to detect outliers using IQR
def detect_outliers(df, features):
    """Return the rows of ``df`` that are IQR outliers in at least one feature.

    A value is an outlier for a feature when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the
    25th/75th percentiles of that feature and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan; it is not modified.
    features : iterable of str
        Column names of ``df`` to test.

    Returns
    -------
    pandas.DataFrame
        The flagged rows, in their original order and with their original
        index. Each row appears once however many features flag it, and
        distinct rows that happen to hold identical values are all kept.
        An empty ``features`` yields an empty frame with ``df``'s columns.
    """
    # Accumulate one boolean mask instead of concatenating row slices in the
    # loop: a single indexing pass, and de-duplication is by row, not by value.
    flagged = pd.Series(False, index=df.index)
    for feature in features:
        column = df[feature]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        too_low = column < (q1 - 1.5 * iqr)
        too_high = column > (q3 + 1.5 * iqr)
        flagged = flagged | too_low | too_high
    return df[flagged]

# Detect outliers
outliers = detect_outliers(df, features)

# Highlight notable outliers in the scatter matrix.
# scatter_matrix creates its own figure and returns the grid of Axes, so the
# overlay is drawn directly on those Axes. No separate plt.figure() is needed;
# it would only leave an empty figure open.
outlier_axes = scatter_matrix(df[features], alpha=0.2, figsize=(15, 15), diagonal='kde')
outlier_fig = outlier_axes[0, 0].figure

# Overlay outliers
# The panel at (row i, column j) plots features[j] on x and features[i] on y,
# so the overlay must use the same orientation to sit on top of the base points.
for i, row_feature in enumerate(features):
    for j, col_feature in enumerate(features):
        if i != j:  # diagonal panels hold KDE curves, not scatter plots
            outlier_axes[i, j].scatter(
                outliers[col_feature], outliers[row_feature],
                edgecolor='red', facecolors='none',
            )

outlier_fig.suptitle('Scatter Matrix of All Features with Outliers Highlighted')
highlighted_scatter_matrix_file = "./eda/scatter_matrix_with_outliers.png"
outlier_fig.savefig(highlighted_scatter_matrix_file)
plt.close(outlier_fig)

<Figure size 1500x1500 with 0 Axes>