# Data Mining Project - Spotify Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

## Data understanding and preparation

### Dataset Import and simple data visualization

In [None]:
# Load the training and test splits, in that order, with identical parser options.
# skipinitialspace=True makes the parser skip spaces that follow a delimiter.
df_train, df_test = (
    pd.read_csv(csv_path, skipinitialspace=True)
    for csv_path in ('train.csv', 'test.csv')
)

In [None]:
# Show the column labels of the test set
df_test.columns

In [None]:
# Display the training DataFrame
df_train

In [None]:
# (number of rows, number of columns) of the training set
df_train.shape

In [None]:
# dtype that pandas assigned to each training column
df_train.dtypes

In [None]:
# Print a concise summary of the training set (columns, non-null counts, dtypes)
df_train.info()

In [None]:
# Descriptive statistics of the training columns
df_train.describe()

In [None]:
# Mean of each remaining feature per genre, transposed so that every genre is a column.
# The columns listed here are excluded from the averages.
excluded_columns = ['name', 'explicit', 'artists', 'mode', 'popularity_confidence',
                    'album_name', 'features_duration_ms']
df_mean = df_train.drop(columns=excluded_columns)

# numeric_only=True keeps the cell working on pandas >= 2.0, where averaging a
# leftover non-numeric column raises TypeError instead of being skipped.
df_mean.groupby('genre').mean(numeric_only=True).T

### Checking for duplicated records

In [None]:
# Number of fully duplicated rows in each split (training count first, then test)
n_dup_rows_train = df_train.duplicated().sum()
n_dup_rows_test = df_test.duplicated().sum()
print(n_dup_rows_train, n_dup_rows_test)


In [None]:
# Number of rows whose 'name' repeats an earlier row (training count first, then test).
# Only the name is compared, so different rows that share a name are counted too.
n_dup_names_train = df_train['name'].duplicated().sum()
n_dup_names_test = df_test['name'].duplicated().sum()
print(n_dup_names_train, n_dup_names_test)

In [None]:
# Compare duration_ms with features_duration_ms: count the rows where the two
# columns disagree, and how many of those are off by something other than exactly 1.

mismatch_mask = df_train['duration_ms'] != df_train['features_duration_ms']
values = df_train[mismatch_mask]
columns_to_print = ['duration_ms', 'features_duration_ms']

# Signed difference for the mismatching rows only
valu = values['features_duration_ms'] - values['duration_ms']

print("Number of different records: ", valu.size)

# Vectorised replacement for the element-by-element loop: a row is counted when
# its absolute difference is not 1 (a NaN difference is counted as well).
differences_count = int((valu.abs() != 1).sum())

print("Number of different records with a difference greater than 1: ", differences_count)

### Checking for NaN values

In [None]:
# Report each training column that holds missing values, together with its NaN count.
nan_counts_train = df_train.isnull().sum()
for col_name, n_missing in nan_counts_train[nan_counts_train > 0].items():
    print("Column: " + col_name + " hasNaN: " + str(n_missing))

In [None]:
# Report each test column that holds missing values, together with its NaN count.
nan_counts_test = df_test.isnull().sum()
for col_name, n_missing in nan_counts_test[nan_counts_test > 0].items():
    print("Column: " + col_name + " hasNaN: " + str(n_missing))

In [None]:
# Number of missing values in the test set's popularity_confidence column
df_test['popularity_confidence'].isnull().sum()

### Data distribution


In [None]:
# Bar chart of the mean popularity of each genre, sorted in ascending order.
mean_popularity_by_genre = df_train.groupby('genre')['popularity'].mean()
mean_popularity_by_genre.sort_values().plot(kind='bar')
plt.ylabel('popularity')
plt.show()

In [None]:
# Bar chart of how many training rows belong to each genre.
genre_counts = df_train['genre'].value_counts()
genre_counts.plot(kind='bar')
plt.show()

In [None]:
# Plot the distribution of the features on a 4x4 grid of subplots.
# Each panel is described as (grid cell, column name, title) and drawn in list order.

fig = plt.figure(figsize=(10, 10))
fig_dims = (4, 4)

# Panels drawn as bar charts of value counts
bar_panels = [
    ((0, 0), 'explicit', 'Explicit and not explicit songs'),
    ((0, 1), 'mode', 'Major and minor songs'),
]

# Panels drawn as histograms
hist_panels = [
    ((1, 0), 'acousticness', 'Acousticness'),
    ((1, 1), 'danceability', 'Danceability'),
    ((2, 0), 'energy', 'Energy'),
    ((2, 1), 'instrumentalness', 'Instrumentalness'),
    ((0, 2), 'liveness', 'Liveness'),
    ((1, 2), 'loudness', 'Loudness'),
    ((2, 2), 'speechiness', 'Speechiness'),
    ((0, 3), 'tempo', 'Tempo'),
    ((1, 3), 'valence', 'Valence'),
    ((2, 3), 'popularity', 'Popularity'),
    ((3, 0), 'key', 'Key'),
    ((3, 1), 'time_signature', 'Time signature'),
    ((3, 2), 'popularity_confidence', 'Popularity confidence'),
    ((3, 3), 'duration_ms', 'Duration_ms histogram'),
]

for grid_cell, column_name, panel_title in bar_panels:
    plt.subplot2grid(fig_dims, grid_cell)
    df_train[column_name].value_counts().plot(kind='bar', title=panel_title)

for grid_cell, column_name, panel_title in hist_panels:
    plt.subplot2grid(fig_dims, grid_cell)
    df_train[column_name].hist()
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Plot the distribution of the n_beats feature using hand-picked, unevenly spaced bins.
# Series.max() skips NaN, unlike the builtin max() applied to a Series.
n_beats_max = df_train['n_beats'].max()

# Keep only the fixed edges below the observed maximum, then close the last bin at
# the maximum itself, so the edges stay increasing even if the data tops out below 1300.
n_beats_bins = [edge for edge in (0, 200, 400, 600, 800, 1000, 1300) if edge < n_beats_max]
n_beats_bins.append(n_beats_max)

plt.hist(df_train['n_beats'], bins=n_beats_bins, edgecolor='black')
plt.title('Number of beats')
plt.show()


In [None]:
# Plot the distribution of the n_bars feature using hand-picked, unevenly spaced bins.
# Series.max() skips NaN, unlike the builtin max() applied to a Series.
n_bars_max = df_train['n_bars'].max()

# Keep only the fixed edges below the observed maximum, then close the last bin at
# the maximum itself, so the edges stay increasing even if the data tops out below 400.
n_bars_bins = [edge for edge in (0, 50, 100, 150, 200, 250, 300, 400) if edge < n_bars_max]
n_bars_bins.append(n_bars_max)

plt.hist(df_train['n_bars'], bins=n_bars_bins, edgecolor='black')
plt.title('Number of bars')
plt.show()

In [None]:
# For every float64/int64 column, draw the normal density that has the column's
# own mean and standard deviation, over the column's observed [min, max] range.
# Note: this plots the fitted curve only, not a histogram of the data.
from scipy.stats import norm

numeric_attributes = [
    name for name in df_train.columns
    if df_train[name].dtype in (np.float64, np.int64)
]

for attribute in numeric_attributes:
    series = df_train[attribute]
    mu, sigma = series.mean(), series.std()

    # 1000 evenly spaced evaluation points across the observed range
    x = np.linspace(series.min(), series.max(), 1000)
    pdf = norm.pdf(x, loc=mu, scale=sigma)

    plt.figure(figsize=(8, 6))
    plt.plot(x, pdf, 'r', label='PDF')
    plt.xlabel(attribute)
    plt.ylabel('PDF Value')
    plt.title('Normal Distribution of ' + attribute)
    plt.legend()
    plt.grid()
    plt.show()

### Genre splitting and analysis

In [None]:
# Number of distinct values in the genre column
len(list(df_train['genre'].unique()))

In [None]:
# One DataFrame per genre, in the order the genres first appear in the training set
genresData = [
    df_train[df_train['genre'] == genre_name]
    for genre_name in df_train['genre'].unique()
]

# Descriptive statistics of the first genre's subset
genresData[0].describe()

In [None]:
from itertools import combinations

import pandas as pd
from scipy.stats import pearsonr

# Find, separately for each genre, the feature pairs whose Pearson correlation
# is stronger than the threshold, and collect them in one summary DataFrame.

# Minimum absolute Pearson coefficient for a pair to be reported
CORRELATION_THRESHOLD = 0.7

# Parallel lists: one entry per reported (genre, feature pair)
genre_list = []
feature_1_list = []
feature_2_list = []
correlation_list = []

for genre_df in genresData:
    # Drop every column that contains a NaN for this genre, then the excluded columns
    genre_df = genre_df.dropna(axis=1)
    genre_df = genre_df.drop(columns=['name', 'explicit', 'artists', 'album_name', 'features_duration_ms'])
    genre = genre_df.iloc[0]['genre']

    feature_columns = [column for column in genre_df.columns if column != 'genre']

    # combinations() yields every unordered pair exactly once per genre. The earlier
    # membership test against the result lists (which are shared by all genres)
    # could skip a pair in both orientations once both features had already been
    # recorded for previous pairs, silently losing it.
    for feature_1, feature_2 in combinations(feature_columns, 2):
        corr, _ = pearsonr(genre_df[feature_1], genre_df[feature_2])
        if abs(corr) > CORRELATION_THRESHOLD:
            genre_list.append(genre)
            feature_1_list.append(feature_1)
            feature_2_list.append(feature_2)
            correlation_list.append(corr)

# Create a DataFrame from the lists
correlation_df = pd.DataFrame({
    'genre': genre_list,
    'feature_1': feature_1_list,
    'feature_2': feature_2_list,
    'correlation': correlation_list
})

correlation_df


In [None]:
# Every feature that appears in at least one reported pair. The set union removes
# names that occur in both lists, and sorting makes the order reproducible.
features_list = sorted(set(feature_1_list) | set(feature_2_list))
features_list

In [None]:

# Scatter matrix of the features collected in features_list
df_scatter_matrix = df_train.drop(columns=['name', 'artists', 'genre', 'explicit', 'album_name'])

# sorted(set(...)) removes repeated names and fixes the panel order; iterating a
# bare set of strings can give a different order from one interpreter run to the next.
scatter_columns = sorted(set(features_list))

if scatter_columns:
    pd.plotting.scatter_matrix(df_scatter_matrix[scatter_columns], figsize=(16, 8))
    plt.show()
else:
    # Nothing was collected, so there is nothing to draw
    print("No correlated features to plot")



### Data correlation

In [None]:
# Scatter plot of 'duration_ms' against 'features_duration_ms'.
# The y-axis label and the title now spell the column name as it is in the DataFrame.
plt.scatter(df_train['duration_ms'], df_train['features_duration_ms'])
plt.xlabel('duration_ms')
plt.ylabel('features_duration_ms')
plt.title('Scatter plot of duration_ms and features_duration_ms')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the remaining columns
# (DataFrame.corr() computes Pearson coefficients by default)
import seaborn as sns

heatmap_excluded = ['name', 'explicit', 'artists', 'mode', 'popularity_confidence',
                    'album_name', 'genre']
df_mean = df_train.drop(columns=heatmap_excluded)
corr = df_mean.corr()

# Enlarged figure so the per-cell annotations stay readable
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True)
plt.show()