In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

## Dataset Import and simple statistics visualization

In [None]:
# Both splits are read from the same folder with identical parsing options
# (skipinitialspace strips the blanks that follow each delimiter).
DATASET_DIR = 'project dataset (missing + split)'

df_train = pd.read_csv(f'{DATASET_DIR}/train.csv', skipinitialspace=True)
df_test = pd.read_csv(f'{DATASET_DIR}/test.csv', skipinitialspace=True)

In [None]:
# Column labels of the test split
df_test.columns

In [None]:
# Display the training frame as loaded
df_train

In [None]:
# (rows, columns) of the training split
df_train.shape

In [None]:
# Data type pandas inferred for each training column
df_train.dtypes

In [None]:
# Per-column non-null counts and dtypes of the training split
df_train.info()

In [None]:
# Summary statistics of the training split
df_train.describe()

In [None]:
# Columns excluded from the per-genre averages
non_averaged_columns = [
    'name', 'explicit', 'artists', 'mode',
    'popularity_confidence', 'album_name', 'features_duration_ms',
]
df_mean = df_train.drop(columns=non_averaged_columns)

# Mean of every remaining column per genre, transposed so that
# features are rows and genres are columns
genre_means = df_mean.groupby(by=['genre']).mean()
genre_means.T

## Checking for duplicates

In [None]:
# Count rows that are exact copies of an earlier row, per split
# (printed as: train count, then test count)
duplicated_rows_train = df_train.duplicated().sum()
duplicated_rows_test = df_test.duplicated().sum()
print(duplicated_rows_train, duplicated_rows_test)


In [None]:
# Count rows whose song name already appeared earlier in the same split
# (printed as: train count, then test count)
repeated_names_train = df_train['name'].duplicated().sum()
repeated_names_test = df_test['name'].duplicated().sum()
print(repeated_names_train, repeated_names_test)

In [None]:
# Compare duration_ms with features_duration_ms: how many rows disagree,
# and in how many of those the gap is larger than 1 ms.

# Rows where the two duration columns are not equal (rows with a missing
# value in either column also land here, because NaN never compares equal).
values = df_train[df_train['duration_ms'] != df_train['features_duration_ms']]
columns_to_print = ['duration_ms', 'features_duration_ms']

# Signed gap between the two columns for the disagreeing rows
valu = values['features_duration_ms'] - values['duration_ms']

print("Number of different records: ", valu.size)

# Count gaps whose magnitude really exceeds 1. The previous check
# (`!= 1 and != -1`) also counted NaN gaps and any other value that merely
# was not exactly +/-1; a NaN gap is not counted here.
differences_count = int((valu.abs() > 1).sum())

print("Number of different records with a difference greater than 1: ", differences_count)

# Show the disagreeing rows; as the last expression of the cell this is
# actually rendered (mid-cell it was silently discarded).
values[columns_to_print]

## Dropping useless features

In [None]:
# Show both splits without features_duration_ms (only duration_ms is meant to be kept).
# NOTE(review): DataFrame.drop returns a new frame and neither result is assigned,
# so df_train and df_test still contain features_duration_ms after this cell; only
# the test-split result is rendered as the cell output. Later cells drop this column
# by name again, so assigning the result back here would require updating them too.
df_train.drop(columns='features_duration_ms')
df_test.drop(columns='features_duration_ms')

## Data Distribution

In [None]:
# Bar chart of the mean popularity of each genre, sorted from lowest to highest
mean_popularity_by_genre = df_train.groupby('genre')['popularity'].mean().sort_values()
mean_popularity_by_genre.plot.bar()
plt.ylabel('popularity')
plt.show()

In [None]:
# Bar chart of how many training rows belong to each genre
rows_per_genre = df_train['genre'].value_counts()
rows_per_genre.plot.bar()
plt.show()

In [None]:
# Distribution of song durations, expressed in minutes for readability.
# The converted values are stored as a new df_train column (60000 ms = 1 min).
df_train['duration_min'] = df_train['duration_ms'] / 60000

plt.hist(df_train['duration_min'], bins=50)
# Axis labels describe what is plotted (they were left-over iris labels before)
plt.xlabel('duration (min)')
plt.ylabel('number of songs')
plt.show()



## Checking for NaN values

In [None]:
# Report every training column that has missing values, with its NaN count
train_nan_counts = df_train.isnull().sum()
for column_name, nan_count in train_nan_counts[train_nan_counts != 0].items():
    print("Column: " + column_name + " hasNaN: " + str(nan_count))

In [None]:
# Report every test column that has missing values, with its NaN count
test_nan_counts = df_test.isnull().sum()
for column_name, nan_count in test_nan_counts[test_nan_counts != 0].items():
    print("Column: " + column_name + " hasNaN: " + str(nan_count))

In [None]:
# Number of missing popularity_confidence values in the test split
df_test['popularity_confidence'].isnull().sum()

## Genre splitting and analysis

In [None]:
# Number of distinct genre values in the training split
# (dropna=False so that a missing genre, if any, is counted as its own value)
df_train['genre'].nunique(dropna=False)

In [None]:
# One sub-frame per distinct genre value, in order of first appearance
genresData = [
    df_train[df_train['genre'] == genre_name]
    for genre_name in df_train['genre'].unique()
]

# Summary statistics of the first genre's sub-frame
first_genre_df = genresData[0]
first_genre_df.describe()

In [None]:
# For every genre, find the pairs of numeric features whose Pearson correlation
# is strong, and collect them in `correlation_df`.
# (pandas and pearsonr come from the import cell at the top of the notebook.)

# Minimum absolute correlation for a feature pair to be reported
CORRELATION_THRESHOLD = 0.7

# Columns left out of the analysis. 'duration_min' is listed because, when it is
# present, it is just duration_ms / 60000 and so always correlates perfectly
# with duration_ms without telling us anything.
EXCLUDED_COLUMNS = ['name', 'explicit', 'artists', 'album_name',
                    'features_duration_ms', 'duration_min']

# Parallel lists: one entry per reported (genre, feature pair)
genre_list = []
feature_1_list = []
feature_2_list = []
correlation_list = []

for genre_df in genresData:
    if genre_df.empty:
        # Nothing to correlate, and no first row to read the genre from
        continue
    genre = genre_df['genre'].iloc[0]

    # Columns containing NaN are dropped for this genre (pearsonr cannot handle
    # missing values). errors='ignore' because dropna may already have removed
    # one of the excluded columns. Only numeric columns can be correlated.
    numeric_df = (
        genre_df
        .dropna(axis=1)
        .drop(columns=EXCLUDED_COLUMNS, errors='ignore')
        .select_dtypes(include='number')
    )
    if len(numeric_df) < 2:
        # pearsonr needs at least two observations
        continue

    features = [column for column in numeric_df.columns if column != 'genre']

    # Visit every unordered pair exactly once. The earlier membership test on
    # the result lists was shared across genres and could skip genuine pairs.
    for position, feature_1 in enumerate(features):
        for feature_2 in features[position + 1:]:
            corr, _ = pearsonr(numeric_df[feature_1], numeric_df[feature_2])
            # A constant column yields NaN, which never passes the threshold
            if abs(corr) > CORRELATION_THRESHOLD:
                genre_list.append(genre)
                feature_1_list.append(feature_1)
                feature_2_list.append(feature_2)
                correlation_list.append(corr)

# One row per strongly correlated feature pair and genre
correlation_df = pd.DataFrame({
    'genre': genre_list,
    'feature_1': feature_1_list,
    'feature_2': feature_2_list,
    'correlation': correlation_list
})

correlation_df


In [None]:
# Every feature that takes part in at least one strongly correlated pair.
# A single set union avoids listing a feature twice when it occurs on both
# sides, and sorting gives the same order on every run (set order is not stable).
features_list = sorted(set(feature_1_list) | set(feature_2_list))
features_list

In [None]:

# Scatter-plot matrix of the features that showed a strong correlation
non_plotted_columns = ['name', 'artists', 'genre', 'explicit', 'album_name']
df_scatter_matrix = df_train.drop(columns=non_plotted_columns)

# De-duplicate the feature names before selecting the columns to plot
selected_features = list(set(features_list))
pd.plotting.scatter_matrix(df_scatter_matrix.loc[:, selected_features], figsize=(16, 8))
plt.show()



In [None]:
import seaborn as sns

In [None]:
# Pair plot of the correlation table, coloured by genre.
# NOTE(review): correlation_df is built with a single numeric column ('correlation'),
# so this shows only that one distribution - confirm this is the intended input.
sns.pairplot(data=correlation_df, hue='genre')

In [None]:
# Display the training frame again (it now also holds the duration_min column added earlier)
df_train