In [None]:
import pandas as pd
# Read the dataset from the CSV file in the working directory, then
# preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='./dataset.csv')
df.head(n=5)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

In [None]:
# Count of missing values per column (isna is the same check as isnull).
print(df.isna().sum())

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Popularity values with missing entries removed. gaussian_kde cannot fit
# a density to NaN input, and the builtin min()/max() give order-dependent
# results when NaN is present, so clean the series once and reuse it.
popularity = df['popularity'].dropna()

# Two stacked panels: histogram on top, kernel density estimate below
fig, ax = plt.subplots(2, 1, figsize=(15, 10))

# Plot histogram for popularity
ax[0].hist(popularity, bins=30, edgecolor='black', alpha=0.7)
ax[0].set_title('Histogram of Popularity')
ax[0].set_xlabel('Popularity')
ax[0].set_ylabel('Frequency')

# Plot density plot for popularity using Gaussian Kernel Density Estimate
from scipy.stats import gaussian_kde
density = gaussian_kde(popularity)
# Evaluate the estimate on 1000 evenly spaced points spanning the observed
# range; Series.min()/.max() are vectorized, unlike the Python builtins.
x_vals = np.linspace(popularity.min(), popularity.max(), 1000)
y_vals = density(x_vals)
ax[1].plot(x_vals, y_vals, color='blue')
ax[1].set_title('Density Plot of Popularity')
ax[1].set_xlabel('Popularity')
ax[1].set_ylabel('Density')

# Show the plots
plt.tight_layout()
plt.show()

In [None]:
# Horizontal box plot of popularity, drawn on an explicitly created Axes
box_fig, box_ax = plt.subplots(figsize=(10, 5))
box_ax.boxplot(df['popularity'], vert=False)
box_ax.set_title('Box Plot of Popularity')
box_ax.set_xlabel('Popularity')
plt.show()

In [None]:
# Pairwise correlations, restricted to the float64 / int64 columns
numerical_df = df.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numerical_df.corr()

# Take the 'popularity' column of the matrix and order it from the highest
# to the lowest coefficient; the bare name on the last line displays it.
correlation_with_popularity = (
    correlation_matrix
    .loc[:, 'popularity']
    .sort_values(ascending=False)
)
correlation_with_popularity

In [None]:
import seaborn as sns

# Split the tracks on the 'explicit' flag so the popularity distribution
# of each group can be drawn in its own panel.
explicit_songs = df[df['explicit'] == True]
non_explicit_songs = df[df['explicit'] == False]

# Two stacked panels: explicit on top, non-explicit below
fig, ax = plt.subplots(2, 1, figsize=(10, 8))

# histplot is called without a `stat` argument, so the bars show raw counts
# (seaborn's default); the y-axis is labelled 'Count' to match what is drawn.
sns.histplot(explicit_songs['popularity'], ax=ax[0], kde=True, color='blue', label='Explicit')
ax[0].set_title('Distribution of Popularity for Explicit Songs')
ax[0].set_xlabel('Popularity')
ax[0].set_ylabel('Count')

# Same plot for the non-explicit group
sns.histplot(non_explicit_songs['popularity'], ax=ax[1], kde=True, color='green', label='Non-Explicit')
ax[1].set_title('Distribution of Popularity for Non-Explicit Songs')
ax[1].set_xlabel('Popularity')
ax[1].set_ylabel('Count')

# Add legend
ax[0].legend()
ax[1].legend()

# Adjust layout
plt.tight_layout()

# Show the plots
plt.show()

In [None]:
import matplotlib.pyplot as plt


# Mean danceability and energy for each genre (one row per track_genre)
average_values = df.groupby('track_genre')[['danceability', 'energy']].mean()

# DataFrame.plot opens a figure of its own unless it is handed an Axes, so
# a bare plt.figure(figsize=...) beforehand only leaves an empty figure and
# the requested size is never applied. Create the Axes explicitly and pass
# it in so the plot lands on the 10x6 figure.
genre_fig, genre_ax = plt.subplots(figsize=(10, 6))
average_values.plot(kind='line', ax=genre_ax)
genre_ax.set_title('Average Danceability and Energy by Track Genre')
genre_ax.set_xlabel('Track Genre')
genre_ax.set_ylabel('Average Value')
plt.tight_layout()
plt.show()

In [None]:
# Report the size of the dataset: rows are samples, columns are features
num_samples, num_features = df.shape
print(f"The dataset contains {num_samples} samples and {num_features} features.")

# Each remaining report is a heading followed by a pandas summary:
# column dtypes, per-column missing counts, genre frequencies, and the
# descriptive statistics of the popularity column.
for heading, summary in (
    ("\nData types of the features:", df.dtypes),
    ("\nNumber of missing values in each feature:", df.isna().sum()),
    ("\nDistribution of genres:", df['track_genre'].value_counts()),
    ("\nSummary statistics of popularity scores:", df['popularity'].describe()),
):
    print(heading)
    print(summary)

In [None]:
# Use seaborn's white-grid look for the figures that follow
sns.set(style="whitegrid")

# Columns to draw as histograms (also reused for the correlation heatmap)
numerical_features = [
    'popularity',
    'duration_ms',
    'danceability',
    'energy',
    'key',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

# One 30-bin histogram per listed column, laid out on a single large figure
df.hist(column=numerical_features, bins=30, figsize=(20, 15))
plt.tight_layout()
plt.show()

# Bar plots of value counts for the three categorical-like columns; each
# gets its own 6x6 figure and title.
for column, title in (
    ('explicit', 'Distribution of Explicit Lyrics'),
    ('mode', 'Distribution of Modes'),
    ('time_signature', 'Distribution of Time Signatures'),
):
    plt.figure(figsize=(6, 6))
    sns.countplot(x=column, data=df)
    plt.title(title)
    plt.show()

# Heatmap of the pairwise correlations between the numerical features,
# with each cell annotated to two decimal places.
feature_corr = df[numerical_features].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(feature_corr, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()