# Spotify Dataset Exploration

This notebook performs initial data inspection on the Spotify datasets, including loading, examining structure, checking for missing values, and identifying duplicates.

## Import Required Libraries

Import necessary libraries for data analysis and exploration.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path

# Display settings: let pandas show every column, cap printed rows at 100,
# and use seaborn's "whitegrid" style for plots.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)
sns.set_style("whitegrid")

  from scipy.stats import gaussian_kde


## Load Dataset Using Pandas

Load the CSV datasets from the archive folder.

In [3]:
# Folder holding the raw CSV files, relative to the notebook's working directory.
archive_path = Path('../archive')

# Read the tracks table and report its shape before previewing it.
tracks_csv = archive_path / 'tracks.csv'
tracks_df = pd.read_csv(tracks_csv)
print(
    "Tracks dataset loaded successfully!",
    f"Shape: {tracks_df.shape}\n",
    "First few rows of tracks dataset:",
    sep="\n",
)
# Last expression of the cell, so the notebook renders the preview as a table.
tracks_df.head()

Tracks dataset loaded successfully!
Shape: (586672, 20)

First few rows of tracks dataset:


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [None]:
# Load artists dataset (uses `archive_path` from the tracks-loading cell).
artists_df = pd.read_csv(archive_path / 'artists.csv')
print("Artists dataset loaded successfully!")
print(f"Shape: {artists_df.shape}\n")
print("First few rows of artists dataset:")
# Leave the preview as the cell's last expression instead of print()-ing it,
# so the notebook renders it as a table, the same way the tracks preview
# is shown.
artists_df.head()

## Inspect Columns and Data Types

Examine the structure of the datasets, including column names and their data types.

In [None]:
# Schema overview of the tracks dataset: per-column dtypes, the
# DataFrame.info() report, and the list of column names.
print("=" * 80)
print("TRACKS DATASET STRUCTURE")
print("=" * 80)
print("\nData Types:")
print(tracks_df.dtypes)
print("\nDataset Info:")
# info() writes its report itself, so it is called bare rather than wrapped
# in print(). (A stray trailing character here used to make the whole cell
# a SyntaxError.)
tracks_df.info()
print("\nColumn Summary:")
print(f"Total Columns: {len(tracks_df.columns)}")
print(f"\nColumns: {list(tracks_df.columns)}")

In [None]:
# Schema overview of the artists dataset: per-column dtypes, the
# DataFrame.info() report, and the list of column names.
rule = "=" * 80
print("\n" + rule, "ARTISTS DATASET STRUCTURE", rule, sep="\n")

print("\nData Types:")
print(artists_df.dtypes)

print("\nDataset Info:")
artists_df.info()

artist_columns = list(artists_df.columns)
print("\nColumn Summary:")
print(f"Total Columns: {len(artist_columns)}")
print(f"\nColumns: {artist_columns}")

## Check Dataset Size

Display the number of rows, columns, and total cells in each dataset.

In [None]:
# Report row, column and cell counts for each dataset under a shared banner.
rule = "=" * 80
print(rule, "DATASET DIMENSIONS", rule, sep="\n")

for heading, frame in (
    ("\nTracks Dataset:", tracks_df),
    ("\nArtists Dataset:", artists_df),
):
    n_rows, n_cols = frame.shape
    print(heading)
    print(f"  - Rows: {n_rows:,}")
    print(f"  - Columns: {n_cols}")
    print(f"  - Total cells: {n_rows * n_cols:,}")

print("\n" + rule)

## Identify Missing Values

Check for and visualize missing values in both datasets.

In [None]:
# Summarise and plot missing values in the tracks dataset.
print("=" * 80)
print("MISSING VALUES - TRACKS DATASET")
print("=" * 80)

# Null count per column, and that count as a percentage of all rows.
missing_tracks = tracks_df.isnull().sum()
missing_tracks_pct = (missing_tracks / len(tracks_df)) * 100

missing_tracks_df = pd.DataFrame({
    'Column': missing_tracks.index,
    'Missing Count': missing_tracks.values,
    'Percentage': missing_tracks_pct.values
})
# Keep only the columns that actually have nulls, worst offenders first.
missing_tracks_df = missing_tracks_df[missing_tracks_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)

if len(missing_tracks_df) == 0:
    print("\nNo missing values found in tracks dataset!")
else:
    print("\nMissing Values Summary:")
    print(missing_tracks_df.to_string(index=False))

# Visualize missing values. The counts computed above are reused instead of
# scanning the frame a second time.
missing_counts = missing_tracks[missing_tracks > 0].sort_values(ascending=False)
if len(missing_counts) > 0:
    # Create the figure only when there is something to draw; creating it
    # up front leaves a blank chart in the output when there are no nulls.
    fig, ax = plt.subplots(figsize=(10, 6))
    missing_counts.plot(kind='barh', ax=ax, color='coral')
    ax.set_xlabel('Number of Missing Values')
    ax.set_title('Missing Values in Tracks Dataset')
    plt.tight_layout()
    plt.show()
else:
    print("\nNo missing values to visualize.")

In [None]:
# Summarise and plot missing values in the artists dataset.
print("\n" + "=" * 80)
print("MISSING VALUES - ARTISTS DATASET")
print("=" * 80)

# Null count per column, and that count as a percentage of all rows.
missing_artists = artists_df.isnull().sum()
missing_artists_pct = (missing_artists / len(artists_df)) * 100

missing_artists_df = pd.DataFrame({
    'Column': missing_artists.index,
    'Missing Count': missing_artists.values,
    'Percentage': missing_artists_pct.values
})
# Keep only the columns that actually have nulls, worst offenders first.
missing_artists_df = missing_artists_df[missing_artists_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)

if len(missing_artists_df) == 0:
    print("\nNo missing values found in artists dataset!")
else:
    print("\nMissing Values Summary:")
    print(missing_artists_df.to_string(index=False))

# Visualize missing values. The counts computed above are reused instead of
# scanning the frame a second time.
missing_counts = missing_artists[missing_artists > 0].sort_values(ascending=False)
if len(missing_counts) > 0:
    # Create the figure only when there is something to draw; creating it
    # up front leaves a blank chart in the output when there are no nulls.
    fig, ax = plt.subplots(figsize=(10, 6))
    missing_counts.plot(kind='barh', ax=ax, color='lightblue')
    ax.set_xlabel('Number of Missing Values')
    ax.set_title('Missing Values in Artists Dataset')
    plt.tight_layout()
    plt.show()
else:
    print("\nNo missing values to visualize.")

## Identify Duplicate Rows

Check for duplicate rows in both datasets.

In [None]:
# Duplicate check for the tracks dataset: fully repeated rows, then repeated
# values in the leading column.
rule = "=" * 80
print(rule, "DUPLICATE ROWS - TRACKS DATASET", rule, sep="\n")

# Rows that repeat an earlier row in every column.
duplicate_rows_tracks = tracks_df.duplicated().sum()
print(f"\nTotal completely duplicate rows: {duplicate_rows_tracks}")

# Repeated values in the first column (usually the ID).
if tracks_df.shape[1] > 0:
    first_col = tracks_df.columns[0]
    duplicate_ids = tracks_df[first_col].duplicated().sum()
    print(f"Duplicate values in '{first_col}': {duplicate_ids}")

if duplicate_rows_tracks == 0:
    print("\nNo complete duplicate rows found.")
else:
    print("\nSample of duplicate rows:")
    # keep=False flags every member of a duplicate group, so the copies sit
    # next to each other once sorted on the first two columns.
    duplicate_mask = tracks_df.duplicated(keep=False)
    sort_keys = list(tracks_df.columns[:2])
    duplicate_sample = tracks_df[duplicate_mask].sort_values(by=sort_keys).head(10)
    print(duplicate_sample)

In [None]:
# Duplicate check for the artists dataset: fully repeated rows, then repeated
# values in the leading column.
rule = "=" * 80
print("\n" + rule, "DUPLICATE ROWS - ARTISTS DATASET", rule, sep="\n")

# Rows that repeat an earlier row in every column.
duplicate_rows_artists = artists_df.duplicated().sum()
print(f"\nTotal completely duplicate rows: {duplicate_rows_artists}")

# Repeated values in the first column (usually the ID).
if artists_df.shape[1] > 0:
    first_col = artists_df.columns[0]
    duplicate_ids = artists_df[first_col].duplicated().sum()
    print(f"Duplicate values in '{first_col}': {duplicate_ids}")

if duplicate_rows_artists == 0:
    print("\nNo complete duplicate rows found.")
else:
    print("\nSample of duplicate rows:")
    # keep=False flags every member of a duplicate group, so the copies sit
    # next to each other once sorted on the first two columns.
    duplicate_mask = artists_df.duplicated(keep=False)
    sort_keys = list(artists_df.columns[:2])
    duplicate_sample = artists_df[duplicate_mask].sort_values(by=sort_keys).head(10)
    print(duplicate_sample)

print("\n" + rule)

## Summary

The dataset exploration is complete. Both datasets have been loaded and thoroughly analyzed for:
- **Dataset Structure**: Column names and data types
- **Dataset Size**: Number of rows and columns
- **Missing Values**: Count and percentage of missing data per column
- **Duplicate Rows**: Identification of fully duplicated rows and of repeated values in the first (ID) column

This exploration provides a foundation for data cleaning and preprocessing steps in the next phases of the project.