In [1]:
from google.colab import files

# Open the Colab upload dialog; the result maps each uploaded file name to its contents.
uploaded = files.upload()
# Iterating the mapping yields its keys, i.e. the uploaded file names.
print("✅ Uploaded files:", list(uploaded))


Saving Spotify-2000.csv to Spotify-2000.csv
✅ Uploaded files: ['Spotify-2000.csv']


In [2]:
from pathlib import Path

# Locate the dataset CSV in the current working directory (where files.upload() saves it).
candidates = sorted(Path(".").glob("*.csv"))
print("🔎 CSV candidates found:", [c.name for c in candidates])

# Helper CSVs that this notebook writes itself in a later cell. On a re-run they
# sit next to the uploaded dataset and may sort ahead of it in `candidates`, so
# they must never be chosen by the fallback below.
GENERATED_CSVS = {"genre_data.csv", "year_data.csv", "artist_data.csv"}

# Prefer a well-known file name; assign MAIN_PATH manually if your file is named differently.
preferred_names = ["Spotify-2000.csv", "spotify-2000.csv", "Spotify2000.csv", "spotify.csv", "data.csv"]
MAIN_PATH = next((Path(name) for name in preferred_names if Path(name).exists()), None)

# Fallback: the first CSV found that is not one of the notebook's own helper outputs.
if MAIN_PATH is None:
    MAIN_PATH = next((c for c in candidates if c.name.lower() not in GENERATED_CSVS), None)

print("📄 Using CSV:", MAIN_PATH)
# Stop here with a readable message rather than failing later inside read_csv.
assert MAIN_PATH is not None and MAIN_PATH.exists(), "Couldn't find your CSV. Re-run Cell 1 or set MAIN_PATH manually."


🔎 CSV candidates found: ['Spotify-2000.csv']
📄 Using CSV: Spotify-2000.csv


In [3]:
import pandas as pd

# Load the CSV chosen in the previous cell into the main DataFrame.
data = pd.read_csv(MAIN_PATH, low_memory=False)
print("✅ Loaded shape:", data.shape)
# head() returns the first five rows by default.
display(data.head())


✅ Loaded shape: (1994, 15)


Unnamed: 0,Index,Title,Artist,Top Genre,Year,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity
0,1,Sunrise,Norah Jones,adult standards,2004,157,30,53,-14,11,68,201,94,3,71
1,2,Black Night,Deep Purple,album rock,2000,135,79,50,-11,17,81,207,17,7,39
2,3,Clint Eastwood,Gorillaz,alternative hip hop,2001,168,69,66,-9,7,52,341,2,17,69
3,4,The Pretender,Foo Fighters,alternative metal,2007,173,96,43,-4,3,37,269,0,4,76
4,5,Waitin' On A Sunny Day,Bruce Springsteen,classic rock,2002,106,82,58,-5,10,87,256,1,3,59


In [4]:
# Show every column label, then the dtype of (at most) the first 30 columns.
print("🧭 Columns:", [*data.columns])
print("\n📊 dtypes:")
print(data.dtypes.head(30))


🧭 Columns: ['Index', 'Title', 'Artist', 'Top Genre', 'Year', 'Beats Per Minute (BPM)', 'Energy', 'Danceability', 'Loudness (dB)', 'Liveness', 'Valence', 'Length (Duration)', 'Acousticness', 'Speechiness', 'Popularity']

📊 dtypes:
Index                      int64
Title                     object
Artist                    object
Top Genre                 object
Year                       int64
Beats Per Minute (BPM)     int64
Energy                     int64
Danceability               int64
Loudness (dB)              int64
Liveness                   int64
Valence                    int64
Length (Duration)         object
Acousticness               int64
Speechiness                int64
Popularity                 int64
dtype: object


In [5]:
from pathlib import Path

# Heuristic column picks (case-insensitive matching)
def pick(cols, *cands):
    """Return the first column label in ``cols`` that matches one of ``cands``.

    Candidates are tried in the order given. For each candidate an exact match
    is preferred; otherwise the comparison ignores case and surrounding
    whitespace. Labels are compared through ``str()``, so non-string labels
    (e.g. integer headers) do not raise.

    Args:
        cols: Iterable of column labels to search.
        *cands: Candidate names, in priority order.

    Returns:
        The matching label exactly as it appears in ``cols``, or ``None`` when
        no candidate matches.
    """
    labels = list(cols)
    exact = set(labels)
    # Normalised label -> original label. When several labels normalise to the
    # same key, the last one wins (same as a plain dict comprehension).
    folded = {str(label).strip().lower(): label for label in labels}
    for cand in cands:
        # An exact hit must win over a label that merely differs in case.
        if cand in exact:
            return cand
        key = str(cand).strip().lower()
        if key in folded:
            return folded[key]
    return None

# Resolve the artist / genre / year columns by trying common header spellings.
cols = list(data.columns)
artist_col = pick(cols, "Artist", "Artists", "artist_name", "artist")
genre_col = pick(cols, "Top Genre", "top genre", "Genre", "genre")
year_col = pick(cols, "Year", "Release Year", "release_year", "released_year", "release_date", "yearReleased")

print("🎯 Selected columns →",
      f"artist_col={artist_col}, genre_col={genre_col}, year_col={year_col}")

# Helper frames: only the relevant columns, de-duplicated.
# A column that could not be resolved yields an empty frame instead.
if genre_col:
    genre_data = data[[c for c in (artist_col, genre_col) if c]]
else:
    genre_data = pd.DataFrame()
genre_data = genre_data.drop_duplicates()

if year_col:
    year_data = data[[year_col]]
else:
    year_data = pd.DataFrame()
year_data = year_data.drop_duplicates()

if artist_col:
    artist_data = data[[artist_col]]
else:
    artist_data = pd.DataFrame()
artist_data = artist_data.drop_duplicates()

# Write each non-empty helper frame to its CSV and remember what to report for it.
saved_labels = []
for frame, filename, missing in (
    (genre_data, "genre_data.csv", "(no genre_data)"),
    (year_data, "year_data.csv", "(no year_data)"),
    (artist_data, "artist_data.csv", "(no artist_data)"),
):
    if frame.empty:
        saved_labels.append(missing)
    else:
        frame.to_csv(filename, index=False)
        saved_labels.append(filename)

print("💾 Saved: ", saved_labels[0], "|", saved_labels[1], "|", saved_labels[2])


🎯 Selected columns → artist_col=Artist, genre_col=Top Genre, year_col=Year
💾 Saved:  genre_data.csv | year_data.csv | artist_data.csv


In [6]:
import pandas as pd
from pathlib import Path

# Reload each helper table from disk when its CSV exists; otherwise use an empty frame.
if Path("genre_data.csv").exists():
    genre_data = pd.read_csv("genre_data.csv")
else:
    genre_data = pd.DataFrame()

if Path("year_data.csv").exists():
    year_data = pd.read_csv("year_data.csv")
else:
    year_data = pd.DataFrame()

if Path("artist_data.csv").exists():
    artist_data = pd.read_csv("artist_data.csv")
else:
    artist_data = pd.DataFrame()

print("🔁 Re-read helper CSVs.")
# Preview two rows of each table, or show a placeholder string when it is empty.
display("No genre_data" if genre_data.empty else genre_data.head(2))
display("No year_data" if year_data.empty else year_data.head(2))
display("No artist_data" if artist_data.empty else artist_data.head(2))


🔁 Re-read helper CSVs.


Unnamed: 0,Artist,Top Genre
0,Norah Jones,adult standards
1,Deep Purple,album rock


Unnamed: 0,Year
0,2004
1,2000


Unnamed: 0,Artist
0,Norah Jones
1,Deep Purple


In [7]:
# Structural summary of the main table.
print("ℹ️ data.info():")
data.info()

# Same summary for the genre helper table, unless there is nothing to summarise.
if genre_data.empty:
    print("\n(genre_data is empty; skipping info())")
else:
    print("\nℹ️ genre_data.info():")
    genre_data.info()


ℹ️ data.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Index                   1994 non-null   int64 
 1   Title                   1994 non-null   object
 2   Artist                  1994 non-null   object
 3   Top Genre               1994 non-null   object
 4   Year                    1994 non-null   int64 
 5   Beats Per Minute (BPM)  1994 non-null   int64 
 6   Energy                  1994 non-null   int64 
 7   Danceability            1994 non-null   int64 
 8   Loudness (dB)           1994 non-null   int64 
 9   Liveness                1994 non-null   int64 
 10  Valence                 1994 non-null   int64 
 11  Length (Duration)       1994 non-null   object
 12  Acousticness            1994 non-null   int64 
 13  Speechiness             1994 non-null   int64 
 14  Popularity              1994 non-null   

In [8]:
import numpy as np
import pandas as pd

# Derive a numeric year series from the selected year column, when there is one.
year_series = None
if year_col in data.columns:
    raw_year = data[year_col]
    # String values containing "-" are treated as dates such as "YYYY-MM-DD".
    looks_like_date = (
        pd.api.types.is_string_dtype(raw_year)
        and raw_year.str.contains("-", na=False).any()
    )
    # For date-like strings only the first four characters (the year) are parsed.
    year_source = raw_year.str.slice(0, 4) if looks_like_date else raw_year
    year_series = pd.to_numeric(year_source, errors="coerce")

if year_series is None:
    # No usable year column: the decade is missing for every row.
    data["decade"] = pd.NA
else:
    # Floor each year to its decade, then label it like "1990s"; missing years stay missing.
    decade = (year_series // 10) * 10
    decade_labels = decade.where(decade.notna(), pd.NA).astype("Int64").astype("string")
    data["decade"] = decade_labels + "s"

print("✅ Added 'decade' column.")
preview_cols = [year_col, "decade"] if year_col in data.columns else ["decade"]
display(data[preview_cols].head(10))

print("\n📈 Decade distribution (top 10):")
print(data["decade"].value_counts(dropna=False).head(10))


✅ Added 'decade' column.


Unnamed: 0,Year,decade
0,2004,2000s
1,2000,2000s
2,2001,2000s
3,2007,2000s
4,2002,2000s
5,2004,2000s
6,2002,2000s
7,2006,2000s
8,2004,2000s
9,2002,2000s



📈 Decade distribution (top 10):
decade
2000s    400
2010s    399
1970s    353
1980s    344
1990s    331
1960s    158
1950s      9
Name: count, dtype: Int64


In [9]:
# Count missing values per column and keep the 15 columns with the highest counts.
na_counts = data.isna().sum()
na_summary = na_counts.sort_values(ascending=False).head(15)
print("🧹 Top missing-value columns:\n", na_summary)

# Descriptive statistics for the numeric columns, one row per column after transposing.
numeric_part = data.select_dtypes(include=["number"])
num_desc = numeric_part.describe().T
print("\n📐 Numeric summary (first 12 rows):")
display(num_desc.head(12))


🧹 Top missing-value columns:
 Index                     0
Title                     0
Artist                    0
Top Genre                 0
Year                      0
Beats Per Minute (BPM)    0
Energy                    0
Danceability              0
Loudness (dB)             0
Liveness                  0
Valence                   0
Length (Duration)         0
Acousticness              0
Speechiness               0
Popularity                0
dtype: int64

📐 Numeric summary (first 12 rows):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Index,1994.0,997.5,575.762538,1.0,499.25,997.5,1495.75,1994.0
Year,1994.0,1992.992979,16.116048,1956.0,1979.0,1993.0,2007.0,2019.0
Beats Per Minute (BPM),1994.0,120.215647,28.028096,37.0,99.0,119.0,136.0,206.0
Energy,1994.0,59.679539,22.154322,3.0,42.0,61.0,78.0,100.0
Danceability,1994.0,53.238215,15.351507,10.0,43.0,53.0,64.0,96.0
Loudness (dB),1994.0,-9.008526,3.647876,-27.0,-11.0,-8.0,-6.0,-2.0
Liveness,1994.0,19.012036,16.727378,2.0,9.0,12.0,23.0,99.0
Valence,1994.0,49.408726,24.858212,3.0,29.0,47.0,69.75,99.0
Acousticness,1994.0,28.858074,29.011986,0.0,3.0,18.0,50.0,99.0
Speechiness,1994.0,4.994985,4.401566,2.0,3.0,4.0,5.0,55.0
