# Importing libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Load

In [6]:
# Read the exported Spotify streaming history into a DataFrame.
# NOTE(review): the path below is absolute and machine-specific; adjust it
# before running the notebook on another machine.
history_csv = r"C:\Users\meers\Downloads\archive\spotify_history.csv"
Spotify_data = pd.read_csv(history_csv)

In [None]:
# Display the freshly loaded DataFrame as the cell output.
Spotify_data

# Check Structure

In [None]:
# Print the column names, non-null counts, and dtypes of the DataFrame.
Spotify_data.info()

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
Spotify_data.describe()

In [None]:
# Preview the first rows of the DataFrame (head() defaults to five rows).
Spotify_data.head()

# Cleaning Data

Handle missing values.

Convert timestamps to datetime.

Create new time-based features (year, month, day, hour).

Convert ms_played to minutes/seconds.

Remove extremely short plays (shorter than 15 seconds) to filter out accidental clicks.

In [None]:
# Convert the timestamp column to datetime and derive calendar features from it.
parsed_ts = pd.to_datetime(Spotify_data['ts'])

Spotify_data = Spotify_data.assign(
    ts=parsed_ts,
    ts_year=parsed_ts.dt.year,
    ts_month=parsed_ts.dt.month_name(),  # month as a name, not a number
    ts_day=parsed_ts.dt.day,
    ts_hour=parsed_ts.dt.hour,
)



In [None]:
# Convert play time from milliseconds to seconds, keep only plays of at least
# 15 seconds, and add a readable "minutes:seconds" column.
def _to_minutes_seconds(total_seconds):
    """Format a duration in seconds as ``M:SS`` (seconds zero-padded to two digits)."""
    whole_minutes = int(total_seconds // 60)
    leftover_seconds = int(total_seconds % 60)
    return f"{whole_minutes}:{leftover_seconds:02d}"


with_seconds = Spotify_data.assign(duration_seconds=Spotify_data['ms_played'] / 1000)
long_enough = with_seconds.loc[with_seconds['duration_seconds'] >= 15]
Spotify_data = long_enough.assign(
    minute_and_seconds=long_enough['duration_seconds'].apply(_to_minutes_seconds)
)


Spotify_data

In [None]:
# Remove the unnecessary raw columns ('ts' and 'ms_played').

Spotify_data = Spotify_data.drop(['ts', 'ms_played'], axis='columns')


In [None]:
# Display the DataFrame as the cell output.
Spotify_data

# Exploratory Data Analysis

Questions to answer:

1. Top artists & tracks by total minutes played.

2. Listening habits by hour.

3. Platform usage (web player, mobile, etc.).

4. Skip behavior – which songs or artists are skipped most.

5. Album listening trends.


In [None]:
# Top 10 artists by total listening time (summed in seconds).

seconds_per_artist = Spotify_data['duration_seconds'].groupby(Spotify_data['artist_name']).sum()
top_artists = seconds_per_artist.sort_values(ascending=False).head(10)
top_artists

# Observation from the results shown below: The Beatles have the highest total listening time in seconds, making them the user's most-listened-to artist.

In [None]:
# Total listening time (in seconds) for each hour of the day.

seconds_played = Spotify_data['duration_seconds']
hourly_listening = seconds_played.groupby(Spotify_data['ts_hour']).sum()
hourly_listening

# Observation from the results below: the user listens to most of their music between 6:00 PM and 7:00 PM.

In [None]:
# Number of plays recorded on each platform (web player, mobile, etc.).

platform_column = Spotify_data['platform']
Platform_usage = platform_column.groupby(platform_column).count()

Platform_usage

# Observation from the counts: the user's preferred platform for listening to music is Android.

In [None]:
# Skip behavior: the ten artists with the most skipped plays.

was_skipped = Spotify_data['skipped'].eq(True)
skips_per_artist = Spotify_data.loc[was_skipped].groupby('artist_name').size()
skips = skips_per_artist.sort_values(ascending=False).head(10)
skips
# Observation: The Beatles had the highest number of skips, with a total of 70.

In [None]:
# Album listening trends: the ten albums with the most listening time (seconds).

seconds_per_album = Spotify_data['duration_seconds'].groupby(Spotify_data['album_name']).sum()
Top_albums = seconds_per_album.sort_values(ascending=False).head(10)
Top_albums

# Observation from the results shown below: The New Abnormal has the highest total listening time, making it the user's most-listened-to album.

# Data Visualization in Python

1. Bar chart: Top 10 Artists.

2. Heatmap: Listening by day of week & hour.

3. Pie chart: Platform usage.

4. Bar chart: Top ten most-skipped artists.

5. Line Graph: Album listening trends

In [None]:
# Bar chart of the ten most-played artists (listening time in seconds),
# saved to 'Top_Artists.png'.
artist_ax = sns.barplot(x=top_artists.values, y=top_artists.index)
artist_ax.set_title('Top 10 Artists')
artist_ax.set_xlabel('Seconds play')
artist_ax.set_ylabel('Artist Name')
plt.savefig('Top_Artists.png')

In [None]:
# Heatmap of total listening time per hour of day.
# NOTE: only the hour dimension is plotted (a single column of values);
# day of week is not part of this chart.

# Turn the hourly totals into a two-column frame: Hour, Total_Seconds.
hourly_df = (
    hourly_listening
    .rename('Total_Seconds')
    .rename_axis('Hour')
    .reset_index()
)

# Index by hour so each hour becomes one row of the heatmap.
heatmap_data = hourly_df.set_index('Hour')

# Draw the heatmap with each cell annotated by its rounded total.
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".0f",
    cmap="YlGnBu",
    cbar_kws={'label': 'Seconds Listened'},
)
plt.title("Listening Time by Hour")
plt.ylabel("Hour of Day")
plt.xlabel("Total Listening Seconds")
plt.savefig("Listening_Time_by_Hour_Heatmap.png")


In [None]:
# Pie chart of play counts per platform, saved to 'Platform_Usage.png'.
platform_colors = ['#66b3ff', '#99ff99', '#ffcc99', '#ff9999']
plt.pie(Platform_usage, labels=Platform_usage.index, colors=platform_colors)
plt.title('Platform usage')
plt.savefig('Platform_Usage.png')


In [None]:
# Bar chart of the ten artists with the most skipped plays,
# saved to 'Most_skip_artist.png'.
skip_ax = sns.barplot(x=skips.values, y=skips.index)
skip_ax.set_title('Top 10 skip Artists')
skip_ax.set_xlabel('Amount of skips')
skip_ax.set_ylabel('Artist Name')
plt.savefig('Most_skip_artist.png')


In [None]:
# Line plot of total listening time for the ten most-played albums,
# saved to 'Top_10_Albums_by_listening_time.png'.
Albums_df = Top_albums.reset_index()

# Use a wide figure so the album names have room along the x-axis.
plt.figure(figsize=(12, 6))
album_ax = sns.lineplot(x="album_name", y="duration_seconds", data=Albums_df, marker='o')

# Rotate and right-align the tick labels so long album names stay readable.
plt.xticks(rotation=45, ha='right')
album_ax.set_xlabel('Album Name')
album_ax.set_ylabel('Total Listening Time (seconds)')
album_ax.set_title('Top 10 Albums by Listening Time')

# Tighten the layout so the rotated labels are not cut off in the saved image.
plt.tight_layout()
plt.savefig('Top_10_Albums_by_listening_time.png')

# Export the Dataframe

In [None]:
# Export the cleaned DataFrame without the index column.
# to_csv always writes comma-separated text regardless of the file name, so
# the file carries a ".csv" extension that spreadsheet tools and
# pd.read_csv users will recognise.
Spotify_data.to_csv('clean_Spotify_data.csv', index=False)
