<a href="https://colab.research.google.com/github/Anchamp/AI-/blob/main/Spotify_Music_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import kagglehub
# Fetch the 'solomonameh/spotify-music-dataset' dataset through kagglehub and
# keep the call's return value in a module-level variable.
# NOTE(review): presumably the return value is the local directory holding the
# downloaded files — confirm against the kagglehub documentation.
solomonameh_spotify_music_dataset_path = kagglehub.dataset_download('solomonameh/spotify-music-dataset')

print('Data source import complete.')


## Import necessary libraries and load the datasets

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Ignore warnings (optional)
import warnings
warnings.filterwarnings('ignore')


# Load the datasets.
# The CSVs are read from the directory returned by kagglehub.dataset_download
# instead of a hard-coded '/kaggle/input/...' path, which only exists inside
# Kaggle notebooks and breaks the load on Colab or a local machine.
import os

low_popularity = pd.read_csv(
    os.path.join(solomonameh_spotify_music_dataset_path, 'low_popularity_spotify_data.csv')
)
high_popularity = pd.read_csv(
    os.path.join(solomonameh_spotify_music_dataset_path, 'high_popularity_spotify_data.csv')
)

#### Add the "Popularity_Type" column to each dataset

In [None]:
# Tag every row of each source frame with its popularity class so the origin
# is still known after the frames are merged.
for source_frame, popularity_label in ((low_popularity, 'Low'), (high_popularity, 'High')):
    source_frame['Popularity_Type'] = popularity_label

#### Combine the datasets

In [None]:
# Stack the two labelled frames row-wise; ignore_index renumbers the rows so
# the merged frame has one continuous index.
frames_to_stack = [low_popularity, high_popularity]
combined_df = pd.concat(frames_to_stack, ignore_index=True)

In [None]:
 # Check the combined dataset
# Print, in order: dimensions, column labels, and the class balance.
for overview in (
    combined_df.shape,
    combined_df.columns,
    combined_df['Popularity_Type'].value_counts(),
):
    print(overview)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
combined_df.info()

In [None]:
# Display the first few rows of the data.
# Kept as the cell's final bare expression so the notebook renders the
# returned frame; head() is called with its default row count.
combined_df.head()

In [None]:
# Summary statistics of the dataset, using describe() with default arguments.
# Kept as the cell's final bare expression so the notebook renders the table.
combined_df.describe()

## Data Cleaning and Preprocessing

In [None]:
# Check for missing values in each column: isnull() flags every missing cell
# and sum() totals those flags column by column.
combined_df.isnull().sum()

In [None]:
# Handle missing values: discard every row that contains at least one NaN.
combined_df = combined_df.dropna()

# Convert necessary columns to appropriate data types.
# `errors='ignore'` is deprecated in pandas (2.2+). The same intent — convert
# the column when it parses as a whole, otherwise leave it untouched for the
# per-value parsing done further down — is spelled out explicitly instead.
try:
    combined_df['track_album_release_date'] = pd.to_datetime(combined_df['track_album_release_date'])
except (ValueError, TypeError):
    # Mixed or irregular formats: keep the raw values for now.
    pass

In [None]:
# Show the first 22 release-date entries to inspect their formats.
release_date_preview = combined_df['track_album_release_date'].head(22)
print(release_date_preview)

Some values in `track_album_release_date` are not full dates; a few contain only a year. To make the column consistent, the custom parsing function below handles both full dates and year-only values, and returns `NaT` for anything it cannot parse:

In [None]:
from datetime import datetime

def parse_date(date_string):
    """Convert one release-date entry into a ``pd.Timestamp``.

    Args:
        date_string: A single cell value. Normally a string such as
            ``"2019-05-12"`` or a bare year ``"2019"``; numeric years,
            existing timestamps, ``None`` and NaN are accepted as well.

    Returns:
        The parsed ``pd.Timestamp`` (a bare year maps to 1 January of that
        year), or ``pd.NaT`` when the value is missing or cannot be parsed.
    """
    import numbers

    if date_string is None:
        return pd.NaT

    # Numeric input must not reach pd.to_datetime directly: a number is
    # interpreted there as nanoseconds since the epoch, so the year 2019
    # would silently become a 1970 timestamp.
    if isinstance(date_string, numbers.Real) and not isinstance(date_string, bool):
        if pd.isna(date_string):
            return pd.NaT
        if float(date_string).is_integer():
            date_string = str(int(date_string))
        else:
            date_string = str(date_string)

    if isinstance(date_string, str):
        date_string = date_string.strip()
        if not date_string:
            return pd.NaT
        # Bare four-digit year: pin it to the first day of that year.
        if len(date_string) == 4 and date_string.isdigit():
            date_string = f"{date_string}-01-01"

    try:
        return pd.to_datetime(date_string)
    except (ValueError, TypeError):
        # Unparseable text or an unsupported type: report as missing.
        return pd.NaT

In [None]:
# Run every release-date entry through parse_date and store the result back
# in the same column.
release_date_column = 'track_album_release_date'
combined_df[release_date_column] = combined_df[release_date_column].apply(parse_date)

# Check the result: first 25 parsed values, then the column dtype.
parsed_release_dates = combined_df[release_date_column]
print(parsed_release_dates.head(25))
print(parsed_release_dates.dtype)

In [None]:
# Drop irrelevant columns (e.g., URLs, IDs).
# errors='ignore' makes the cell safe to re-run: once the columns are gone, a
# plain drop would raise KeyError on the second execution.
irrelevant_columns = ['uri', 'track_href', 'analysis_url', 'id']
combined_df = combined_df.drop(columns=irrelevant_columns, errors='ignore')

# Verify cleaned data. info() prints its own report and returns None, so it is
# called directly rather than wrapped in print(), which added a stray "None".
combined_df.info()

## Exploratory Data Analysis (EDA) and Visualization

#### Correlation Heatmap

In [None]:
# Pairwise correlations between the float64/int64 columns, drawn as an
# annotated heatmap.
numeric_frame = combined_df.select_dtypes(include=['float64', 'int64'])
numeric_cols = numeric_frame.columns
correlation_matrix = numeric_frame.corr()

plt.figure(figsize=(12, 10))
heatmap_axes = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
heatmap_axes.set_title('Correlation Heatmap')
plt.show()

#### Feature Distributions

In [None]:
# One filled KDE plot per audio feature, with a separate curve for each
# popularity class.
audio_features = [
    'danceability', 'energy', 'valence', 'tempo',
    'loudness', 'speechiness', 'acousticness', 'instrumentalness',
]
for feature in audio_features:
    plt.figure(figsize=(8, 4))
    density_axes = sns.kdeplot(data=combined_df, x=feature, hue='Popularity_Type', fill=True)
    density_axes.set_title(f'Distribution of {feature} by Popularity Type')
    plt.show()

#### Temporal Analysis

In [None]:
# Mean track popularity per release year, split by popularity class.
combined_df['year'] = combined_df['track_album_release_date'].dt.year
yearly_class_means = combined_df.groupby(['year', 'Popularity_Type'])['track_popularity'].mean()
popularity_trend = yearly_class_means.reset_index()

plt.figure(figsize=(12, 6))
trend_axes = sns.lineplot(
    data=popularity_trend,
    x='year',
    y='track_popularity',
    hue='Popularity_Type',
)
trend_axes.set_title('Average Popularity Over Time')
plt.show()


In [None]:
# Average popularity for every (year, playlist genre, popularity class) group.
trend_group_keys = ['year', 'playlist_genre', 'Popularity_Type']
popularity_trends = combined_df.groupby(trend_group_keys)['track_popularity'].mean().reset_index()

# One line per genre (colour) and popularity class (line style / marker).
trend_plot_options = {
    'x': 'year',
    'y': 'track_popularity',
    'hue': 'playlist_genre',
    'style': 'Popularity_Type',
    'markers': True,
    'dashes': True,
    'palette': 'tab20',
}
plt.figure(figsize=(14, 10))
sns.lineplot(data=popularity_trends, **trend_plot_options)

# Title, axis labels, and a legend placed outside the plotting area.
plt.title('Popularity Trends by Playlist Genre and Popularity Type Over Time', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Average Popularity', fontsize=14)
plt.legend(title='Playlist Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()

plt.show()

In [None]:
# Mean popularity for each (year, genre) pair.
genre_year_means = combined_df.groupby(['year', 'playlist_genre'])['track_popularity'].mean()
genre_popularity = genre_year_means.reset_index()

# Within each year, pick the rows holding the highest and lowest mean.
popularity_within_year = genre_popularity.groupby('year')['track_popularity']
most_popular_genres = genre_popularity.loc[popularity_within_year.idxmax()]
least_popular_genres = genre_popularity.loc[popularity_within_year.idxmin()]

# Put both extremes side by side, one row per year.
popularity_summary = most_popular_genres.merge(
    least_popular_genres,
    on='year',
    suffixes=('_most', '_least'),
)

print(popularity_summary)

In [None]:
# Two lines over the years: the mean popularity of each year's top genre and
# of its bottom genre.
plt.figure(figsize=(14, 8))
summary_series = (
    ('track_popularity_most', 'Most Popular Genre'),
    ('track_popularity_least', 'Least Popular Genre'),
)
for summary_column, legend_label in summary_series:
    plt.plot(
        popularity_summary['year'],
        popularity_summary[summary_column],
        label=legend_label,
        marker='o',
    )
plt.title('Most and Least Popular Genres Over Time', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Average Popularity', fontsize=14)
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# One bar per year showing the winning genre's mean popularity; bar colour
# identifies the genre (dodge=False keeps a single bar per year).
bar_options = {
    'x': 'year',
    'y': 'track_popularity',
    'hue': 'playlist_genre',
    'dodge': False,
    'palette': 'tab10',
}
plt.figure(figsize=(20, 8))
sns.barplot(data=most_popular_genres, **bar_options)
plt.title('Most Popular Genre by Year', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Average Popularity', fontsize=14)
plt.xticks(rotation=45)
plt.legend(title='Playlist Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

#### Number of Tracks Released Over Years

In [None]:
# Number of rows (tracks) per release year.
rows_per_year = combined_df.groupby('year').size()
tracks_per_year = rows_per_year.reset_index(name='track_count')

# Line chart of the yearly track counts.
plt.figure(figsize=(14, 8))
sns.lineplot(
    data=tracks_per_year,
    x='year',
    y='track_count',
    marker='o',
    color='blue',
)
plt.title('Number of Tracks Released Over Years', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Number of Tracks', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.show()

#### Boxplots of Audio Features

In [None]:
# 4x2 grid of boxplots, one panel per audio feature, comparing the two
# popularity classes.
plt.figure(figsize=(12, 20))
for panel_number, feature in enumerate(audio_features, start=1):
    feature_label = feature.capitalize()
    plt.subplot(4, 2, panel_number)
    sns.boxplot(
        data=combined_df,
        x='Popularity_Type',
        y=feature,
        hue='Popularity_Type',
        palette='Set2',
    )
    plt.title(f'{feature_label} by Popularity Type')
    plt.xlabel('Popularity Type')
    plt.ylabel(feature_label)

plt.tight_layout()
plt.show()

#### Genre-Based Analysis

In [None]:
# How many tracks each genre contributes, split by popularity class.
plt.figure(figsize=(12, 6))
genre_count_axes = sns.countplot(
    data=combined_df,
    x='playlist_genre',
    hue='Popularity_Type',
    palette='Set2',
)
genre_count_axes.set_title('Genre Distribution by Popularity Type')
genre_count_axes.set_xlabel('Genre')
genre_count_axes.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

#### Scatter Plots for Feature Relationships

In [None]:
# Danceability against energy, one point per track, coloured by class.
plt.figure(figsize=(8, 6))
dance_energy_axes = sns.scatterplot(
    data=combined_df,
    x='danceability',
    y='energy',
    hue='Popularity_Type',
    alpha=0.7,
)
dance_energy_axes.set_title('Danceability vs Energy by Popularity Type')
dance_energy_axes.set_xlabel('Danceability')
dance_energy_axes.set_ylabel('Energy')
dance_energy_axes.legend(title='Popularity Type')
plt.show()

#### Tempo Distribution Across Genres

In [None]:
# Tempo spread within each genre, with the two popularity classes side by side.
plt.figure(figsize=(12, 6))
tempo_axes = sns.boxplot(
    data=combined_df,
    x='playlist_genre',
    y='tempo',
    hue='Popularity_Type',
    palette='Set2',
)
tempo_axes.set_title('Tempo Distribution Across Genres by Popularity Type')
tempo_axes.set_xlabel('Genre')
tempo_axes.set_ylabel('Tempo (BPM)')
plt.xticks(rotation=45)
plt.show()

#### Pair Plot for Numerical Features

In [None]:
# Pairwise scatter/KDE grid over a handful of features, coloured by class.
selected_features = ['danceability', 'energy', 'valence', 'tempo']
pairplot_columns = selected_features + ['Popularity_Type']
pairplot_data = combined_df[pairplot_columns]
sns.pairplot(pairplot_data, hue='Popularity_Type', diag_kind='kde')
plt.suptitle('Pair Plot of Selected Features by Popularity Type', y=1.02)
plt.show()

#### Liveness vs Acousticness

In [None]:
# Liveness against acousticness, one point per track, coloured by class.
plt.figure(figsize=(8, 6))
live_acoustic_axes = sns.scatterplot(
    data=combined_df,
    x='liveness',
    y='acousticness',
    hue='Popularity_Type',
    alpha=0.7,
)
live_acoustic_axes.set_title('Liveness vs Acousticness by Popularity Type')
live_acoustic_axes.set_xlabel('Liveness')
live_acoustic_axes.set_ylabel('Acousticness')
live_acoustic_axes.legend(title='Popularity Type')
plt.show()

#### Duration Analysis

In [None]:
# Express track length in minutes (60000 ms per minute).
milliseconds_per_minute = 60000
combined_df['duration_min'] = combined_df['duration_ms'] / milliseconds_per_minute

# Compare the duration spread of the two popularity classes.
plt.figure(figsize=(8, 6))
duration_axes = sns.boxplot(
    data=combined_df,
    x='Popularity_Type',
    y='duration_min',
    hue='Popularity_Type',
    palette='Set2',
)
duration_axes.set_title('Track Duration by Popularity Type')
duration_axes.set_xlabel('Popularity Type')
duration_axes.set_ylabel('Duration (Minutes)')
plt.show()


These visualizations will provide a deeper understanding of the dataset's structure and relationships between features.

In [None]:
# Preview the frame again; by this point earlier cells have added the 'year'
# and 'duration_min' columns. Kept as a bare final expression so the notebook
# renders the result.
combined_df.head()

## Feature Engineering

Feature engineering is crucial for improving model performance by creating new features or transforming existing ones.

In [None]:
# Extract year from the release date
combined_df['release_year'] = combined_df['track_album_release_date'].dt.year

# Song age in years, measured against a fixed reference year so the feature
# does not change from one run to the next.
current_year = 2025
combined_df['song_age'] = current_year - combined_df['release_year']

# Interaction features. The small epsilon keeps the ratio finite when
# danceability is exactly zero.
combined_df['energy_danceability_ratio'] = combined_df['energy'] / (combined_df['danceability'] + 1e-5)
combined_df['valence_energy_product'] = combined_df['valence'] * combined_df['energy']

# Binary flag: released in the 2000s decade or earlier, i.e. before 2010.
# A `<= 2000` cut-off would leave out 2001-2009 and contradict the feature name.
combined_df['is_2000s_or_earlier'] = (combined_df['release_year'] < 2010).astype(int)

# Scale numerical features to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset, before any
# train/test split. That is harmless for tree models but leaks test-set
# statistics into scale-sensitive models; fit on the training split if one is
# introduced.
scaler = StandardScaler()
numerical_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                      'instrumentalness', 'liveness', 'valence', 'tempo', 'song_age']
combined_df[numerical_features] = scaler.fit_transform(combined_df[numerical_features])

# Drop rows with missing values after feature engineering (for example rows
# whose release date could not be parsed, which have no release_year).
combined_df = combined_df.dropna()

# Save the engineered dataset for modeling
combined_df.to_csv('engineered_spotify_data.csv', index=False)

## Modeling and Machine Learning

Now, let's proceed with modeling and machine learning using the engineered features:

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# Load the engineered dataset
df = pd.read_csv('engineered_spotify_data.csv')

# Convert the target variable ('Popularity_Type') to numeric
df['Popularity_Type'] = df['Popularity_Type'].map({'High': 1, 'Low': 0})

# Identify non-numeric columns
non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns

# Encode categorical variables as integer codes (fitted again for each column)
le = LabelEncoder()
for col in non_numeric_columns:
    df[col] = le.fit_transform(df[col].astype(str))

# Prepare features and target variable.
# Columns kept out of the feature matrix:
#   * 'Popularity_Type' is the target itself.
#   * 'track_album_release_date' is already summarised by the year features.
#   * 'track_popularity' is target leakage.
#     NOTE(review): the High/Low split appears to be defined by this score —
#     confirm against the dataset description. With it among the inputs the
#     model can read the answer directly and every metric is inflated.
#   * any '*_id' column: label-encoded identifiers carry no generalisable
#     signal and let the model memorise rows (presumably track/playlist/album
#     ids — verify against the CSV header).
excluded_columns = ['Popularity_Type', 'track_album_release_date', 'track_popularity']
excluded_columns += [name for name in df.columns if name.endswith('_id')]
X = df.drop(columns=excluded_columns, errors='ignore')
y = df['Popularity_Type']

# Split the data into training and testing sets. Stratifying keeps the
# High/Low proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Plot confusion matrix (rows are the actual class, columns the predicted one)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Feature importance, most important first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(15))
plt.title('Top 15 Most Important Features')
plt.show()

# Perform cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f}")



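Based on the classification report, this Random Forest model for predicting track popularity on Spotify demonstrates excellent performance. These scores are only meaningful if no input feature encodes the target: a column such as `track_popularity`, from which the High/Low label is presumably derived, would let the model read the answer directly and inflate every metric.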

In conclusion, this Random Forest model demonstrates high accuracy in predicting Spotify track popularity based on audio features and metadata. Its balanced performance across classes makes it a reliable tool for understanding and predicting music popularity trends on the platform. However, as with any model, it should be used in conjunction with domain expertise and consideration of external factors not captured in the dataset.