In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.ticker as ticker
import seaborn as sns

In [23]:
# Load the two tabular sources (artists and tracks) into DataFrames
artists_dataset, tracks_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/tabular/artists.csv', 'dataset/tabular/tracks.csv')
)

In [None]:
# Display the tracks table to inspect its columns and values
tracks_dataset

# Artists Dataset

Dropping rows with missing values (NaN) and duplicate rows from the artists dataset

In [24]:
# Remove incomplete rows (any missing value) first, then exact duplicate rows
artists_dataset = artists_dataset.dropna().drop_duplicates()

In [25]:
# For artists sharing both name and genres, keep a single row: sorting by descending
# popularity first means the row that survives is the most popular one
artists_dataset = (
    artists_dataset
    .sort_values(by='popularity', ascending=False)
    .drop_duplicates(subset=['name', 'genres'])
)
# Show the de-duplicated artists table
artists_dataset

Unnamed: 0,id,name,popularity,followers,genres
13064,06HL4z0CvFAxyc27GXpf02,Taylor Swift,100.0,82377431.0,['pop']
4086,3TVXtAsR1Inumwj472S9r4,Drake,94.0,78233981.0,"['canadian hip hop', 'canadian pop', 'hip hop'..."
308,1Xyo4u8uXC1ZmMpatF05PJ,The Weeknd,94.0,69417099.0,"['canadian contemporary r&b', 'canadian pop', ..."
4327,4q3ewBCX7sLwd24euuV69X,Bad Bunny,94.0,72659297.0,"['reggaeton', 'trap latino', 'urbano latino']"
6559,0Y5tJX1MQlPlqiwlOH1tJY,Travis Scott,94.0,23519791.0,"['hip hop', 'rap', 'slap house']"
...,...,...,...,...,...
25709,1qLPQciZ1gRoYAd5zusgGe,Martina Rudic,0.0,12.0,[]
841,4V8YX6UEzcFLmHUh2HfJux,Jean-Philippe Audin,0.0,12.0,[]
15434,4u7SioU0plZ0L4kBCEdj47,Alexei Kuzmin,0.0,4.0,[]
25735,2vh8ZeDiygDZkW4JUtgRwH,Azhirock,0.0,194.0,['iranian metal']


In [28]:
# Artists whose name still appears more than once (keep=False flags every occurrence,
# not just the repeats)
duplicated_rows = artists_dataset.loc[
    lambda frame: frame.duplicated(subset='name', keep=False)
]
duplicated_rows

Unnamed: 0,id,name,popularity,followers,genres
4331,2jSGzJw0ebJLu7OLVSOcBP,Plan B,77.0,3265393.0,"['reggaeton', 'trap latino', 'urbano latino']"
5941,4W12lEvVl5wectUtRNrvDh,White Noise Baby Sleep,75.0,31191.0,['sleep']
4158,4fEkbug6kZzzJ8eYX6Kbbp,KK,75.0,12587568.0,"['filmi', 'modern bollywood']"
258,1wxPItEzr7U7rGSMPqZ25r,Shiloh Dynasty,75.0,1609334.0,"['lo-fi chill', 'sad lo-fi']"
20678,3CJKkU0XuElRT1z8rEtIYg,Luciano,72.0,1915232.0,"['german drill', 'german hip hop']"
...,...,...,...,...,...
9427,6qp3DqptxGnT0LAjM71V8B,Fery,0.0,2780.0,['anime lo-fi']
4321,6J6xE014tm1XdKAzOYbOkL,Marley Carroll,0.0,1.0,[]
7333,3ggp4QNbijnG11uuPMVQ0v,Smartface,0.0,11.0,[]
488,41tlGPBvFWRfo0lU6PZEcF,Shiloh Dynasty,0.0,13240.0,[]


# Tracks Dataset

## Merging rows that share the same 'id' into a single row: the row with the highest popularity is kept, and its 'genre' is replaced by the list of unique genres found across all rows with that id. The other duplicated rows are dropped.


In [None]:
# Collect the distinct genres seen for each track id (in order of first appearance).
# Series.unique() is used instead of a plain list so that rows repeating the same
# (id, genre) pair do not produce the same genre twice in the list.
# NOTE: run this cell once per load; after it runs, 'genre' holds lists, which cannot be
# passed through unique() again.
merged_df = tracks_dataset.groupby('id')['genre'].agg(lambda genres: genres.unique().tolist())
# Attach the per-id genre list to every row. Both sides carry a 'genre' column, so pandas
# suffixes them: 'genre_x' is the original single genre, 'genre_y' is the merged list.
df_merged = pd.merge(tracks_dataset, merged_df, on='id', how='left')

# Find the indices of the rows with the highest popularity within each group (ID)
indices_to_keep = df_merged.groupby('id')['popularity'].idxmax()
# Keep only those rows and swap the single genre for the merged list.
# Non-inplace, chained calls build a new frame instead of mutating a selection of df_merged.
tracks_dataset = (
    df_merged.loc[indices_to_keep]
    .drop(columns=['genre_x'])
    .rename(columns={'genre_y': 'genre'})
)
# Display the resulting dataframe
tracks_dataset

## Handling songs with duplicated names

In [None]:
# Tracks that share both 'name' and 'artists' are collapsed to one row; the descending
# popularity sort makes the most popular version the one that is kept
tracks_dataset = (
    tracks_dataset
    .sort_values(by='popularity', ascending=False)
    .drop_duplicates(subset=['name', 'artists'])
)
# Show the de-duplicated tracks table
tracks_dataset

In [None]:
# List the distinct values taken by 'album_release_date_precision'
tracks_dataset.loc[:, 'album_release_date_precision'].unique()

In [None]:
# Number of rows for each distinct 'album_release_date_precision' value
tracks_dataset.loc[:, 'album_release_date_precision'].value_counts()

## Creating 3 new columns: 'year', 'month', 'day' from the 'album_release_date' column

In [None]:
# Extract year, month, and day from 'album_release_date'.
# Dates can be less precise than YYYY-MM-DD (e.g. YYYY or YYYY-MM). Parts that are not
# present are stored as real missing values (NaN) rather than the text 'NaN', so that
# isna() / dropna() / fillna() recognise them.

# Make sure the present dates are strings so string operations are safe, but keep missing
# dates missing: a plain astype(str) would turn NaN into the text 'nan' and yield a bogus year
raw_release_date = tracks_dataset['album_release_date']
tracks_dataset['album_release_date'] = raw_release_date.astype(str).where(raw_release_date.notna())

# Split on '-' once; .str.get(i) returns NaN where a date has no i-th part
date_parts = tracks_dataset['album_release_date'].str.split('-')
tracks_dataset['year'] = date_parts.str.get(0)
tracks_dataset['month'] = date_parts.str.get(1)
tracks_dataset['day'] = date_parts.str.get(2)

# Display the first few rows to verify the new columns
tracks_dataset[['album_release_date', 'year', 'month', 'day']].head()

## Dropping columns track_number, disc_number, album_type, album_total_tracks, album_release_date_precision, album_release_date

In [ ]:
# Remove the album/track bookkeeping columns together with the raw release-date columns
# ('album_release_date_precision' and 'album_release_date')
tracks_dataset = tracks_dataset.drop(
    [
        'track_number',
        'disc_number',
        'album_type',
        'album_total_tracks',
        'album_release_date_precision',
        'album_release_date',
    ],
    axis='columns',
)
