In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Importing the Data
df = pd.read_csv(r'...\FuckItMusic-music-export.csv')
print(df.head())

   RYM Album  First Name       Last Name First Name localized  \
0      28324         NaN             !!!                  NaN   
1   10326582         NaN        100 gecs                  NaN   
2    4252371         NaN  Action Bronson                  NaN   
3      13932         NaN     Adolescents                  NaN   
4    8239019         NaN         Algiers                  NaN   

   Last Name localized                   Title  Release_Date  Rating  \
0                  NaN                     !!!          2000       5   
1                  NaN               1000 gecs          2019       3   
2                  NaN        Rare Chandeliers          2012       6   
3                  NaN             Adolescents          1981       7   
4                  NaN  The Underside of Power          2017       8   

  Ownership  Purchase Date Media Type  
0         n            NaN        NaN  
1         n            NaN        NaN  
2         n            NaN        NaN  
3         n     

In [2]:
# Columns and Objects
print(df.shape)

(1756, 11)


In [3]:
# Types of Data
print(df.dtypes)

RYM Album                 int64
 First Name              object
Last Name                object
First Name localized     object
 Last Name localized     object
Title                    object
Release_Date              int64
Rating                    int64
Ownership                object
Purchase Date           float64
Media Type               object
dtype: object


In [4]:
# More descriptions of the data
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1756 entries, 0 to 1755
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   RYM Album             1756 non-null   int64  
 1    First Name           105 non-null    object 
 2   Last Name             1756 non-null   object 
 3   First Name localized  1 non-null      object 
 4    Last Name localized  2 non-null      object 
 5   Title                 1756 non-null   object 
 6   Release_Date          1756 non-null   int64  
 7   Rating                1756 non-null   int64  
 8   Ownership             1756 non-null   object 
 9   Purchase Date         0 non-null      float64
 10  Media Type            2 non-null      object 
dtypes: float64(1), int64(3), object(7)
memory usage: 151.0+ KB
None


In [5]:
# Checking if any values are null
df.isnull().sum()

RYM Album                  0
 First Name             1651
Last Name                  0
First Name localized    1755
 Last Name localized    1754
Title                      0
Release_Date               0
Rating                     0
Ownership                  0
Purchase Date           1756
Media Type              1754
dtype: int64

In [6]:
# Dropping columns with empty values
df.drop(columns=['Ownership', 'Purchase Date', 'Media Type'], inplace=True)
df.head()

Unnamed: 0,RYM Album,First Name,Last Name,First Name localized,Last Name localized,Title,Release_Date,Rating
0,28324,,!!!,,,!!!,2000,5
1,10326582,,100 gecs,,,1000 gecs,2019,3
2,4252371,,Action Bronson,,,Rare Chandeliers,2012,6
3,13932,,Adolescents,,,Adolescents,1981,7
4,8239019,,Algiers,,,The Underside of Power,2017,8


In [7]:
# Data Cleaning

# Creating a new column "Artist Name"
df['Artist Name'] = df[' First Name'].astype(str) + ' ' +  df['Last Name']

# Creating a new column " Localized Name" for special cases
df['Localized Name'] = df['First Name localized'] + ' ' + df[' Last Name localized']

# Dropping redudant columns
df.drop(columns=[' First Name', 'Last Name', 'First Name localized', ' Last Name localized'],inplace=True)

# Renaming column for clarity
df.rename(columns= {'Release_Date': 'Year_Of_Release'}, inplace=True)

# Clearing spaces and NaN values in Artist Name column
df['Artist Name'] = df['Artist Name'].str.lstrip('nan')
df['Artist Name'] = df['Artist Name'].str.strip()

# Replacing special characters with their appropriate replacement for readability
df.replace({'&amp;': 'and'}, regex=True, inplace=True)
df['Artist Name'] = df['Artist Name'].apply(lambda x: x.replace('$', 's') if '$' in x else x)

# Replacing odd names with their appropriate replacement
df.replace({'Илья Рачковский': 'Ilya Rachkovsky'}, regex=True, inplace=True)
df.replace({'AsAp Ferg': 'ASAP Ferg'}, regex=True, inplace=True)

# Finally, dropping the Localized Name column for being redundant
df.drop(columns=['Localized Name'],inplace=True)

In [8]:
# Viewing the cleaned dataset
df.head()

Unnamed: 0,RYM Album,Title,Year_Of_Release,Rating,Artist Name
0,28324,!!!,2000,5,!!!
1,10326582,1000 gecs,2019,3,100 gecs
2,4252371,Rare Chandeliers,2012,6,Action Bronson
3,13932,Adolescents,1981,7,Adolescents
4,8239019,The Underside of Power,2017,8,Algiers


In [None]:
# Pie Chart for distribution of ratings
fig = px.pie(
    df, 
    names='Rating',
    height=600,
    width=1000,
    title="Distribution of Ratings",)
fig.update_traces(textposition = 'inside', textinfo = 'percent+label')

fig.show()


In [10]:
# The mean rating of the dataset
mean_rating = df['Rating'].mean()
print(mean_rating)

7.004555808656036


In [None]:
# Finding the mean rating by decade
avg_by_decade = df.groupby((df.Year_Of_Release//10)*10)['Rating'].mean().reset_index()
fig = px.bar(
    x=avg_by_decade['Year_Of_Release'], 
    y=avg_by_decade['Rating'],
    labels={'y':'Rating Scale', 'x':'Decade'}, 
    title='Average Rating By Decade'
)

fig.show()


In [None]:
# Scatter Plot Creation

# Defining a threshold 
rating_threshold = 1

# Grouping Artist by the amount of ratings they have
total_ratings = df.groupby(['Artist Name']).agg(Rating_count=('Rating', 'count')).reset_index()

# Eliminating artist that fall below the threshold
sorted_ratings = total_ratings[total_ratings['Rating_count'] > rating_threshold]

# Creating custom hover text
sorted_ratings['hover text'] = sorted_ratings.apply(
    lambda row: f'Artist: {row['Artist Name']}<br>Rating Count: {row['Rating_count']}', axis=1
)

# Graphing the data
fig = go.Figure(go.Scatter(
    mode='markers',
    x=sorted_ratings.index,
    y=sorted_ratings['Rating_count'],
    marker_color=sorted_ratings['Rating_count'],
    marker_size=10,
    text=sorted_ratings['hover text'],
    hoverinfo='text'

))

# Updating the labels for readability
fig.update_layout(
    title={
        'text': 'Total Artist Ratings',
        'x': 0.5,
        'xanchor': 'center',
        'y': 0.9,
        'yanchor': 'top'    
    },
    xaxis_title = 'Artist',
    yaxis_title = 'Rating Count'
)

# Showing the graph
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Scatter Plot to show the Artist with the highest mean rating

# Threshold based on the 5 album rule(Great artist at least have 5 good albums)
rating_threshold = 5

# Grouping Artist based on their mean rating and the total count of ratings
artist_average = df.groupby(['Artist Name']).agg(
    Rating_count=('Rating', 'count'),
    Rating_mean=('Rating', 'mean')
).reset_index()

# Filtering based on rating threshold
average_threshold = artist_average[artist_average['Rating_count'] >= rating_threshold]

# Sorting by Rating_mean in descending order
average_threshold_sorted = average_threshold.sort_values(by='Rating_mean', ascending=True)

# Creating custom hover text
average_threshold_sorted['hover_text'] = average_threshold_sorted.apply(
    lambda row: f"Artist: {row['Artist Name']}<br>Rating Mean: {row['Rating_mean']}", axis=1
)

# Creating the scatter plot
fig = go.Figure(go.Scatter(
    mode="markers",
    x=average_threshold_sorted.index, 
    y=average_threshold_sorted['Rating_mean'],  
    marker_symbol='circle',
    marker_color=average_threshold_sorted['Rating_mean'],  
    marker_size=12,
    text=average_threshold_sorted['hover_text'], 
    hoverinfo='text'
))

# Updating layout for readability
fig.update_layout(
    title={
        'text': 'Artist Rating Mean (Sorted)',
        'x': 0.5,
        'xanchor': 'center',
        'y': 0.90,
        'yanchor': 'top'    
    },
    xaxis_title='Artists',
    yaxis_title='Rating Mean',
)
    
# Showing the graph
fig.show()


In [None]:
# Boxplot Creation for total visualization of the data

# Grpuping data by the Decade of music
the_50s = df[(df['Year_Of_Release']>= 1950) & (df['Year_Of_Release'] < 1960)]
the_60s = df[(df['Year_Of_Release']>= 1960) & (df['Year_Of_Release'] < 1970)] 
the_70s = df[(df['Year_Of_Release']>= 1970) & (df['Year_Of_Release'] < 1980)]
the_80s = df[(df['Year_Of_Release']>= 1980) & (df['Year_Of_Release'] < 1990)]
the_90s = df[(df['Year_Of_Release']>= 1990) & (df['Year_Of_Release'] < 2000)]
the_00s = df[(df['Year_Of_Release']>= 2000) & (df['Year_Of_Release'] < 2010)]
the_10s = df[(df['Year_Of_Release']>= 2010) & (df['Year_Of_Release'] < 2020)]
the_20s = df[(df['Year_Of_Release']>= 2020) & (df['Year_Of_Release'] < 2030)]

# Grouping Data Together
decades_of_music = [the_50s, the_60s,the_70s, the_80s, the_90s, the_00s, the_10s, the_20s]

# Labels
decade_labels = ['1950\'s', '1960\'s', '1970\'s', '1980\'s', '1990\'s', '2000\'s', '2010\'s', '2020\'s']

# Graph the data
fig = go.Figure()

for data, label in zip(decades_of_music, decade_labels):
    fig.add_trace(go.Box(
        y=data['Rating'], 
        name=label,                
        boxmean='sd',                  
        line=dict(width=2),            
        marker=dict(size=6),          
        boxpoints='all',               
    ))

# Updating labels for readability
fig.update_layout(
    title_text = 'Distribution of Ratings(by Decade)',
    xaxis_title = 'Decades Of Music',
    yaxis_title = 'Rating Score',
)

# Showing the data
fig.show()
