In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Data Exploration

In [336]:
data = pd.read_csv("dataset.csv")
data.head()

Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,...,Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Permissions
0,8/27/2022 19:29:02,18.0,Spotify,3.0,Yes,Yes,Yes,Latin,Yes,Yes,...,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,,I understand.
1,8/27/2022 19:57:31,63.0,Pandora,1.5,Yes,No,No,Rock,Yes,No,...,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,2.0,1.0,,I understand.
2,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,...,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect,I understand.
3,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,...,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,Improve,I understand.
4,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,...,Very frequently,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve,I understand.


In [338]:
data.shape

(736, 33)

In [340]:
data['Music effects'].value_counts()

Music effects
Improve      542
No effect    169
Worsen        17
Name: count, dtype: int64

In [342]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Timestamp                     736 non-null    object 
 1   Age                           735 non-null    float64
 2   Primary streaming service     735 non-null    object 
 3   Hours per day                 736 non-null    float64
 4   While working                 733 non-null    object 
 5   Instrumentalist               732 non-null    object 
 6   Composer                      735 non-null    object 
 7   Fav genre                     736 non-null    object 
 8   Exploratory                   736 non-null    object 
 9   Foreign languages             732 non-null    object 
 10  BPM                           629 non-null    float64
 11  Frequency [Classical]         736 non-null    object 
 12  Frequency [Country]           736 non-null    object 
 13  Frequ

# Data Preprocessing

In [345]:
null_counts = data.isnull().sum()
print("Null counts")
null_counts[null_counts > 0]

Null counts


Age                            1
Primary streaming service      1
While working                  3
Instrumentalist                4
Composer                       1
Foreign languages              4
BPM                          107
Music effects                  8
dtype: int64

In [347]:
data = data.dropna(subset=['Music effects'])

In [349]:
data = data.drop(["Permissions"],axis=1)

In [351]:
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [353]:
fav_genre_bpm_mean = data.groupby('Fav genre')['BPM'].mean()
fav_genre_bpm_mean

Fav genre
Classical           1.145750e+02
Country             1.101905e+02
EDM                 1.475000e+02
Folk                1.097200e+02
Gospel              8.925000e+01
Hip hop             1.119062e+02
Jazz                1.244737e+02
K pop               1.294783e+02
Latin               1.225000e+02
Lofi                1.143000e+02
Metal               1.391026e+02
Pop                 1.189072e+02
R&B                 1.141667e+02
Rap                 1.271000e+02
Rock                1.240867e+02
Video game music    2.702714e+07
Name: BPM, dtype: float64

In [355]:
fav_genre_bpm_dict = fav_genre_bpm_mean.to_dict()
fav_genre_bpm_dict

{'Classical': 114.575,
 'Country': 110.19047619047619,
 'EDM': 147.5,
 'Folk': 109.72,
 'Gospel': 89.25,
 'Hip hop': 111.90625,
 'Jazz': 124.47368421052632,
 'K pop': 129.47826086956522,
 'Latin': 122.5,
 'Lofi': 114.3,
 'Metal': 139.10256410256412,
 'Pop': 118.90721649484536,
 'R&B': 114.16666666666667,
 'Rap': 127.1,
 'Rock': 124.08666666666667,
 'Video game music': 27027143.513513513}

In [357]:
def fill_bpm(row):
    if pd.isnull(row['BPM']):
        return fav_genre_bpm_dict.get(row['Fav genre'], data['BPM'].mean())
    return row['BPM']

In [359]:
data['BPM'] = data.apply(fill_bpm, axis=1)

In [361]:
null_counts = data.isnull().sum()
print("Null counts")
null_counts[null_counts > 0]

Null counts


Primary streaming service    1
While working                2
Instrumentalist              3
Foreign languages            3
dtype: int64

In [363]:
missing_val_cols = null_counts[null_counts > 0].to_dict().keys()
missing_val_cols

dict_keys(['Primary streaming service', 'While working', 'Instrumentalist', 'Foreign languages'])

In [365]:
for col in missing_val_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

In [367]:
yes_or_no_cols = ['While working', 'Instrumentalist', 'Composer', 'Exploratory', 'Foreign languages']

for col in yes_or_no_cols:
    data[col] = data[col].map({'Yes': 1, 'No': 0})

In [369]:
data = pd.get_dummies(data, columns=['Primary streaming service'], prefix='Streaming Service', drop_first=False)
data = pd.get_dummies(data, columns=['Fav genre'], prefix='Fav Genre', drop_first=False)

In [371]:
data['Frequency [Pop]'].value_counts()

Frequency [Pop]
Very frequently    274
Sometimes          257
Rarely             142
Never               55
Name: count, dtype: int64

In [373]:
frequency_cols = [col for col in data.columns if col.startswith('Frequency')]
frequency_cols

['Frequency [Classical]',
 'Frequency [Country]',
 'Frequency [EDM]',
 'Frequency [Folk]',
 'Frequency [Gospel]',
 'Frequency [Hip hop]',
 'Frequency [Jazz]',
 'Frequency [K pop]',
 'Frequency [Latin]',
 'Frequency [Lofi]',
 'Frequency [Metal]',
 'Frequency [Pop]',
 'Frequency [R&B]',
 'Frequency [Rap]',
 'Frequency [Rock]',
 'Frequency [Video game music]']

In [375]:
frequency_value_map = {
    'Very frequently': 3,
    'Sometimes': 2,
    'Rarely': 1,
    'Never': 0
}

for col in frequency_cols:
    data[col] = data[col].map(frequency_value_map)
data['Frequency [Pop]']

2      1
3      2
4      2
5      3
6      1
      ..
731    3
732    3
733    2
734    0
735    2
Name: Frequency [Pop], Length: 728, dtype: int64

In [377]:
le = LabelEncoder()
data['Music effects'] = le.fit_transform(data['Music effects'])

In [379]:
data.head()

Unnamed: 0,Timestamp,Age,Hours per day,While working,Instrumentalist,Composer,Exploratory,Foreign languages,BPM,Frequency [Classical],...,Fav Genre_Jazz,Fav Genre_K pop,Fav Genre_Latin,Fav Genre_Lofi,Fav Genre_Metal,Fav Genre_Pop,Fav Genre_R&B,Fav Genre_Rap,Fav Genre_Rock,Fav Genre_Video game music
2,8/27/2022 21:28:18,18.0,4.0,0,0,0,0,1,132.0,0,...,False,False,False,False,False,False,False,False,False,True
3,8/27/2022 21:40:40,61.0,2.5,1,0,1,1,1,84.0,2,...,True,False,False,False,False,False,False,False,False,False
4,8/27/2022 21:54:47,18.0,4.0,1,0,0,1,0,107.0,0,...,False,False,False,False,False,False,True,False,False,False
5,8/27/2022 21:56:50,18.0,5.0,1,1,1,1,1,86.0,1,...,True,False,False,False,False,False,False,False,False,False
6,8/27/2022 22:00:29,18.0,3.0,1,1,0,1,1,66.0,2,...,False,False,False,False,False,False,False,False,False,True


In [381]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 728 entries, 2 to 735
Data columns (total 52 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Timestamp                                            728 non-null    object 
 1   Age                                                  728 non-null    float64
 2   Hours per day                                        728 non-null    float64
 3   While working                                        728 non-null    int64  
 4   Instrumentalist                                      728 non-null    int64  
 5   Composer                                             728 non-null    int64  
 6   Exploratory                                          728 non-null    int64  
 7   Foreign languages                                    728 non-null    int64  
 8   BPM                                                  728 non-null    float6

In [383]:
data.describe()

Unnamed: 0,Age,Hours per day,While working,Instrumentalist,Composer,Exploratory,Foreign languages,BPM,Frequency [Classical],Frequency [Country],...,Frequency [Pop],Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects
count,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,...,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0
mean,25.143054,3.591415,0.789835,0.318681,0.171703,0.715659,0.553571,1633624.0,1.336538,0.818681,...,2.03022,1.255495,1.336538,2.074176,1.251374,5.849588,4.803571,3.753434,2.645604,0.278846
std,11.919787,3.038554,0.407706,0.466286,0.377382,0.45141,0.497464,37146690.0,0.991399,0.923532,...,0.934833,1.061051,1.054595,1.033147,1.071706,2.787469,3.024069,3.095676,2.844204,0.498132
min,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,2.0,1.0,0.0,0.0,0.0,0.0,104.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,4.0,2.0,1.0,0.0,0.0
50%,21.0,3.0,1.0,0.0,0.0,1.0,1.0,120.0,1.0,1.0,...,2.0,1.0,1.0,2.0,1.0,6.0,5.0,3.0,2.0,0.0
75%,28.0,5.0,1.0,1.0,0.0,1.0,1.0,140.0,2.0,1.0,...,3.0,2.0,2.0,3.0,2.0,8.0,7.0,6.0,5.0,1.0
max,89.0,24.0,1.0,1.0,1.0,1.0,1.0,1000000000.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,10.0,10.0,10.0,10.0,2.0


In [385]:
null_counts = data.isnull().sum()
print("Null counts")
null_counts[null_counts > 0]

Null counts


Series([], dtype: int64)