In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

np.random.seed(42)
# Take in the dataset
dataset = pd.read_csv('mxmh_survey_results.csv')
dataset

Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,...,Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Permissions
0,8/27/2022 19:29:02,18.0,Spotify,3.0,Yes,Yes,Yes,Latin,Yes,Yes,...,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,,I understand.
1,8/27/2022 19:57:31,63.0,Pandora,1.5,Yes,No,No,Rock,Yes,No,...,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,2.0,1.0,,I understand.
2,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,...,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect,I understand.
3,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,...,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,Improve,I understand.
4,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,...,Very frequently,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve,I understand.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,10/30/2022 14:37:28,17.0,Spotify,2.0,Yes,Yes,No,Rock,Yes,Yes,...,Never,Rarely,Very frequently,Never,7.0,6.0,0.0,9.0,Improve,I understand.
732,11/1/2022 22:26:42,18.0,Spotify,1.0,Yes,Yes,No,Pop,Yes,Yes,...,Never,Never,Sometimes,Sometimes,3.0,2.0,2.0,5.0,Improve,I understand.
733,11/3/2022 23:24:38,19.0,Other streaming service,6.0,Yes,No,Yes,Rap,Yes,No,...,Sometimes,Sometimes,Rarely,Rarely,2.0,2.0,2.0,2.0,Improve,I understand.
734,11/4/2022 17:31:47,19.0,Spotify,5.0,Yes,Yes,No,Classical,No,No,...,Never,Never,Never,Sometimes,2.0,3.0,2.0,1.0,Improve,I understand.


In [2]:
# Drop unnecessary columns
drop_columns = ['Permissions', 'While working', 'Timestamp', 'Instrumentalist', 'Composer', 'Exploratory']
dataset = dataset.drop(columns=drop_columns)

In [3]:
# Handling outliers
z_scores = zscore(dataset['Age'])
outlier_indices = np.abs(z_scores) > 3
dataset = dataset[~outlier_indices]

z_scores = zscore(dataset['Hours per day'])
outlier_indices = np.abs(z_scores) > 3
dataset = dataset[~outlier_indices]

In [4]:
dataset.describe()

Unnamed: 0,Age,Hours per day,BPM,Anxiety,Depression,Insomnia,OCD
count,725.0,726.0,620.0,726.0,726.0,726.0,726.0
mean,25.124138,3.369904,1613026.0,5.839532,4.785124,3.708678,2.62259
std,11.830807,2.454285,40160960.0,2.788537,3.007113,3.068783,2.828205
min,10.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,2.0,100.0,4.0,2.0,1.0,0.0
50%,21.0,3.0,120.0,6.0,5.0,3.0,2.0
75%,28.0,4.0,144.0,8.0,7.0,6.0,4.0
max,80.0,12.0,1000000000.0,10.0,10.0,10.0,10.0


In [5]:
# Handling outliers
def remove_outliers_iqr(data, column):
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

dataset = remove_outliers_iqr(dataset, 'Age')
dataset = remove_outliers_iqr(dataset, 'Hours per day')
dataset = remove_outliers_iqr(dataset, 'BPM')

In [6]:
# Adding a mental health score column
dataset["Mental health score"] = (dataset['Anxiety'] + dataset['Depression'] + dataset['OCD'] + dataset['Insomnia'])

# Convert continuous target variable to discrete categories
bins = [0, 10, 20, 30, 40]
labels = ['Very low', 'Low', 'Moderate', 'High']
dataset['Mental health category'] = pd.cut(dataset['Mental health score'], bins=bins, labels=labels)
dataset

Unnamed: 0,Age,Primary streaming service,Hours per day,Fav genre,Foreign languages,BPM,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],...,Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Mental health score,Mental health category
0,18.0,Spotify,3.0,Latin,Yes,156.0,Rarely,Never,Rarely,Never,...,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,,4.0,Very low
2,18.0,Spotify,4.0,Video game music,Yes,132.0,Never,Never,Very frequently,Never,...,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect,26.0,Moderate
4,18.0,Spotify,4.0,R&B,No,107.0,Never,Never,Rarely,Never,...,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve,23.0,Moderate
5,18.0,Spotify,5.0,Jazz,Yes,86.0,Rarely,Sometimes,Never,Never,...,Very frequently,Very frequently,Never,8.0,8.0,7.0,7.0,Improve,30.0,Moderate
6,18.0,YouTube Music,3.0,Video game music,Yes,66.0,Sometimes,Never,Rarely,Sometimes,...,Never,Never,Sometimes,4.0,8.0,6.0,0.0,Improve,18.0,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,17.0,Spotify,2.0,Rock,Yes,120.0,Very frequently,Rarely,Never,Sometimes,...,Rarely,Very frequently,Never,7.0,6.0,0.0,9.0,Improve,22.0,Moderate
732,18.0,Spotify,1.0,Pop,Yes,160.0,Rarely,Rarely,Never,Never,...,Never,Sometimes,Sometimes,3.0,2.0,2.0,5.0,Improve,12.0,Low
733,19.0,Other streaming service,6.0,Rap,No,120.0,Rarely,Sometimes,Sometimes,Rarely,...,Sometimes,Rarely,Rarely,2.0,2.0,2.0,2.0,Improve,8.0,Very low
734,19.0,Spotify,5.0,Classical,No,170.0,Very frequently,Never,Never,Never,...,Never,Never,Sometimes,2.0,3.0,2.0,1.0,Improve,8.0,Very low


In [7]:
# Convert categorical columns to one hot encoded columns
categorical_columns = dataset.select_dtypes(include=['object']).columns
dataset = pd.get_dummies(dataset, columns=categorical_columns, drop_first=True)

# Define features and target
X = dataset.drop(['Mental health score', 'Mental health category'], axis=1)
y = dataset['Mental health category']

In [8]:
le = LabelEncoder()
y = le.fit_transform(y)

# Split the dataset into train, test and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_valid, y_test, y_valid = train_test_split(X, y, test_size=0.5, random_state=42)
# Splitting test set in half, one half remains in test set and the other goes to the validation set

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    # SimpleImputer replaces all missing values with the median
    ('scaler', StandardScaler())])
    # StandardScaler makes each feature have a standard deviation of 1 and a mean of 0

    # NOTE: StandardScaler does not modify the data, it only transforms it to be easier to read by a Machine Learning model.
    # This means the data sent into the model is the exact same as the dataset, only its been standardized
    # More info on https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])
    # This instance of SimpleImputer replaces missing values with the one that appears most frequently

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, X.select_dtypes(include=['int64', 'float64']).columns),
        ('cat', categorical_transformer, X.select_dtypes(include=['object']).columns)
    ]
)

# Creating pipeline with preprocessing and classifier GaussianNB()
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', GaussianNB())])

# Fit the model
pipeline.fit(X_train, y_train)

In [9]:
# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
accuracy *= 100

print("Testing Set Metrics:")
print("Accuracy: ", accuracy, "%", sep="")

Testing Set Metrics:
Accuracy: 88.62745098039215%


In [10]:
# Make predictions on the validation set
y_pred_valid = pipeline.predict(X_valid)

# Evaluate the model on the validation set
accuracy_valid = accuracy_score(y_valid, y_pred_valid)
accuracy_valid *= 100

print("Validation Set Metrics:")
print("Accuracy: ", accuracy_valid, "%", sep="")

Validation Set Metrics:
Accuracy: 89.41176470588236%


In [11]:
X_test

Unnamed: 0,Age,Hours per day,BPM,Anxiety,Depression,Insomnia,OCD,Primary streaming service_I do not use a streaming service.,Primary streaming service_Other streaming service,Primary streaming service_Pandora,...,Frequency [Rap]_Sometimes,Frequency [Rap]_Very frequently,Frequency [Rock]_Rarely,Frequency [Rock]_Sometimes,Frequency [Rock]_Very frequently,Frequency [Video game music]_Rarely,Frequency [Video game music]_Sometimes,Frequency [Video game music]_Very frequently,Music effects_No effect,Music effects_Worsen
104,20.0,3.0,170.0,6.0,7.0,0.0,0.0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
52,23.0,1.0,140.0,10.0,6.0,3.0,10.0,0,0,0,...,0,1,0,0,1,1,0,0,1,0
435,15.0,6.0,110.0,8.0,8.0,7.0,10.0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
262,21.0,3.0,108.0,0.0,8.0,0.0,5.0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
350,18.0,0.5,66.0,5.0,3.0,1.0,0.0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,32.0,1.0,120.0,3.0,6.0,2.0,0.0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
361,17.0,3.0,104.0,5.0,6.0,9.0,9.0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
481,28.0,1.0,82.0,4.0,8.0,5.0,0.0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
634,20.0,4.0,180.0,5.0,2.0,1.0,6.0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


In [12]:
print(y_test)

# NOTE: 0 - [Zero Category]
#       1 - [Very Low 0-10]
#       2 - [Low 10-20]
#       3 - [Moderate 20-30]
#       4 - [High 30-40]

[1 2 0 1 3 2 3 2 1 2 1 1 2 2 2 2 1 1 2 1 3 1 3 1 2 2 3 2 3 1 3 1 1 3 1 1 1
 1 2 1 1 2 1 0 3 1 2 2 1 1 1 1 1 3 1 1 2 3 1 2 2 1 2 2 2 1 1 2 1 2 3 3 0 1
 1 0 3 1 3 1 1 2 2 1 1 1 2 1 3 2 2 2 1 2 4 1 1 3 1 2 3 2 1 1 2 3 2 2 3 2 2
 1 1 1 2 3 2 1 3 1 1 1 1 3 2 3 2 1 2 2 1 2 1 1 1 3 3 3 3 1 0 2 2 1 3 3 2 1
 1 2 3 3 1 1 3 1 2 0 3 2 1 2 3 3 1 3 3 1 3 2 2 1 1 2 2 2 3 0 1 3 2 3 3 2 3
 1 3 2 1 3 1 1 1 2 2 1 3 1 2 1 3 1 1 1 1 1 1 0 1 2 1 2 1 2 2 1 2 2 1 1 2 4
 3 1 1 1 2 1 3 2 3 2 1 1 2 1 1 1 1 0 1 2 1 1 2 1 1 1 1 1 1 2 1 1 3]


In [13]:
print(y_pred)

# NOTE: 0 - [Zero Category]
#       1 - [Very Low 0-10]
#       2 - [Low 10-20]
#       3 - [Moderate 20-30]
#       4 - [High 30-40]

[1 2 0 1 3 2 1 2 1 2 1 1 2 2 2 2 1 1 1 1 3 1 3 1 2 2 1 1 3 1 3 2 1 3 1 1 1
 1 2 1 1 2 3 0 1 1 2 2 1 1 1 1 1 3 1 1 1 3 1 2 1 1 2 2 1 1 1 2 1 2 3 3 2 1
 1 0 3 1 3 1 1 2 2 1 1 1 2 1 3 2 1 2 1 2 4 1 1 1 1 1 3 2 1 1 2 3 2 2 3 2 2
 1 1 1 2 3 2 1 3 1 1 1 2 3 2 3 2 1 2 2 1 1 1 1 1 3 3 3 1 1 0 2 2 1 3 3 2 1
 1 2 3 1 1 1 3 2 2 0 3 2 1 2 3 3 1 3 3 1 3 2 2 1 1 2 2 1 1 2 1 1 2 1 3 2 3
 1 3 2 1 3 1 1 1 2 2 1 3 1 2 3 1 1 1 1 1 1 1 2 1 2 1 2 1 2 2 1 2 2 1 1 2 4
 3 1 1 1 2 2 3 2 3 2 1 1 2 1 1 1 1 2 1 2 1 1 2 1 1 1 1 1 1 2 1 1 3]


In [14]:
reg_model_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred})
reg_model_diff

Unnamed: 0,Actual value,Predicted value
0,1,1
1,2,2
2,0,0
3,1,1
4,3,3
...,...,...
250,1,1
251,2,2
252,1,1
253,1,1


In [15]:
# Extract the favorite genre columns from the original dataset
favorite_genre_columns = [col for col in dataset.columns if 'Fav genre_' in col]

# Create a DataFrame to store conditional probabilities
conditional_probs = pd.DataFrame(columns=labels, index=favorite_genre_columns)

# Iterate over each favorite genre column
for genre_col in favorite_genre_columns:
    # Iterate over each mental health category
    for category in labels:
        # Filter the dataset for the current mental health category and favorite genre
        subset = dataset[dataset['Mental health category'] == category][genre_col]
        # Calculate the probability of having the current mental health category given the favorite genre
        prob = subset.sum() / len(subset)
        # Store the probability in the DataFrame
        conditional_probs.loc[genre_col, category] = prob

# Display conditional probabilities
print("Conditional Probabilities of Favorite Genre given Mental Health Category:")
print(conditional_probs)


Conditional Probabilities of Favorite Genre given Mental Health Category:
                            Very low       Low  Moderate      High
Fav genre_Country           0.046296  0.046512  0.012195  0.052632
Fav genre_EDM               0.064815   0.04186   0.04878  0.052632
Fav genre_Folk              0.018519  0.065116   0.02439  0.105263
Fav genre_Hip hop           0.009259  0.065116  0.060976  0.052632
Fav genre_Jazz              0.027778  0.023256   0.02439       0.0
Fav genre_K pop             0.064815  0.027907  0.042683  0.052632
Fav genre_Latin             0.009259  0.004651       0.0       0.0
Fav genre_Lofi                   0.0  0.023256  0.018293  0.052632
Fav genre_Metal              0.12963  0.097674  0.146341  0.105263
Fav genre_Pop               0.157407  0.195349  0.140244  0.157895
Fav genre_R&B               0.101852  0.018605  0.054878  0.052632
Fav genre_Rap               0.064815  0.027907   0.02439       0.0
Fav genre_Rock              0.157407   0.24186  0.28048

In [16]:
# Concatenate one-hot encoded genres with mental health score
data_with_mental_health = pd.concat([dataset, dataset['Mental health score']], axis=1)

# Filter dataset to include only columns about if a person likes a specific genre
filtered_columns = ['Fav genre_Country', 'Fav genre_Video game music', 'Fav genre_Metal', 'Fav genre_Pop',
                    'Fav genre_EDM', 'Fav genre_R&B', 'Fav genre_K pop', 'Fav genre_Hip hop',
                    'Fav genre_Folk', 'Fav genre_Jazz','Fav genre_Latin', 'Fav genre_Lofi',
                    'Fav genre_Rap', 'Fav genre_Rock']

# Create a new column representing the combination of favorite music genres
data_with_mental_health['Music Genre Combination'] = data_with_mental_health[filtered_columns].apply(lambda x: '_'.join(map(str, x)), axis=1)

# Making Favorite Genre column for readability
data_with_mental_health['Favorite Genre'] = data_with_mental_health[filtered_columns].idxmax(axis=1).str.replace('Fav genre_', '')

# Group by the new column and calculate the average mental health score
average_mental_health_score = data_with_mental_health.groupby(['Music Genre Combination', 'Favorite Genre'])['Mental health score'].mean().reset_index()

print("Average Mental Health Score for each Favorite Music Genre Combination:\n")
print(average_mental_health_score)

Average Mental Health Score for each Favorite Music Genre Combination:

        Music Genre Combination    Favorite Genre  Mental health score  \
0   0_0_0_0_0_0_0_0_0_0_0_0_0_0           Country            16.571429   
1   0_0_0_0_0_0_0_0_0_0_0_0_0_1              Rock            18.656780   
2   0_0_0_0_0_0_0_0_0_0_0_0_1_0               Rap            14.588235   
3   0_0_0_0_0_0_0_0_0_0_0_1_0_0              Lofi            21.000000   
4   0_0_0_0_0_0_0_0_0_0_1_0_0_0             Latin            10.000000   
5   0_0_0_0_0_0_0_0_0_1_0_0_0_0              Jazz            16.416667   
6   0_0_0_0_0_0_0_0_1_0_0_0_0_0              Folk            17.909091   
7   0_0_0_0_0_0_0_1_0_0_0_0_0_0           Hip hop            18.269231   
8   0_0_0_0_0_0_1_0_0_0_0_0_0_0             K pop            16.142857   
9   0_0_0_0_0_1_0_0_0_0_0_0_0_0               R&B            15.240000   
10  0_0_0_0_1_0_0_0_0_0_0_0_0_0               EDM            15.615385   
11  0_0_0_1_0_0_0_0_0_0_0_0_0_0         