# 1. How similarly do listeners describe a performance of a piece?

Probably the first question that arises concerns the similarity of the descriptions in the dataset, i.e., whether there are commonalities in the way listeners describe and like performances.

In this notebook we present a series of analyses that provide different perspectives on the data.


In [1]:
# import matplotlib
# matplotlib.use('Qt5Agg')
import os

import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
import pandas as pd
import scipy.stats as spstats

# from scripts.utils import davies_bouldin
from stat_utils import LinearModel, standardize, cod, cohens_f2_from_data, correlation, f_oneway

## Complexity of the Descriptions

An interesting question is whether there is a relation between the musical background of the listeners and the complexity of their answers.
In particular, we use the Dale-Chall readability score to estimate how complex each answer is.

Answers that are a single (common) adjective are rated with a low score (meaning that they would be easily understood by an average 4th-grade student), and answers that are longer or use technical (i.e., musical) terms have a higher score.

The following plot is a histogram of the values of the readability score. From this plot we can see that this is a bi-modal distribution, meaning that the answers tend to either consist of simple (i.e., common) adjectives, or be longer and contain specialized vocabulary.

In [None]:
# Per-answer readability score, read from the answers table.
# NOTE(review): `df_biggame` is not created in any cell visible above this
# one -- confirm it is loaded before this cell on a fresh kernel.
readability = df_biggame['answer_readability']
# Pin the scores to [4, 10], the range the original comment attributes to
# Dale and Chall (1948) as the meaningful one, before plotting.
readability_adj = np.clip(readability, 4, 10)

# Distribution of the clipped scores over all answers, in 10 bins.
fig, axe = plt.subplots()
axe.hist(readability_adj, bins=10)
axe.set(xlabel='Readability score', ylabel='Number of answers')
plt.show()

In [None]:
# Show the answers with the lowest and the highest readability scores.
#
# The ranking is done on plain positional arrays: `Series.argsort()` returns
# positions, and feeding those back through `Series[...]` would look them up
# as index *labels*, which is only correct for a default RangeIndex.
scores = readability.to_numpy(dtype=float)
answers = df_biggame['answer'].to_numpy()

# Stable sort so that ties keep their original order (reproducible output).
ranking = np.argsort(scores, kind='stable')
# NaN scores sort to the end; drop them so they are not reported as "hardest".
ranking = ranking[~np.isnan(scores[ranking])]

n_examples = 5
easy_to_read = answers[ranking[:n_examples]]
hard_to_read = answers[ranking[::-1][:n_examples]]

print('Mean Readability Score: {0:.1f}'.format(readability.mean()))
print('Std Readability Score: {0:.1f}'.format(readability.std()))
print('Easiest to Read:')
print(easy_to_read)
print('Hardest to read')
print(hard_to_read)

In [None]:
# Distinct participant ids (np.unique returns them sorted) together with the
# number of answers each participant contributed.
participant_column = df_biggame['participant_id']
unique_participant_ids, answers_per_participant = np.unique(participant_column, return_counts=True)

# For every participant, the row positions of their answers in df_biggame.
unique_participant_idxs = [
    np.flatnonzero(participant_column == pid) for pid in unique_participant_ids
]

In [None]:
# Median readability score of each participant's answers.
#
# `unique_participant_idxs` holds row *positions* (from np.where), so index a
# plain array: `readability[uix]` on the Series would treat them as index
# labels and silently pick the wrong rows if the index is not 0..n-1.
readability_values = np.asarray(readability)
readability_per_participant = np.array(
    [np.median(readability_values[uix]) for uix in unique_participant_idxs]
)

fig, ax = plt.subplots()
ax.hist(readability_per_participant)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('Median readability score per participant')
ax.set_ylabel('Number of participants')
plt.show()

In [None]:
# Load the participant questionnaire and align three of its columns with the
# order of `unique_participant_ids`.
participant_info = pd.read_csv('./data/participants_profiles.csv')
print(participant_info.columns)

# One slot per participant found in the answers table. Slots stay at 0 for
# participants without a profile row, and missing (NaN) entries are turned
# into 0 by np.nan_to_num.
n_participants = len(unique_participant_ids)
education_per_participant = np.zeros(n_participants)
playing_piano_per_participant = np.zeros(n_participants)
listening_classical_music_per_participant = np.zeros(n_participants)

profile_rows = zip(
    participant_info.participant_id,
    participant_info.music_education_years,
    participant_info.playing_piano,
    participant_info.listening_to_classical_music,
)
for participant, years, piano, listening in profile_rows:
    # Empty when the profile belongs to someone with no answers; the
    # assignments below are then no-ops.
    slot = np.flatnonzero(unique_participant_ids == participant)
    education_per_participant[slot] = np.nan_to_num(years)
    playing_piano_per_participant[slot] = np.nan_to_num(piano)
    listening_classical_music_per_participant[slot] = np.nan_to_num(listening)

In [None]:
# Regress per-participant readability on years of musical education.
x = education_per_participant
y = readability_per_participant

# Keep only participants whose x lies below mean + 2.5 standard deviations.
# Despite its name, `outlier_idxs` holds the positions that are KEPT.
upper_bound = x.mean() + 2.5 * x.std()
outlier_idxs = np.flatnonzero(x < upper_bound)
x, y = x[outlier_idxs], y[outlier_idxs]
# print(outlier_idxs)

lm = LinearModel()
stats, _, y_hat = lm.test(
    x, y,
    hypothesis_type=['gt', 'equal'],
    test_type='wald',
    significance_level=0.05,
    return_preds=True,
)
print('R2:', cod(y, y_hat), correlation(y, y_hat), correlation(x, y))
# One line per model parameter: the parameter, its test result, and whether
# the null hypothesis was rejected.
for param, stat in zip(lm.params, stats):
    print(param, stat, stat.reject_h0)

# Data points with the model predictions drawn on top.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.plot(x, y_hat, c='black', linewidth=2)
ax.set(xlabel='Years of Musical Education', ylabel='Dale-Chall readability score')
plt.show()

In [None]:
# One-way ANOVA of readability across three bins of musical education.
# `x` and `y` are the filtered education / readability arrays left behind by
# the education regression cell.
#
# The bins must be DISJOINT: one-way ANOVA assumes independent groups, so the
# middle bin stops at 10 years. With `x >= 5` alone, everyone with more than
# 10 years would be counted in two groups.
bin_labels = ['< 5', '5-10', '> 10']
groups = [
    y[x < 5],
    y[np.logical_and(x >= 5, x <= 10)],
    y[x > 10],
]

anova = f_oneway(*groups)
print(anova)

fig, ax = plt.subplots()
ax.violinplot(groups, showmedians=True)
# violinplot places the bodies at positions 1..N by default.
ax.set_xticks(range(1, len(groups) + 1))
ax.set_xticklabels(bin_labels)
ax.set_xlabel('Years of Musical Education')
ax.set_ylabel('Dale-Chall readability score')
plt.show()

In [None]:
# Regress per-participant readability on piano-playing experience.
x = playing_piano_per_participant
y = readability_per_participant

# Keep only participants whose x lies below mean + 2.5 standard deviations.
# Despite its name, `outlier_idxs` holds the positions that are KEPT.
cutoff = x.mean() + 2.5 * x.std()
outlier_idxs = np.flatnonzero(x < cutoff)
x, y = x[outlier_idxs], y[outlier_idxs]
# print(outlier_idxs)

lm = LinearModel()
stats, _, y_hat = lm.test(
    x, y,
    hypothesis_type=['gt', 'equal'],
    test_type='wald',
    significance_level=0.05,
    return_preds=True,
)
print('R2:', cod(y, y_hat), correlation(y, y_hat))
# One line per model parameter: the parameter, its test result, and whether
# the null hypothesis was rejected.
for param, stat in zip(lm.params, stats):
    print(param, stat, stat.reject_h0)

# Data points with the model predictions drawn on top.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.plot(x, y_hat, c='black', linewidth=2)
ax.set(xlabel='Experience Playing Piano', ylabel='Dale-Chall readability score')
plt.show()

In [None]:
# Regress per-participant readability on how often the participant listens
# to classical music, and additionally report the Spearman rank correlation.
#
# `spstats` is scipy.stats; it must be imported in the notebook's import cell
# (that import was commented out, which made this cell fail with a NameError
# on a fresh kernel).
lm = LinearModel()

y = readability_per_participant
x = listening_classical_music_per_participant

# Keep only participants whose x lies below mean + 2.5 standard deviations.
# Despite its name, `outlier_idxs` holds the positions that are KEPT.
outlier_idxs = np.where(x < x.mean() + 2.5 * x.std())[0]
x = x[outlier_idxs]
y = y[outlier_idxs]
# print(outlier_idxs)
stats, _, y_hat = lm.test(x, y,
                          hypothesis_type=['gt', 'equal'], test_type='wald',
                          return_preds=True, significance_level=0.05)

# Rank correlation between the listening variable and readability.
spearman = spstats.spearmanr(x, y)
print('R2:', cod(y, y_hat), correlation(y, y_hat), spearman)
# One line per model parameter: the parameter, its test result, and whether
# the null hypothesis was rejected.
for p, st in zip(lm.params, stats):
    print(p, st, st.reject_h0)

# Data points with the model predictions drawn on top.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.plot(x, y_hat, c='black', linewidth=2)
ax.set_ylabel('Dale-Chall readability score')
ax.set_xlabel('Listening Classical Music')
plt.show()

In [None]:
 listening_options =['Never',
 'Very rarely',
 'Rarely',
 'Occasionally',
 'Frequently',
 'Very frequently']
    
groups = [y[x<2], y[np.logical_and(x>=2, x<4)], y[x>=4]]

print(f_oneway(*groups))

fig, ax = plt.subplots()
ax.violinplot(groups, showmedians=True)
pos = [1, 2, 3]
ax.set_xticks(pos)
ax.set_xticklabels(['Never/very rarely',
                   'Occasionally',
                   'Frequently'])
ax.set_ylabel('Dale-Chall readability score')
ax.set_xlabel('Listening to classical music')
plt.show()
# participant_info.listening_to_classical_music


In [None]:
# Readability of the participants whose listening value is above 4 AND who
# report more than 10 years of musical education (shown as the cell output).
listens_a_lot = listening_classical_music_per_participant > 4
long_education = education_per_participant > 10
readability_per_participant[listens_a_lot & long_education]
               
# playing_piano_per_participant 
# education_per_participant 
# listening_classical_music_per_participant 
