In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts

In [2]:
import sys

sys.path.append('/media/fba/MIG-MusicPerformanceAnalysis-Code/src/data_parse/scripts')

In [3]:
import os

In [4]:
from assessments_scores import organize_assessment as oa

In [5]:
# Target proportions of students in each partition. train_size is used for the
# first train / (val+test) split; val_size and test_size determine how the
# held-out remainder is divided between validation and test.
train_size, val_size, test_size = 0.7, 0.2, 0.1

# Root directory under which the per-year / per-band .npy split files are written.
split_path = '/media/fba/MIG-MusicPerformanceAnalysis-Code/src/split/canonical'

# Passed as random_state to train_test_split so the splits are reproducible.
seed = 42

In [6]:
# Years to process: 2013 through 2018 inclusive.
years = range(2013, 2019)

# Band categories processed for every year.
bands = [
    'middle',
    'concert',
    'symphonic',
]

In [7]:
def map_rare_instruments(inst):
    """Fold a rare instrument label into a related, more common one.

    Parameters
    ----------
    inst : object
        Instrument label (normally a string).

    Returns
    -------
    object
        The replacement label when ``inst`` appears in one of the merge
        groups below; otherwise ``inst`` itself, unchanged.
    """
    # (labels to fold, label they are folded into), checked in order.
    # A Piccolo -> Flute merge is deliberately absent: it only ever existed
    # as commented-out code, so Piccolo is returned unchanged.
    merges = (
        (('Eb Clarinet',), 'Bb Clarinet'),
        (('Bb Contrabass Clarinet', 'Eb Contra Alto Clarinet', 'Bass Clarinet'), 'Bass Clarinet'),
        (('English Horn',), 'Oboe'),
        (('Contrabassoon',), 'Bassoon'),
        (('Piano',), 'Percussion'),
        (('Soprano Sax',), 'Alto Saxophone'),
    )

    for aliases, target in merges:
        if inst in aliases:
            return target

    return inst


In [8]:
def get_score_quantile_by_instrument(df):
    """Rank each student within their instrument by mean normalized score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Student``, ``Instrument`` and
        ``NormalizedScore``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (Student, Instrument) pair with the columns
        ``Student``, ``Instrument``, ``MeanScore`` (mean of the student's
        ``NormalizedScore`` values), ``Quartile`` (1-4, relative to the
        25/50/75 % quantiles of ``MeanScore`` within the instrument) and
        ``Midtile`` (True when ``Quartile`` is 3 or 4).
    """
    students = df[['Student', 'Instrument']].drop_duplicates()

    # Average over all of a student's rows, then attach it to each student.
    student_means = df.groupby('Student')['NormalizedScore'].mean()
    students['MeanScore'] = students['Student'].apply(lambda sid: student_means[sid])

    # Per-instrument cut points at the 25th, 50th and 75th percentiles.
    cut_points = {
        instrument: members['MeanScore'].quantile([0.25, 0.5, 0.75]).values
        for instrument, members in students.groupby('Instrument')
    }

    def quartile_of(instrument, score):
        # First cut point the score does not exceed decides the quartile;
        # anything that passes no comparison (including NaN) lands in 4.
        for rank, cut in enumerate(cut_points[instrument], start=1):
            if score <= cut:
                return rank
        return 4

    students['Quartile'] = students.apply(
        lambda rec: quartile_of(rec.Instrument, rec.MeanScore),
        axis=1,
    )

    # Coarser two-way split: upper half (quartiles 3-4) vs lower half.
    students['Midtile'] = students['Quartile'] > 2

    return students

# dfsi = get_score_quantile_by_instrument(df)

{'Alto Saxophone': [93095, 89614, 91740, 93217, 88880, 87645, 87911, 89354, 90175, 88924, 87551, 86916, 94803, 89023, 91857, 91288, 87646, 89088, 89814, 87223, 89258, 89355, 90180, 92085, 86310, 91732, 87507, 89959, 89749, 92463, 92394, 86627, 92064, 87419, 86346, 89644, 88854, 88799, 93889, 92470, 89101, 86649, 87422, 91774, 91191, 91405, 86931, 87378, 89659, 90904, 92990, 86124, 89048, 91869, 87228, 87964, 89563, 87067, 91297, 86621, 88886, 86915, 93429, 89548, 87939, 88710, 87018, 90573, 87424, 91055, 91770, 91995, 90008, 90176, 87229, 87644, 91861, 90929, 91932, 87132, 92099, 86396, 93840, 95626, 92024, 89764, 88715, 94500, 89880, 93263, 90761, 89071, 92807, 93206, 95082, 88841, 94420, 86307, 89807, 91164], 'Bari Saxophone': [95751, 86889, 90705, 92182, 88038, 90792, 90478, 93240, 88153, 87580, 89816, 87605, 87436, 90766, 89090, 86591], 'Bass Clarinet': [86459, 91317, 87523, 93677, 94093, 92526, 92308, 91840, 87219, 87568, 86166, 91537, 86390, 90005, 94317, 89021, 92043, 90589, 920

In [9]:
# Stratification column sets, tried from most to least fine-grained. A set is
# skipped when train_test_split rejects it (e.g. a stratum with too few
# students), and the next, coarser one is tried.
STRAT_FALLBACKS = [
    ['Instrument', 'Quartile'],
    ['Instrument', 'Midtile'],
    ['InstrumentGroupFine', 'Quartile'],
    ['InstrumentGroupFine', 'Midtile'],
    ['Instrument'],
    ['InstrumentGroupFine'],
]


def _split_with_stratify_fallback(ids, strat_frame, train_fraction, random_state, strategies):
    """Split ``ids`` in two using the first stratification that succeeds.

    Parameters
    ----------
    ids : array-like
        Student ids to split.
    strat_frame : pandas.DataFrame
        Frame with one row per entry of ``ids`` (same order) holding the
        stratification columns.
    train_fraction : float
        Fraction of ``ids`` placed in the first returned part.
    random_state : int
        Seed forwarded to ``train_test_split``.
    strategies : list of list of str
        Column sets of ``strat_frame`` to try, in order.

    Returns
    -------
    (first_part, second_part, strategy)
        The two id arrays and the column set that was actually used.

    Raises
    ------
    RuntimeError
        If every strategy is rejected. Raising here prevents the caller from
        silently reusing ids left over from a previous loop iteration.
    """
    last_error = None
    for strat in strategies:
        try:
            first_part, second_part = tts(
                ids,
                train_size=train_fraction,
                random_state=random_state,
                stratify=strat_frame[strat],
            )
        except ValueError as err:
            # train_test_split raises ValueError when the strata are too
            # small for the requested split; fall back to the next strategy.
            last_error = err
            continue
        return first_part, second_part, strat

    raise RuntimeError('no stratification strategy produced a valid split') from last_error


for year in years:
    for band in bands:

        print(year, band)

        df = oa.read_normalized_csv('/media/fba', year, band)
        dfsi = get_score_quantile_by_instrument(df)

        dfsi['InstrumentGroupFine'] = dfsi['Instrument'].apply(map_rare_instruments)

        inst_grps = dfsi.InstrumentGroupFine.unique().tolist()

        # Student-indexed view, reused for every id -> row lookup below.
        student_rows = dfsi.set_index('Student')

        # Stage 1: train vs. (validation + test).
        train_id, valtest_id, train_valtest_strat_mode = _split_with_stratify_fallback(
            dfsi.Student.values, dfsi, train_size, seed, STRAT_FALLBACKS
        )

        # Stage 2: divide the held-out students into validation and test.
        valdfsi = student_rows.loc[valtest_id]
        val_id, test_id, val_test_strat_mode = _split_with_stratify_fallback(
            valtest_id, valdfsi, val_size / (val_size + test_size), seed, STRAT_FALLBACKS
        )

        print("train:valtest strat:", train_valtest_strat_mode)
        print("val:test strat:", val_test_strat_mode)

        # Map each instrument group to the student ids it contains, per partition.
        train_id_by_inst = student_rows.loc[train_id].groupby('InstrumentGroupFine').groups
        valtest_id_by_inst = student_rows.loc[valtest_id].groupby('InstrumentGroupFine').groups
        val_id_by_inst = student_rows.loc[val_id].groupby('InstrumentGroupFine').groups
        test_id_by_inst = student_rows.loc[test_id].groupby('InstrumentGroupFine').groups

        out_dir = f'{split_path}/{year}/{band}'
        os.makedirs(out_dir, exist_ok=True)

        partitions = [
            ('train', train_id, train_id_by_inst),
            ('valtest', valtest_id, valtest_id_by_inst),
            ('val', val_id, val_id_by_inst),
            ('test', test_id, test_id_by_inst),
        ]

        for part_name, part_ids, part_ids_by_inst in partitions:
            # Full partition.
            np.save(f'{out_dir}/{part_name}-all.npy', part_ids)

            # One file per instrument group present in this partition, holding
            # only that group's student ids (spaces stripped from the name).
            for ig, ig_ids in part_ids_by_inst.items():
                np.save(f'{out_dir}/{part_name}-{ig.replace(" ", "")}.npy', np.asarray(ig_ids))

2013 middle
train:valtest strat: ['Instrument', 'Midtile']
val:test strat: ['Instrument']
2013 concert
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['InstrumentGroupFine', 'Midtile']
2013 symphonic
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['InstrumentGroupFine', 'Midtile']
2014 middle
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['Instrument', 'Midtile']
2014 concert
train:valtest strat: ['Instrument', 'Quartile']
val:test strat: ['Instrument', 'Midtile']
2014 symphonic
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['InstrumentGroupFine', 'Midtile']
2015 middle
train:valtest strat: ['InstrumentGroupFine', 'Midtile']
val:test strat: ['Instrument']
2015 concert
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['InstrumentGroupFine', 'Midtile']
2015 symphonic
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['InstrumentGroupFine