In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts

In [5]:
import sys

# Put the project's data_parse scripts directory on the module search path.
# NOTE(review): presumably this is where `assessments_scores` (imported in a
# later cell) lives -- confirm. The path is absolute and machine-specific.
sys.path.append('/media/fba/MIG-MusicPerformanceAnalysis-Code/src/data_parse/scripts')

In [30]:
import os

In [11]:
from assessments_scores import organize_assessment as oa

In [28]:
# Target fractions of students per split. The split loop first carves off
# `train_size` for training, then divides the remainder between validation and
# test using the ratio val_size / (val_size + test_size).
train_size = 0.7
val_size = 0.2
test_size = 0.1

# Root directory for the saved split files; the split loop writes to
# {split_path}/{year}/{band}/{train,valtest,val,test}.npy.
split_path = '/media/fba/MIG-MusicPerformanceAnalysis-Code/src/split/canonical'

# Passed as `random_state` to every train_test_split call so splits are repeatable.
seed = 42

In [26]:
# Every (year, band) combination below gets its own split: years 2013 through
# 2018 inclusive, crossed with the three band names.
years = range(2013, 2018+1)
bands = ['middle', 'concert', 'symphonic']

In [76]:
def map_rare_instruments(inst):
    """Collapse a rare instrument name onto a more common stand-in.

    Parameters
    ----------
    inst : object
        Instrument name as it appears in the data.

    Returns
    -------
    object
        The replacement name when ``inst`` matches one of the merge rules
        below, otherwise ``inst`` itself, unchanged.

    Notes
    -----
    A ``Piccolo -> Flute`` rule existed previously but is disabled, so
    ``'Piccolo'`` is returned as-is.
    """
    # (names to merge, name they are merged into); the first matching rule wins.
    merge_rules = (
        (('Eb Clarinet',), 'Bb Clarinet'),
        (('Bb Contrabass Clarinet', 'Eb Contra Alto Clarinet', 'Bass Clarinet'), 'Bass Clarinet'),
        (('English Horn',), 'Oboe'),
        (('Contrabassoon',), 'Bassoon'),
        (('Piano',), 'Percussion'),
        (('Soprano Sax',), 'Alto Saxophone'),
    )

    for rare_names, common_name in merge_rules:
        if inst in rare_names:
            return common_name

    # Not a rare instrument: keep the original label.
    return inst


In [122]:
def get_score_quantile_by_instrument(df):
    """Rank each student within their instrument by mean normalized score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Student``, ``Instrument`` and
        ``NormalizedScore``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (Student, Instrument) pair, with the columns:

        * ``MeanScore`` -- mean of ``NormalizedScore`` over *all* rows of
          that student (the mean is grouped by student only, not by
          instrument).
        * ``Quartile`` -- 1..4, from comparing ``MeanScore`` against the
          25/50/75 % quantiles of ``MeanScore`` within the same instrument;
          a score equal to a cut point falls in the lower bucket.
        * ``Midtile`` -- ``True`` when ``Quartile`` is 3 or 4.
    """
    students = df[['Student', 'Instrument']].drop_duplicates()

    per_student_mean = df.groupby('Student')['NormalizedScore'].mean()
    students['MeanScore'] = [per_student_mean[sid] for sid in students['Student']]

    # Instrument -> array of the three quartile boundaries for that instrument.
    cut_points = {
        name: members['MeanScore'].quantile([0.25, 0.5, 0.75]).values
        for name, members in students.groupby('Instrument')
    }

    def quartile_of(inst, score):
        # The first boundary the score does not exceed decides the bucket;
        # anything above the 75 % boundary lands in the top quartile.
        for rank, bound in enumerate(cut_points[inst], start=1):
            if score <= bound:
                return rank
        return 4

    students['Quartile'] = [
        quartile_of(inst, score)
        for inst, score in zip(students['Instrument'], students['MeanScore'])
    ]

    # Coarser two-way split: upper half (quartiles 3 and 4) versus lower half.
    students['Midtile'] = students['Quartile'] > 2

    return students

# dfsi = get_score_quantile_by_instrument(df)

In [126]:
# Stratification schemes, tried in this order until train_test_split accepts
# one. Both split stages (train vs. val+test, then val vs. test) share the list.
STRATIFY_FALLBACKS = [
    ['Instrument', 'Quartile'],
    ['Instrument', 'Midtile'],
    ['InstrumentGroupFine', 'Quartile'],
    ['InstrumentGroupFine', 'Midtile'],
    ['Instrument'],
    ['InstrumentGroupFine'],
]


def _split_with_fallback(ids, frame, train_fraction, random_state):
    """Split ``ids`` in two, using the first stratification scheme that works.

    Parameters
    ----------
    ids : array-like
        Identifiers to split; must be row-aligned with ``frame``.
    frame : pandas.DataFrame
        Holds the stratification columns named in ``STRATIFY_FALLBACKS``.
    train_fraction : float
        Fraction of ``ids`` assigned to the first returned part.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(first_part, second_part, strat)`` where ``strat`` is the list of
        column names that was actually used for stratification.

    Raises
    ------
    RuntimeError
        If every scheme is rejected. Raising here keeps a failed split from
        silently reusing arrays left over from a previous year/band.
    """
    last_error = None
    for strat in STRATIFY_FALLBACKS:
        try:
            first_part, second_part = tts(
                ids,
                train_size=train_fraction,
                random_state=random_state,
                stratify=frame[strat],
            )
        except ValueError as err:
            # train_test_split signals an infeasible stratification (e.g. a
            # class with too few members) with ValueError; try a coarser one.
            last_error = err
            continue
        return first_part, second_part, strat

    raise RuntimeError(
        'no stratification scheme in STRATIFY_FALLBACKS produced a valid split'
    ) from last_error


for year in years:
    for band in bands:

        print(year, band)

        df = oa.read_normalized_csv('/media/fba', year, band)
        dfsi = get_score_quantile_by_instrument(df)

        dfsi['InstrumentGroupFine'] = dfsi['Instrument'].apply(map_rare_instruments)

        # Stage 1: train vs. (validation + test).
        train_id, valtest_id, train_valtest_strat_mode = _split_with_fallback(
            dfsi.Student.values, dfsi, train_size, seed
        )

        # Stage 2: re-split the held-out students into validation and test.
        # NOTE(review): if a student appears under more than one instrument,
        # this lookup returns more rows than ids and the split is rejected;
        # the same student could also land on both sides of stage 1 -- confirm
        # that Student is unique in dfsi.
        valdfsi = dfsi.set_index('Student').loc[valtest_id]
        val_id, test_id, val_test_strat_mode = _split_with_fallback(
            valtest_id, valdfsi, val_size / (val_size + test_size), seed
        )

        print("train:valtest strat:", train_valtest_strat_mode)
        print("val:test strat:", val_test_strat_mode)

        os.makedirs(f'{split_path}/{year}/{band}', exist_ok=True)

        np.save(f'{split_path}/{year}/{band}/train.npy', train_id)
        np.save(f'{split_path}/{year}/{band}/valtest.npy', valtest_id)
        np.save(f'{split_path}/{year}/{band}/val.npy', val_id)
        np.save(f'{split_path}/{year}/{band}/test.npy', test_id)

2013 middle
train:valtest strat: ['Instrument', 'Midtile']
val:test strat: ['Instrument']
2013 concert
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['InstrumentGroupFine', 'Midtile']
2013 symphonic
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['InstrumentGroupFine', 'Midtile']
2014 middle
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['Instrument', 'Midtile']
2014 concert
train:valtest strat: ['Instrument', 'Quartile']
val:test strat: ['Instrument', 'Midtile']
2014 symphonic
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['InstrumentGroupFine', 'Midtile']
2015 middle
train:valtest strat: ['InstrumentGroupFine', 'Midtile']
val:test strat: ['Instrument']
2015 concert
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['InstrumentGroupFine', 'Midtile']
2015 symphonic
train:valtest strat: ['InstrumentGroupFine', 'Quartile']
val:test strat: ['InstrumentGroupFine