In [None]:
from rsmtool.analysis import compute_metrics
from rsmtool.form_level_scores import compute_form_level_predictions
from rsmtool.tpo_scores import convert_tpo_scores 
from rsmtool.utils import write_experiment_output

In [None]:
# Form-level (speaker-level) evaluation section of the report.
#
# Reads the following names from earlier notebook cells: df_test_metadata,
# df_pred_preproc, df_test_human_scores, use_scaled_predictions,
# second_human_score_column, float_format_func, experiment_id and output_dir.
# It displays the agreement tables and hands the resulting data frames to
# `write_experiment_output`.

# check to make sure we have the information about candidate
skip_section = 'candidate' not in df_test_metadata.columns

if skip_section:
    display(Markdown("Section skipped because the data did not contain a column with candidate ID."))
else:
    display(Markdown('## Form-level evaluations using TOEFLiBT/TPO aggregation rules'))
    display(Markdown("This section shows the form-level (speaker-level) evaluations on the evaluation set. "
                     "Note that for system scores (as elsewhere in this report) 'scale' refers to scores "
                     "that have been re-scaled to the range of human scores. This is different from "
                     "re-scaling the final scores to 1-30 range which is done using a look-up table "
                     "at a later stage"))

    # merge the data and compute form level predictions for system scores
    df_pred_with_metadata = pd.merge(df_pred_preproc, df_test_metadata)
    df_form_level_predictions = compute_form_level_predictions(df_pred_with_metadata, 'TPO_speaking')

    # determine which predictions we are using
    system_score = 'scale' if use_scaled_predictions else 'raw'

    # select speakers with valid numeric score
    # (`.loc` with a boolean mask replaces the `.ix` indexer, which has been
    # removed from pandas)
    has_system_score = df_form_level_predictions[system_score].notnull()
    df_valid_predictions = df_form_level_predictions.loc[has_system_score]

    score_columns = [system_score, 'sc1']

    if not df_valid_predictions.empty:

        # if we have second human score, compute the second form-level score
        if second_human_score_column:
            df_human_scores_with_metadata = pd.merge(df_test_human_scores, df_test_metadata)

            # only select responses with valid sc2
            has_sc2 = df_human_scores_with_metadata['sc2'].notnull()
            df_human_scores_which_have_sc2 = df_human_scores_with_metadata.loc[has_sc2]
            df_human_form_level_predictions = compute_form_level_predictions(df_human_scores_which_have_sc2,
                                                                             'TPO_speaking',
                                                                             human_scores_only=True)
            # select candidates which have form-level score for sc2
            has_form_level_sc2 = df_human_form_level_predictions['sc2'].notnull()
            df_valid_human_predictions = df_human_form_level_predictions.loc[has_form_level_sc2]
        else:
            df_human_form_level_predictions = pd.DataFrame()
            df_valid_human_predictions = pd.DataFrame()

        # if we have valid human predictions, add them to system predictions and compute evaluations
        if not df_valid_human_predictions.empty:

            # left merge: every speaker with a system score is kept, whether or
            # not a form-level sc2 is available for that speaker
            df_valid_human_and_system_predictions = pd.merge(df_valid_predictions,
                                                             df_valid_human_predictions[['candidate', 'sc2']],
                                                             how='left')
            score_columns.append('sc2')
            (df_human_machine_eval,
             _,
             df_human_human_eval) = compute_metrics(df_valid_human_and_system_predictions[score_columns],
                                                    include_second_score=True)
        else:
            (df_human_machine_eval,
             _,
             df_human_human_eval) = compute_metrics(df_valid_predictions[score_columns],
                                                    include_second_score=False)

        df_human_machine_eval.insert(0, 'score', df_human_machine_eval.index)
        df_human_human_eval.insert(0, 'score', ['human'])

        # compute separate evaluation only for speakers who had scores for all six items
        df_clean_predictions = df_valid_predictions.loc[df_valid_predictions['numeric_scores'] == 6]

        # if we have a second score, find speakers who had 6 numeric scores from both raters and add them to
        # system predictions
        # NOTE(review): this branch is keyed on `second_human_score_column`, while
        # the all-data branch above is keyed on `df_valid_human_predictions.empty`.
        # If a second score column is configured but no speaker has a valid
        # form-level sc2, 'sc2' is never appended to `score_columns` and is not
        # passed to `compute_metrics` here - confirm that this is handled.
        if second_human_score_column:
            df_clean_human_predictions = df_valid_human_predictions[df_valid_human_predictions['numeric_scores'] == 6]
            df_clean_human_and_system_predictions = pd.merge(df_clean_predictions,
                                                             df_clean_human_predictions[['candidate', 'sc2']],
                                                             how='left')
            (df_human_machine_clean_eval,
             _,
             df_human_human_clean_eval) = compute_metrics(df_clean_human_and_system_predictions[score_columns],
                                                          include_second_score=True)
        else:
            df_clean_human_and_system_predictions = df_clean_predictions
            (df_human_machine_clean_eval,
             _,
             df_human_human_clean_eval) = compute_metrics(df_clean_human_and_system_predictions[score_columns],
                                                          include_second_score=False)

        # add the label column to the no-imputation evaluation tables
        df_human_machine_clean_eval.insert(0, 'score', df_human_machine_clean_eval.index)
        df_human_human_clean_eval.insert(0, 'score', ['human'])

        # display the results

        markdown_strs = ['### No imputation']
        markdown_strs.append("The following table shows the agreement for all speakers who "
                             "received numeric scores for all 6 items.")

        if second_human_score_column:
            markdown_strs.append('#### Human-human agreement')
            if not df_human_human_clean_eval['N'].isnull().values.all():
                display(Markdown('\n'.join(markdown_strs)))
                display(HTML(df_human_human_clean_eval.to_html(index=False,
                                                               escape=False,
                                                               float_format=float_format_func)))
            else:
                markdown_strs.append("None of the speakers who had 6 numeric system and human scores also had "
                                     " 6 numeric scores from the second rater.")
                display(Markdown('\n'.join(markdown_strs)))

                # set the data frame to empty so that it is not saved
                df_human_human_clean_eval = pd.DataFrame()
        else:
            display(Markdown('\n'.join(markdown_strs)))

        markdown_strs = ['#### Human-system agreement']
        if not df_clean_predictions.empty:
            display(Markdown('\n'.join(markdown_strs)))
            display(HTML(df_human_machine_clean_eval.to_html(index=False,
                                                             escape=False,
                                                             float_format=float_format_func)))
        else:
            markdown_strs.append("None of the speakers had 6 numeric system and human scores")
            # show the explanation; previously the message was built but never
            # displayed, so the subsection silently vanished from the report
            display(Markdown('\n'.join(markdown_strs)))
            # set the data frame to empty so that it is not saved
            df_human_machine_clean_eval = pd.DataFrame()

        markdown_strs = ['### All data']
        markdown_strs.append("The following table shows the agreement for all speakers "
                             " who received 5 or more "
                             "numeric scores. The missing scores were imputed using median "
                             "value of the other 5 scores.")

        if second_human_score_column:
            markdown_strs.append('#### Human-human agreement')
            if not df_human_human_eval['N'].isnull().values.all():
                display(Markdown('\n'.join(markdown_strs)))
                display(HTML(df_human_human_eval.to_html(index=False,
                                                         escape=False,
                                                         float_format=float_format_func)))
            else:
                markdown_strs.append("None of the speakers who had 5 or more numeric system and human scores also "
                                     " had 5 or more numeric scores from the second rater.")
                display(Markdown('\n'.join(markdown_strs)))
                # set the data frame to empty so that it is not saved
                df_human_human_eval = pd.DataFrame()
        else:
            display(Markdown('\n'.join(markdown_strs)))

        markdown_strs = ['#### Human-system agreement']
        display(Markdown('\n'.join(markdown_strs)))
        display(HTML(df_human_machine_eval.to_html(index=False,
                                                   escape=False,
                                                   float_format=float_format_func)))

        # write out the outputs; each data frame is paired by position with
        # the output name in the second list
        write_experiment_output([df_form_level_predictions,
                                 df_human_form_level_predictions,
                                 df_human_human_clean_eval,
                                 df_human_machine_clean_eval,
                                 df_human_human_eval,
                                 df_human_machine_eval],
                                ['pred_form_level',
                                 'test_human_scores_form_level',
                                 'consistency_form_level_no_imputation',
                                 'eval_form_level_no_imputation',
                                 'consistency_form_level',
                                 'eval_form_level'],
                                experiment_id,
                                output_dir)
    else:
        display(Markdown("None of the speakers had 5 or more numeric system and human scores necessary to "
                         "compute form-level scores") )
        # no evaluations could be computed, so only the form-level predictions are written out
        write_experiment_output([df_form_level_predictions],
                                ['pred_form_level'],
                                experiment_id,
                                output_dir)
         
    