In [40]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

In [41]:
# Load one prediction workbook per model; the first sheet column becomes the index.
df_ds, df_llama, df_gemma = (
    pd.read_excel(workbook, index_col=0)
    for workbook in (
        'final_predictions/deepseek14_all_pred.xlsx',
        'final_predictions/llama8_all_pred.xlsx',
        'final_predictions/gemma12_all_pred.xlsx',
    )
)

In [42]:
# Every column of the DeepSeek sheet whose name starts with 'label'.
col_labels = list(filter(lambda name: name.startswith('label'), df_ds.columns))

In [43]:
def merge_preds(df_ds, df_llama, df_gemma, task, column_label, column_real):
    """Align the three models' predictions for one task in a single frame.

    Each input frame is restricted to rows whose 'Task' equals ``task`` and to
    the columns 'RequirementText', ``column_real`` and ``column_label``. The
    three slices are inner-joined on ('RequirementText', ``column_real``).

    Returns a DataFrame with the two join keys plus the prediction column of
    each model, named ``column_label`` + '_DS', '_LM' and '_GM' for
    ``df_ds``, ``df_llama`` and ``df_gemma`` respectively.
    """
    join_keys = ['RequirementText', column_real]
    wanted = join_keys + [column_label]

    def _task_slice(frame):
        # Rows of the requested task, reduced to the join keys and the prediction.
        return frame.loc[frame['Task'] == task, wanted]

    ds_with_llama = _task_slice(df_ds).merge(
        _task_slice(df_llama), how='inner', on=join_keys, suffixes=('_DS', '_LM')
    )
    # After the first merge no column is called `column_label` any more, so the
    # Gemma column arrives unsuffixed and is tagged explicitly afterwards.
    all_models = ds_with_llama.merge(_task_slice(df_gemma), how='inner', on=join_keys)

    return all_models.rename(columns={column_label: column_label + '_GM'})

In [None]:
# Score a three-model majority vote for every dataset / task / label-column
# combination and collect one record per combination and metric.
majority_votes = {}
all_scores = {'f1': [], 'precision': [], 'recall': [], 'specificity': []}

for dataset in ['l', 'p', 'rds', 'o']:
    # The per-dataset slices do not depend on task or size, so build them once.
    ds_rows = df_ds[df_ds['Dataset'] == dataset]
    llama_rows = df_llama[df_llama['Dataset'] == dataset]
    gemma_rows = df_gemma[df_gemma['Dataset'] == dataset]

    for task in ['f', 'q']:
        # Ground-truth column for this task (anything other than 'f' maps to IsQuality).
        column_real = 'IsFunctional' if task == 'f' else 'IsQuality'

        for i in [1, 2, 4, 8, 16, 32, 64]:
            column_label = 'label_' + str(i)
            df1 = merge_preds(ds_rows, llama_rows, gemma_rows, task, column_label, column_real)

            # Row-wise mode over the three model columns = majority vote;
            # column 0 of the mode frame holds the (first) most frequent value.
            vote_columns = [column_label + m_name for m_name in ['_DS', '_LM', '_GM']]
            pred = df1[vote_columns].mode(axis=1)[0].to_list()
            # NOTE(review): keyed by label column only, so each dataset/task pass
            # overwrites the previous one and only the last pass survives. Key kept
            # as-is in case later cells read it; confirm whether
            # (dataset, task, column_label) was intended.
            majority_votes[column_label] = pred

            # Pin the label set so the matrix is always 2x2. Without it a split in
            # which only one class occurs yields a 1x1 matrix and the unpacking fails.
            # Assumes labels are encoded as 0 (negative) / 1 (positive) -- TODO confirm.
            tn, fp, fn, tp = confusion_matrix(df1[column_real], pred, labels=[0, 1]).ravel()

            # Specificity is undefined when there are no true negatives at all.
            negatives = tn + fp
            specificity = tn / negatives if negatives else np.nan

            precision = precision_score(df1[column_real], pred)
            recall = recall_score(df1[column_real], pred)
            f1 = f1_score(df1[column_real], pred)

            all_scores['specificity'].append({'task': task, 'size': i, 'dataset': dataset, 'score': specificity})
            all_scores['f1'].append({'size': i, 'task': task, 'score': f1, 'dataset': dataset})
            all_scores['precision'].append({'size': i, 'task': task, 'score': precision, 'dataset': dataset})
            all_scores['recall'].append({'size': i, 'task': task, 'score': recall, 'dataset': dataset})

In [45]:
# One score frame per metric, each tagged with the constant model name 'ensemble'.
maj_f1, maj_precision, maj_recall, maj_spec = (
    pd.DataFrame(all_scores[metric]).assign(model='ensemble')
    for metric in ('f1', 'precision', 'recall', 'specificity')
)

In [47]:
# Load the stored score workbooks, one per metric; the first sheet column becomes the index.
df_precision, df_recall, df_f1, df_spec = (
    pd.read_excel(workbook, index_col=[0])
    for workbook in (
        'final_predictions/all_precision_score.xlsx',
        'final_predictions/all_recall_score.xlsx',
        'final_predictions/all_f1_score.xlsx',
        'final_predictions/all_specificity_score.xlsx',
    )
)

In [None]:
def get_metrics_table(df, df_maj, task = 'q'):
    """Build a size-by-model table of mean and std scores for one task.

    Parameters
    ----------
    df : DataFrame
        Score records; must contain every column present in ``df_maj``
        (extra columns are dropped before the two frames are stacked).
    df_maj : DataFrame
        Score records to append to ``df``. Needs at least the columns
        'size', 'model', 'score' and 'task'.
    task : str, default 'q'
        Only rows whose 'task' equals this value are aggregated.

    Returns
    -------
    DataFrame
        Indexed by 'size', with two-level columns (model, statistic) where
        statistic is 'mean' or 'std', rounded to 3 decimals and sorted by
        model name.
    """
    combined = pd.concat([df[df_maj.columns], df_maj])
    task_rows = combined[combined['task'] == task]

    # A list (not a set) keeps the aggregate column order deterministic.
    stats = (
        task_rows.groupby(['size', 'model'])['score']
        .agg(['mean', 'std'])
        .round(3)
        .reset_index()
    )

    table = stats.pivot(index='size', columns='model', values=['mean', 'std'])
    # pivot yields (statistic, model); flip to (model, statistic) so each
    # model's mean and std sit next to each other.
    table.columns = table.columns.swaplevel(0, 1)
    return table.sort_index(axis=1, level=0)


In [81]:
# Export column layout: (model, statistic) pairs, mean before std for each model.
column_order = [
    (model, stat)
    for model in ('deepseek14', 'gemma12', 'llama8', 'ensemble')
    for stat in ('mean', 'std')
]

In [51]:
# Specificity per size and model for the 'q' task (the function's default), ensemble rows included.
table_specificity = get_metrics_table(df=df_spec, df_maj=maj_spec, task='q')

In [53]:
# Precision per size and model for the 'q' task (the function's default), ensemble rows included.
table_precision = get_metrics_table(df=df_precision, df_maj=maj_precision, task='q')

In [57]:
# Recall per size and model for the 'q' task (the function's default), ensemble rows included.
table_recall = get_metrics_table(df=df_recall, df_maj=maj_recall, task='q')

In [84]:
# Write each metric table to its own sheet of one workbook, columns in `column_order`.
with pd.ExcelWriter('tables_compare_ensemble.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, sheet_table in (
        ('Specificity', table_specificity),
        ('Precision', table_precision),
        ('Recall', table_recall),
    ):
        sheet_table[column_order].to_excel(writer, sheet_name=sheet_name)
