#This script is to test my hypotheses

I am back-generating synthetic ROUGE scores for both my FLAN-T5 (tuned on Olympics dataset), and my baseline gpt2-large (not fine-tuned on any data)

In [7]:
import numpy as np
from scipy.stats import ttest_ind
import pandas as pd

In [2]:
baseline_means = {'ROUGE1': 0.0505, 'ROUGE2': 0.0342, 'ROUGEL': 0.0505}
flan_t5_means = {'ROUGE1': 0.9480, 'ROUGE2': 0.6381, 'ROUGEL': 0.9479}

In [6]:
### setting up the synthetic data

variance = 0.02

np.random.seed(42)
num_samples = 100


baseline_scores = {metric: np.random.normal(mean, np.sqrt(variance), num_samples) for metric, mean in baseline_means.items()}
flan_t5_scores = {metric: np.random.normal(mean, np.sqrt(variance), num_samples) for metric, mean in flan_t5_means.items()}

The actual hypothesis testing! Getting mean, variance, and mean squared error for the two models. Obtaining t-statistic and p-value, saving results with labels.

In [8]:
results = {}

for metric in baseline_means.keys():
  baseline = baseline_scores[metric]
  flan_t5 = flan_t5_scores[metric]

  mean_baseline = np.mean(baseline)
  mean_flan_t5 = np.mean(flan_t5)
  var_baseline = np.var(baseline)
  var_flan_t5 = np.var(flan_t5)
  mse = np.mean((flan_t5 - baseline) ** 2)

  t_stat, p_val = ttest_ind(flan_t5, baseline, alternative='greater')

  results[metric] = {
      'Mean Baseline': mean_baseline,
      'Mean Flan-T5': mean_flan_t5,
      'Variance Baseline': var_baseline,
      'Variance Flan-T5': var_flan_t5,
      'MSE': mse,
      't-statistic': t_stat,
      'p-value': p_val
  }


In [9]:
df_results = pd.DataFrame(results).T
print(df_results)

        Mean Baseline  Mean Flan-T5  Variance Baseline  Variance Flan-T5  \
ROUGE1       0.035814      0.963109           0.016330          0.015476   
ROUGE2       0.037354      0.630180           0.018008          0.022404   
ROUGEL       0.059678      0.931593           0.023278          0.016888   

             MSE  t-statistic        p-value  
ROUGE1  0.897096    51.733951  2.760721e-117  
ROUGE2  0.384190    29.341993   2.532560e-74  
ROUGEL  0.806432    43.287470  3.349077e-103  
