In [1]:
import time

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm 
from datasets import load_dataset

from sentence_transformers import SentenceTransformer

import seaborn as sns
sns.set(style='darkgrid')
sns.set_palette(sns.color_palette('husl', 8))

import matplotlib.pyplot as plt
%matplotlib inline

params = {
    'legend.fontsize': 'x-large',
    'figure.figsize': (16, 5),
    'axes.labelsize': 'x-large',
    'axes.titlesize':'x-large',
    'xtick.labelsize':'x-large',
    'ytick.labelsize':'x-large',
}
plt.rcParams.update(params)

from src.loading import huggingface_dataset_to_dataframes
from src.preprocessing import Preprocessor
from src.feature_engineering import FeatureEngineering
from src.bias import Factors

from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

%load_ext autoreload
%autoreload 2

In [24]:
pd.set_option('display.max_columns', None)

In [2]:
# 1. Load human learning datasets
test_axis, validation_axis = (
    pd.read_pickle("data/raw/test_axis.pkl"),
    pd.read_pickle("data/raw/validation_axis.pkl"),
)
train_comparisons, validation_comparisons = (
    pd.read_pickle("data/raw/train_comparisons.pkl").iloc[:6000],
    pd.read_pickle("data/raw/validation_comparisons.pkl").iloc[:6000],
)

In [3]:
# 2. Pre-processing
preprocessor = Preprocessor()

train_comparisons = preprocessor.preprocessing_pipeline(train_comparisons)
validation_comparisons = preprocessor.preprocessing_pipeline(validation_comparisons)

test_axis = preprocessor.preprocessing_pipeline(test_axis)
validation_axis = preprocessor.preprocessing_pipeline(validation_axis)

In [4]:
start_time = time.time()

# 3. Feature engineering
fe = FeatureEngineering()
encoder = SentenceTransformer('all-MiniLM-L6-v2')

train_comparisons = fe.fe_pipeline(train_comparisons, "comparisons", encoder) 
validation_comparisons = fe.fe_pipeline(validation_comparisons, "comparisons", encoder)
                                                
test_axis = fe.fe_pipeline(test_axis, "axis", encoder) 
validation_axis = fe.fe_pipeline(validation_axis, "axis", encoder)

print("--- %s seconds ---" % (time.time() - start_time))



Batches:   0%|          | 0/94 [00:00<?, ?it/s]

Batches:   0%|          | 0/94 [00:00<?, ?it/s]

Batches:   0%|          | 0/94 [00:00<?, ?it/s]

Batches:   0%|          | 0/94 [00:00<?, ?it/s]

Batches:   0%|          | 0/94 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/94 [00:00<?, ?it/s]

Batches:   0%|          | 0/94 [00:00<?, ?it/s]

Batches:   0%|          | 0/99 [00:00<?, ?it/s]

Batches:   0%|          | 0/99 [00:00<?, ?it/s]

Batches:   0%|          | 0/99 [00:00<?, ?it/s]

Batches:   0%|          | 0/135 [00:00<?, ?it/s]

Batches:   0%|          | 0/134 [00:00<?, ?it/s]

Batches:   0%|          | 0/135 [00:00<?, ?it/s]

--- 293.6947798728943 seconds ---


In [40]:
train_comparisons

Unnamed: 0,worker,batch,split,id,source,subsource,title,text,summary_0,policy_0,note_0,summary_1,policy_1,note_1,confidence,choice,ref_summary,m0_rouge_1_f,m0_rouge_2_f,m0_rouge_l_f,m1_rouge_1_f,m1_rouge_2_f,m1_rouge_l_f,m0_bleu,m1_bleu,m0_flesch_reading_ease,m0_syllable_count,m0_lexicon_count,m0_sentence_count,m0_char_count,m0_letter_count,m0_polysyllab_count,m0_monosyllab_count,m1_flesch_reading_ease,m1_syllable_count,m1_lexicon_count,m1_sentence_count,m1_char_count,m1_letter_count,m1_polysyllab_count,m1_monosyllab_count,m0_compression_ratio,m0_jaccard_similarity_1,m0_jaccard_similarity_2,m1_compression_ratio,m1_jaccard_similarity_1,m1_jaccard_similarity_2,m0_text_summary_xfmr_similarity,m1_text_summary_xfmr_similarity,m0_ref_summary_xfmr_similarity,m1_ref_summary_xfmr_similarity
0,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,Mum is mad at me for not flying on my own trip...,sup1,,I have made sure my mother is comfortable with...,sup1,,,1,mum isn't speaking to me because I booked a fl...,0.363636,0.181818,0.303030,0.368421,0.100000,0.210526,0.057345,0.046340,97.54,17.0,16.0,1.0,53.0,52.0,0.0,15.0,71.48,32.0,25.0,1.0,102.0,101.0,2.0,20.0,0.051750,0.107914,0.030418,0.095890,0.111888,0.029630,0.715528,0.696612,0.770994,0.665529
1,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,I have made sure my mother is comfortable with...,sup1,,mum isn't speaking to me because I booked a fl...,ref,,,1,mum isn't speaking to me because I booked a fl...,0.368421,0.100000,0.210526,1.000000,1.000000,1.000000,0.046340,1.000000,71.48,32.0,25.0,1.0,102.0,101.0,2.0,20.0,77.57,24.0,19.0,1.0,74.0,71.0,0.0,14.0,0.095890,0.111888,0.029630,0.070015,0.136691,0.045455,0.696612,0.640241,0.665529,1.000000
2,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,mum isn't speaking to me because I booked a fl...,ref,,Mum thought I was going to road trip with my b...,sup1,,,0,mum isn't speaking to me because I booked a fl...,1.000000,1.000000,1.000000,0.325581,0.000000,0.279070,1.000000,0.008724,77.57,24.0,19.0,1.0,74.0,71.0,0.0,14.0,97.20,38.0,32.0,4.0,136.0,132.0,1.0,27.0,0.070015,0.136691,0.045455,0.127093,0.154930,0.039568,0.640241,0.727417,1.000000,0.793905
3,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,Mum thought I was going to road trip with my b...,sup1,,Mum is mad at me for not flying on my own trip...,sup1,,,0,mum isn't speaking to me because I booked a fl...,0.325581,0.000000,0.279070,0.363636,0.181818,0.303030,0.008724,0.057345,97.20,38.0,32.0,4.0,136.0,132.0,1.0,27.0,97.54,17.0,16.0,1.0,53.0,52.0,0.0,15.0,0.127093,0.154930,0.039568,0.051750,0.107914,0.030418,0.727417,0.715528,0.793905,0.770994
4,LjvoXOAj5op3WqNnn5b7TZTG8mK7gM,batch3,train,t3_1zwek5,reddit,AskReddit,Can I sue my property management company and l...,My landlord left a falsified message taped to ...,My landlord is harassing me and my neighbours ...,sup1,,landlord pretended to be another tenant and wr...,ref,,,1,landlord pretended to be another tenant and wr...,0.137931,0.000000,0.137931,1.000000,1.000000,1.000000,0.007511,1.000000,70.63,48.0,35.0,2.0,157.0,155.0,4.0,26.0,48.81,47.0,28.0,2.0,145.0,143.0,7.0,17.0,0.120353,0.129213,0.030100,0.108381,0.113636,0.037931,0.803380,0.603939,0.538846,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_mblze,reddit,relationships,Self conscious about sex life - what to do?,"Recently started dating a girl, I'm 20, she's ...",I'm worried I'm not as good a lover as I thoug...,sup1,,"Dating girl for about a month, self conscious ...",ref,,,1,"Dating girl for about a month, self conscious ...",0.130435,0.000000,0.130435,1.000000,1.000000,1.000000,0.004273,1.000000,94.96,59.0,51.0,5.0,187.0,176.0,0.0,43.0,94.15,13.0,11.0,1.0,51.0,49.0,0.0,9.0,0.423488,0.138889,0.011976,0.108541,0.116279,0.041322,0.527900,0.569604,0.377464,1.000001
5996,LjvoXOAj5op3WqNnn5b7TZTG8mK7gM,batch3,train,t3_3e4m5c,reddit,relationships,I [23F] found nude pics of my BF's [24M] ex gf,"As the title suggests, I found some pics I'd r...",found nude pics of my BF ex gf and other docum...,ref,,Found nude pics of my BF's ex and they are min...,sup1,,,0,found nude pics of my BF ex gf and other docum...,1.000000,1.000000,1.000000,0.260870,0.125000,0.260870,1.000000,0.080455,64.71,46.0,30.0,2.0,131.0,128.0,4.0,22.0,85.02,23.0,20.0,1.0,73.0,70.0,0.0,17.0,0.133891,0.113924,0.014870,0.076987,0.063291,0.003817,0.673475,0.519203,1.000000,0.770767
5997,LjvoXOAj5op3WqNnn5b7TZTG8mK7gM,batch3,train,t3_3e4m5c,reddit,relationships,I [23F] found nude pics of my BF's [24M] ex gf,"As the title suggests, I found some pics I'd r...",found some pics of my bf's ex &amp; his new gf...,sup1,,found nude pics of my BF ex gf and other docum...,ref,,,1,found nude pics of my BF ex gf and other docum...,0.273973,0.049383,0.246575,1.000000,1.000000,1.000000,0.023713,1.000000,95.51,60.0,54.0,3.0,204.0,191.0,0.0,48.0,64.71,46.0,30.0,2.0,131.0,128.0,4.0,22.0,0.215900,0.144509,0.044521,0.133891,0.113924,0.014870,0.607077,0.673475,0.641074,1.000000
5998,LjvoXOAj5op3WqNnn5b7TZTG8mK7gM,batch3,train,t3_3e4m5c,reddit,relationships,I [23F] found nude pics of my BF's [24M] ex gf,"As the title suggests, I found some pics I'd r...",Found nude pics of his ex gf in his computer. ...,sup1,,found some pics of my bf's ex &amp; his new gf...,sup1,,,1,found nude pics of my BF ex gf and other docum...,0.238095,0.139535,0.238095,0.273973,0.049383,0.246575,0.030262,0.023713,97.70,18.0,15.0,2.0,56.0,53.0,1.0,13.0,95.51,60.0,54.0,3.0,204.0,191.0,0.0,48.0,0.058577,0.051613,0.007812,0.215900,0.144509,0.044521,0.398618,0.607077,0.631986,0.641074


In [39]:
validation_axis

Unnamed: 0,worker,batch,split,id,source,subsource,title,text,summary,policy,note,compatible,accuracy,coverage,coherence,overall,ref_summary,m_rouge_1_f,m_rouge_2_f,m_rouge_l_f,m_bleu,m_flesch_reading_ease,m_syllable_count,m_lexicon_count,m_sentence_count,m_char_count,m_letter_count,m_polysyllab_count,m_monosyllab_count,m_compression_ratio,m_jaccard_similarity_1,m_jaccard_similarity_2,m_text_summary_xfmr_similarity,m_ref_summary_xfmr_similarity
0,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...",Fiance and I recently got infected with scabie...,sup4_ppo_rm4_t.7,'our apartment will not go away. I'm afraid he...,False,5.0,6.0,5.0,5.0,"infestation of scabies mites in apartment, roo...",0.173913,0.000000,0.130435,0.006990,87.92,46.0,35.0,4.0,150.0,144.0,3.0,27.0,0.229141,0.196429,0.085106,0.798759,0.626239
1,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...",my fiance and I refuse to treat our room mate'...,pretrain_6b_t.7,,False,4.0,4.0,7.0,4.0,"infestation of scabies mites in apartment, roo...",0.055556,0.000000,0.055556,0.007266,78.93,31.0,26.0,1.0,110.0,108.0,1.0,22.0,0.168120,0.148148,0.061453,0.554089,0.311793
2,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...","fiancé and I contracted scabies, roommate refu...",sup4_6b_ppo_rm4_6b_t.7,the question s missing but the summary is good,False,6.0,6.0,7.0,6.0,"infestation of scabies mites in apartment, roo...",0.232558,0.040000,0.186047,0.014123,70.63,50.0,35.0,2.0,158.0,155.0,4.0,25.0,0.239103,0.235849,0.123596,0.902718,0.820978
3,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...","fiance and I are infected with scabies, room m...",sup4_6b_t0.7,"a small inaccuracy and omission, otherwise good",False,6.0,6.0,7.0,6.0,"infestation of scabies mites in apartment, roo...",0.108108,0.000000,0.054054,0.009849,73.51,30.0,23.0,1.0,90.0,87.0,2.0,18.0,0.139477,0.179245,0.050000,0.808536,0.658740
4,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...","Fiance and I contracted scabies, roommate refu...",sup4_12b_t0.7,"a small inaccuracy, otherwise good",False,5.0,7.0,7.0,6.0,"infestation of scabies mites in apartment, roo...",0.190476,0.044444,0.142857,0.016276,89.45,38.0,29.0,4.0,123.0,117.0,2.0,22.0,0.188045,0.169643,0.070270,0.819988,0.669971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8580,uvzut5OK2bvei9zoCDdktcfLENYioY,tldraxis2,valid2,t3_3i230d,reddit,relationships,[Update 2] I [18 M] want to ask out a girl [18...,"[Original](\n(Clarification on this one, I did...","I want to ask out a girl on a date, general ti...",sup4_12b,•summary has added info.,True,5.0,7.0,7.0,6.0,"Girl ignored me again, I cease conversation. H...",0.216216,0.000000,0.216216,0.011833,101.60,27.0,24.0,2.0,78.0,74.0,1.0,22.0,1.097826,0.138889,0.000000,0.110270,0.312757
8581,uvzut5OK2bvei9zoCDdktcfLENYioY,tldraxis2,valid2,t3_3i230d,reddit,relationships,[Update 2] I [18 M] want to ask out a girl [18...,"[Original](\n(Clarification on this one, I did...","Girl ignored me again, I cease conversation. H...",ref,•summary is completely made up.,True,1.0,1.0,7.0,1.0,"Girl ignored me again, I cease conversation. H...",1.000000,1.000000,1.000000,1.000000,81.29,29.0,21.0,3.0,93.0,89.0,1.0,15.0,1.228261,0.111111,0.021739,0.181274,1.000000
8582,uvzut5OK2bvei9zoCDdktcfLENYioY,tldraxis2,valid2,t3_3i230d,reddit,relationships,[Update 2] I [18 M] want to ask out a girl [18...,"[Original](\n(Clarification on this one, I did...",[Update 2] I [18 M] want to ask out a girl [18...,title,•complete summary.,True,7.0,7.0,7.0,7.0,"Girl ignored me again, I cease conversation. H...",0.171429,0.000000,0.171429,0.012056,85.02,24.0,20.0,1.0,66.0,58.0,1.0,17.0,0.923913,0.205882,0.000000,0.235184,0.289666
8583,uvzut5OK2bvei9zoCDdktcfLENYioY,tldraxis2,valid2,t3_3i230d,reddit,relationships,[Update 2] I [18 M] want to ask out a girl [18...,"[Original](\n(Clarification on this one, I did...","[Original](\n(Clarification on this one, I did...",lead2,•summary is just an introduction from the orig...,True,7.0,1.0,7.0,1.0,"Girl ignored me again, I cease conversation. H...",0.181818,0.000000,0.181818,0.012857,62.68,26.0,17.0,1.0,76.0,69.0,2.0,12.0,1.000000,1.000000,1.000000,1.000000,0.181274


In [36]:
from sentence_transformers import util

#Compute embedding for both lists
embeddings1 = encoder.encode(
    train_comparisons['text'][-5:].tolist(), batch_size=64, convert_to_tensor=True, show_progress_bar=True)
embeddings2 = encoder.encode(
    train_comparisons['summary_0'][-5:].tolist(), batch_size=64, convert_to_tensor=True, show_progress_bar=True)

#Compute cosine-similarits
cosine_scores = util.cos_sim(embeddings1, embeddings2)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [37]:
cosine_scores.diagonal()

tensor([0.5279, 0.6735, 0.6071, 0.3986, 0.5192], device='cuda:0')

In [38]:
train_comparisons['ref_summary'][:5]

0    mum isn't speaking to me because I booked a fl...
1    mum isn't speaking to me because I booked a fl...
2    mum isn't speaking to me because I booked a fl...
3    mum isn't speaking to me because I booked a fl...
4    landlord pretended to be another tenant and wr...
Name: ref_summary, dtype: object

In [23]:
train_comparisons['summary_0'][:5]

0    Mum is mad at me for not flying on my own trip...
1    I have made sure my mother is comfortable with...
2    mum isn't speaking to me because I booked a fl...
3    Mum thought I was going to road trip with my b...
4    My landlord is harassing me and my neighbours ...
Name: summary_0, dtype: object

In [42]:
import spacy
import pytextrank

In [47]:
import spacy
import pytextrank

# example text
text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")
doc = nlp(text)

# examine the top-ranked phrases in the document
for phrase in doc._.phrases:
    print(phrase.text)
    print(phrase.rank, phrase.count)
    print(phrase.chunks)

AttributeError: module 'scipy.sparse' has no attribute 'coo_array'

In [8]:
train_comparisons.head()

Unnamed: 0,worker,batch,split,id,source,subsource,title,text,summary_0,policy_0,note_0,summary_1,policy_1,note_1,confidence,choice,ref_summary,m0_rouge_1_f,m0_rouge_2_f,m0_rouge_l_f,m1_rouge_1_f,m1_rouge_2_f,m1_rouge_l_f,m0_bleu,m1_bleu,m0_flesch_reading_ease,m0_syllable_count,m0_lexicon_count,m0_sentence_count,m0_char_count,m0_letter_count,m0_polysyllab_count,m0_monosyllab_count,m1_flesch_reading_ease,m1_syllable_count,m1_lexicon_count,m1_sentence_count,m1_char_count,m1_letter_count,m1_polysyllab_count,m1_monosyllab_count
0,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,Mum is mad at me for not flying on my own trip...,sup1,,I have made sure my mother is comfortable with...,sup1,,,1,mum isn't speaking to me because I booked a fl...,0.363636,0.181818,0.30303,0.368421,0.1,0.210526,0.057345,0.04634,97.54,17.0,16.0,1.0,53.0,52.0,0.0,15.0,71.48,32.0,25.0,1.0,102.0,101.0,2.0,20.0
1,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,I have made sure my mother is comfortable with...,sup1,,mum isn't speaking to me because I booked a fl...,ref,,,1,mum isn't speaking to me because I booked a fl...,0.368421,0.1,0.210526,1.0,1.0,1.0,0.04634,1.0,71.48,32.0,25.0,1.0,102.0,101.0,2.0,20.0,77.57,24.0,19.0,1.0,74.0,71.0,0.0,14.0
2,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,mum isn't speaking to me because I booked a fl...,ref,,Mum thought I was going to road trip with my b...,sup1,,,0,mum isn't speaking to me because I booked a fl...,1.0,1.0,1.0,0.325581,0.0,0.27907,1.0,0.008724,77.57,24.0,19.0,1.0,74.0,71.0,0.0,14.0,97.2,38.0,32.0,4.0,136.0,132.0,1.0,27.0
3,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,Mum thought I was going to road trip with my b...,sup1,,Mum is mad at me for not flying on my own trip...,sup1,,,0,mum isn't speaking to me because I booked a fl...,0.325581,0.0,0.27907,0.363636,0.181818,0.30303,0.008724,0.057345,97.2,38.0,32.0,4.0,136.0,132.0,1.0,27.0,97.54,17.0,16.0,1.0,53.0,52.0,0.0,15.0
4,LjvoXOAj5op3WqNnn5b7TZTG8mK7gM,batch3,train,t3_1zwek5,reddit,AskReddit,Can I sue my property management company and l...,My landlord left a falsified message taped to ...,My landlord is harassing me and my neighbours ...,sup1,,landlord pretended to be another tenant and wr...,ref,,,1,landlord pretended to be another tenant and wr...,0.137931,0.0,0.137931,1.0,1.0,1.0,0.007511,1.0,70.63,48.0,35.0,2.0,157.0,155.0,4.0,26.0,48.81,47.0,28.0,2.0,145.0,143.0,7.0,17.0


In [9]:
validation_axis.head()

Unnamed: 0,worker,batch,split,id,source,subsource,title,text,summary,policy,note,compatible,accuracy,coverage,coherence,overall,ref_summary,m_rouge_1_f,m_rouge_2_f,m_rouge_l_f,m_bleu,m_flesch_reading_ease,m_syllable_count,m_lexicon_count,m_sentence_count,m_char_count,m_letter_count,m_polysyllab_count,m_monosyllab_count
0,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...",Fiance and I recently got infected with scabie...,sup4_ppo_rm4_t.7,'our apartment will not go away. I'm afraid he...,False,5.0,6.0,5.0,5.0,"infestation of scabies mites in apartment, roo...",0.173913,0.0,0.130435,0.00699,87.92,46.0,35.0,4.0,150.0,144.0,3.0,27.0
1,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...",my fiance and I refuse to treat our room mate'...,pretrain_6b_t.7,,False,4.0,4.0,7.0,4.0,"infestation of scabies mites in apartment, roo...",0.055556,0.0,0.055556,0.007266,78.93,31.0,26.0,1.0,110.0,108.0,1.0,22.0
2,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...","fiancé and I contracted scabies, roommate refu...",sup4_6b_ppo_rm4_6b_t.7,the question s missing but the summary is good,False,6.0,6.0,7.0,6.0,"infestation of scabies mites in apartment, roo...",0.232558,0.04,0.186047,0.014123,70.63,50.0,35.0,2.0,158.0,155.0,4.0,25.0
3,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...","fiance and I are infected with scabies, room m...",sup4_6b_t0.7,"a small inaccuracy and omission, otherwise good",False,6.0,6.0,7.0,6.0,"infestation of scabies mites in apartment, roo...",0.108108,0.0,0.054054,0.009849,73.51,30.0,23.0,1.0,90.0,87.0,2.0,18.0
4,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...","Fiance and I contracted scabies, roommate refu...",sup4_12b_t0.7,"a small inaccuracy, otherwise good",False,5.0,7.0,7.0,6.0,"infestation of scabies mites in apartment, roo...",0.190476,0.044444,0.142857,0.016276,89.45,38.0,29.0,4.0,123.0,117.0,2.0,22.0


In [8]:
from bert_score import score

In [27]:
score(
    train_comparisons["summary_0"].iloc[:10000].tolist(),
    train_comparisons["ref_summary"].iloc[:10000].tolist(),
    lang='en', verbose=True,
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...


AttributeError: 'float' object has no attribute 'split'

In [28]:
from evaluate import load
bertscore = load("bertscore")
bertscore.compute(
    predictions=train_comparisons["summary_0"].iloc[:10000].tolist(), 
    references=train_comparisons["ref_summary"].iloc[:10000].tolist(), 
    lang="en", 
)

RuntimeError: CUDA out of memory. Tried to allocate 56.00 MiB (GPU 0; 6.00 GiB total capacity; 5.18 GiB already allocated; 0 bytes free; 5.31 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [12]:
train_comparisons["summary_0"].iloc[:10]

0    Mum is mad at me for not flying on my own trip...
1    I have made sure my mother is comfortable with...
2    mum isn't speaking to me because I booked a fl...
3    Mum thought I was going to road trip with my b...
4    My landlord is harassing me and my neighbours ...
5    Landlord taped false message on door for every...
6    landlord pretended to be another tenant and wr...
7    landlord is a pathological liar who is now try...
8    RA owes me money, made me buy her toys in a fu...
9    RA verbally abused me and I want to get her to...
Name: summary_0, dtype: object

In [13]:
train_comparisons["ref_summary"].iloc[:10]

0    mum isn't speaking to me because I booked a fl...
1    mum isn't speaking to me because I booked a fl...
2    mum isn't speaking to me because I booked a fl...
3    mum isn't speaking to me because I booked a fl...
4    landlord pretended to be another tenant and wr...
5    landlord pretended to be another tenant and wr...
6    landlord pretended to be another tenant and wr...
7    landlord pretended to be another tenant and wr...
8    My rude RA voluntarily participated in a fundr...
9    My rude RA voluntarily participated in a fundr...
Name: ref_summary, dtype: object

In [83]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu

In [99]:
from rouge import Rouge
rouge = Rouge()

hyp_col = "summary_0"
ref_col = "ref_summary"

# Calculate BLEU scores for each row in the DataFrame
scores = []
for _, row in train_comparisons.iloc[:10000].iterrows():
    if pd.notna(row[hyp_col]) and pd.notna(row[ref_col]):
        score = sentence_bleu([row[ref_col].split()], row[hyp_col].split(), smoothing_function=SmoothingFunction().method1)
        scores.append(score)
    else:
        scores.append(None)

In [96]:
scores

[0.05734547141000556,
 0.046340057369294134,
 1.0,
 0.008724383945340666,
 0.007511251053510192,
 0.00644973654538667,
 1.0,
 0.05861484411000976,
 0.023705266435224467,
 0.015327272711566604,
 1.0,
 0.04595331932733298,
 1.0,
 0.0018635177896896655,
 0.008186841244220632,
 0.005043576168330867,
 1.0,
 0.02826536051832204,
 0.018693000799960027,
 0.024635236830568785,
 0.013679192123121896,
 0.0072658577559704465,
 1.0,
 0.011964983992380529,
 1.0,
 0,
 0.01222498616281612,
 0.0079451778602637,
 0.014761667142304912,
 0.005959978627465526,
 0.009856825562461773,
 1.0,
 0.003764359569932285,
 0.0053054184475599475,
 1.0,
 0.006244526986024011,
 1.0,
 0.008839374326825921,
 0.008071364532479952,
 0.006808256983563219,
 1.0,
 0,
 0,
 0,
 0.044430476392424055,
 0.0015433589861169757,
 0.005416537167077636,
 1.0,
 0.027948661656725198,
 1.0,
 3.197599924070407e-05,
 0.0251383253456915,
 0.0169861974906263,
 0.004620856909230222,
 0.053463162573637084,
 1.0,
 0.0041674709133537055,
 1.0,
 0.

In [70]:
row[ref_col]

"Someone alleges my GF assaulted her at a concert in Mississippi, from which she's flying back home today, which is total BS and she hit my GF first and even cop believes her. She is supposed to show up in court next week and plead her case, which is ridiculous"

In [93]:
sentence_bleu([row[ref_col].split()], row[hyp_col].split(), smoothing_function=SmoothingFunction().method1)

0.005214763215296812

In [92]:
corpus_bleu([row[ref_col].split()], [row[hyp_col].split()])

7.784451369270533e-232

In [45]:
score["rouge-1"]["f"]

TypeError: list indices must be integers or slices, not str

In [36]:
[
            score["rouge-1"]["f"] if score is not None else None for score in scores
        ]

TypeError: list indices must be integers or slices, not str

In [8]:
train_comparisons

Unnamed: 0,worker,batch,split,id,source,subsource,title,text,summary_0,policy_0,...,note_1,confidence,choice,ref_summary,m0_rouge_1_f,m0_rouge_2_f,m0_rouge_l_f,m1_rouge_1_f,m1_rouge_2_f,m1_rouge_l_f
0,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,Mum is mad at me for not flying on my own trip...,sup1,...,,,1,mum isn't speaking to me because I booked a fl...,0.363636,0.181818,0.303030,0.368421,0.100000,0.210526
1,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,I have made sure my mother is comfortable with...,sup1,...,,,1,mum isn't speaking to me because I booked a fl...,0.368421,0.100000,0.210526,1.000000,1.000000,1.000000
2,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,mum isn't speaking to me because I booked a fl...,ref,...,,,0,mum isn't speaking to me because I booked a fl...,1.000000,1.000000,1.000000,0.325581,0.000000,0.279070
3,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,Mum thought I was going to road trip with my b...,sup1,...,,,0,mum isn't speaking to me because I booked a fl...,0.325581,0.000000,0.279070,0.363636,0.181818,0.303030
4,LjvoXOAj5op3WqNnn5b7TZTG8mK7gM,batch3,train,t3_1zwek5,reddit,AskReddit,Can I sue my property management company and l...,My landlord left a falsified message taped to ...,My landlord is harassing me and my neighbours ...,sup1,...,,,1,landlord pretended to be another tenant and wr...,0.137931,0.000000,0.137931,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92816,44Z8ttpKcY6Kr1sNymNnBA0nL0h4dZ,batch9,train,t3_2n5vfw,reddit,tifu,TIFU by trying to get out of an assignment,"So.. TIFU about fifteen minutes ago, when I wa...",Thought about trying to get out of work by bre...,ref,...,ok,9.0,1,Thought about trying to get out of work by bre...,1.000000,1.000000,1.000000,0.326531,0.166667,0.285714
92817,44Z8ttpKcY6Kr1sNymNnBA0nL0h4dZ,batch9,train,t3_2n5vfw,reddit,tifu,TIFU by trying to get out of an assignment,"So.. TIFU about fifteen minutes ago, when I wa...",TIFU by accidentily spilling half a glass of w...,sup4_ppo_rm3_kl10,...,"""... my work is on Dropbox on all my spare tim...",3.0,0,Thought about trying to get out of work by bre...,0.250000,0.033333,0.142857,0.367347,0.142857,0.326531
92818,44Z8ttpKcY6Kr1sNymNnBA0nL0h4dZ,batch9,train,t3_2n5vfw,reddit,tifu,TIFU by trying to get out of an assignment,"So.. TIFU about fifteen minutes ago, when I wa...",TIFU by trying to get out of an assignment by ...,sup4_ppo_rm3_kl10,...,ok,9.0,1,Thought about trying to get out of work by bre...,0.367347,0.142857,0.326531,0.326531,0.166667,0.285714
92819,44Z8ttpKcY6Kr1sNymNnBA0nL0h4dZ,batch9,train,t3_2n5vfw,reddit,tifu,TIFU by trying to get out of an assignment,"So.. TIFU about fifteen minutes ago, when I wa...",Thought about trying to get out of work by bre...,ref,...,strange,7.0,1,Thought about trying to get out of work by bre...,1.000000,1.000000,1.000000,0.250000,0.033333,0.142857


In [8]:
test_axis

Unnamed: 0,worker,batch,split,id,source,subsource,title,text,summary,policy,note,compatible,accuracy,coverage,coherence,overall,ref_summary
0,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,cnndm1,test,167f80cc6634b166a699d182e25c81a2349d82d2,cnn_dailymail,dailymail,Newcastle United midfielder Moussa Sissoko fac...,Newcastle stand-in skipper Moussa Sissoko is f...,Moussa Sissoko was sent off against Liverpool ...,ref,"Misleading: ""Carver admits he is only concerne...",False,5,4,2.0,3,Moussa Sissoko was sent off against Liverpool ...
1,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,cnndm1,test,167f80cc6634b166a699d182e25c81a2349d82d2,cnn_dailymail,dailymail,Newcastle United midfielder Moussa Sissoko fac...,Newcastle stand-in skipper Moussa Sissoko is f...,Newcastle have a mountain to climb in the Prem...,sup4_t0.7,"""Carver is determined to make the most of his ...",False,3,1,1.0,1,Moussa Sissoko was sent off against Liverpool ...
2,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,cnndm1,test,167f80cc6634b166a699d182e25c81a2349d82d2,cnn_dailymail,dailymail,Newcastle United midfielder Moussa Sissoko fac...,Newcastle stand-in skipper Moussa Sissoko is f...,Moussa Sissoko has been sent off following a d...,sup4_ppo_rm4_t.7,"""Carver feels the player should have been sent...",False,4,2,2.0,2,Moussa Sissoko was sent off against Liverpool ...
3,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,cnndm1,test,167f80cc6634b166a699d182e25c81a2349d82d2,cnn_dailymail,dailymail,Newcastle United midfielder Moussa Sissoko fac...,Newcastle stand-in skipper Moussa Sissoko is f...,Moussa Sissoko is facing a disciplinary action...,pretrain_xl_t.7,Doesnt summarize important points. Match info ...,False,7,3,6.0,3,Moussa Sissoko was sent off against Liverpool ...
4,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,cnndm1,test,167f80cc6634b166a699d182e25c81a2349d82d2,cnn_dailymail,dailymail,Newcastle United midfielder Moussa Sissoko fac...,Newcastle stand-in skipper Moussa Sissoko is f...,Newcastle need to start helping themselves now...,sup4_6b_t0.7,Completely misses main info.,False,7,1,3.0,1,Moussa Sissoko was sent off against Liverpool ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6286,uvzut5OK2bvei9zoCDdktcfLENYioY,cnndm4,test,843f85685d2ca5bab950e30d6cd89e91bc539018,cnn_dailymail,dailymail,Lamb born twice the normal size weighing 20lbs...,This bundle of joy was a special spring surpri...,This bundle of joy was a special spring surpri...,lead3,•great summary with elaborate detail.,True,7,7,7.0,7,Farmers named him Big Ben as he dwarfs other 8...
6287,uvzut5OK2bvei9zoCDdktcfLENYioY,cnndm4,test,843f85685d2ca5bab950e30d6cd89e91bc539018,cnn_dailymail,dailymail,Lamb born twice the normal size weighing 20lbs...,This bundle of joy was a special spring surpri...,Big Ben is twice the weight of the average new...,supcnndm1_6b,•[He is the biggest lamb ever born in the UK.]...,True,4,5,7.0,4,Farmers named him Big Ben as he dwarfs other 8...
6288,uvzut5OK2bvei9zoCDdktcfLENYioY,cnndm4,test,843f85685d2ca5bab950e30d6cd89e91bc539018,cnn_dailymail,dailymail,Lamb born twice the normal size weighing 20lbs...,This bundle of joy was a special spring surpri...,Shepherd John Hendy and team of three helpers ...,supcnndm3_6b,•sufficient key information but minimal details.,True,7,5,7.0,4,Farmers named him Big Ben as he dwarfs other 8...
6289,uvzut5OK2bvei9zoCDdktcfLENYioY,cnndm4,test,843f85685d2ca5bab950e30d6cd89e91bc539018,cnn_dailymail,dailymail,Lamb born twice the normal size weighing 20lbs...,This bundle of joy was a special spring surpri...,This bundle of joy was a special spring surpri...,pretrain_xl,•good summary with average detail.,True,7,6,7.0,6,Farmers named him Big Ben as he dwarfs other 8...


In [9]:
validation_axis

Unnamed: 0,worker,batch,split,id,source,subsource,title,text,summary,policy,note,compatible,accuracy,coverage,coherence,overall,ref_summary
0,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...",Fiance and I recently got infected with scabie...,sup4_ppo_rm4_t.7,'our apartment will not go away. I'm afraid he...,False,5.0,6.0,5.0,5.0,"infestation of scabies mites in apartment, roo..."
1,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...",my fiance and I refuse to treat our room mate'...,pretrain_6b_t.7,,False,4.0,4.0,7.0,4.0,"infestation of scabies mites in apartment, roo..."
2,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...","fiancé and I contracted scabies, roommate refu...",sup4_6b_ppo_rm4_6b_t.7,the question s missing but the summary is good,False,6.0,6.0,7.0,6.0,"infestation of scabies mites in apartment, roo..."
3,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...","fiance and I are infected with scabies, room m...",sup4_6b_t0.7,"a small inaccuracy and omission, otherwise good",False,6.0,6.0,7.0,6.0,"infestation of scabies mites in apartment, roo..."
4,iL7GfrbN2PeB3KInidqSxUdxYcTZmG,tldraxis1,valid2,t3_4l0bal,reddit,relationship_advice,19f with fiance 20m and roommate 19m- fiance a...,"Recently, my fiance (20 m) and I (19f) moved ...","Fiance and I contracted scabies, roommate refu...",sup4_12b_t0.7,"a small inaccuracy, otherwise good",False,5.0,7.0,7.0,6.0,"infestation of scabies mites in apartment, roo..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8580,uvzut5OK2bvei9zoCDdktcfLENYioY,tldraxis2,valid2,t3_3i230d,reddit,relationships,[Update 2] I [18 M] want to ask out a girl [18...,"[Original](\n(Clarification on this one, I did...","I want to ask out a girl on a date, general ti...",sup4_12b,•summary has added info.,True,5.0,7.0,7.0,6.0,"Girl ignored me again, I cease conversation. H..."
8581,uvzut5OK2bvei9zoCDdktcfLENYioY,tldraxis2,valid2,t3_3i230d,reddit,relationships,[Update 2] I [18 M] want to ask out a girl [18...,"[Original](\n(Clarification on this one, I did...","Girl ignored me again, I cease conversation. H...",ref,•summary is completely made up.,True,1.0,1.0,7.0,1.0,"Girl ignored me again, I cease conversation. H..."
8582,uvzut5OK2bvei9zoCDdktcfLENYioY,tldraxis2,valid2,t3_3i230d,reddit,relationships,[Update 2] I [18 M] want to ask out a girl [18...,"[Original](\n(Clarification on this one, I did...",[Update 2] I [18 M] want to ask out a girl [18...,title,•complete summary.,True,7.0,7.0,7.0,7.0,"Girl ignored me again, I cease conversation. H..."
8583,uvzut5OK2bvei9zoCDdktcfLENYioY,tldraxis2,valid2,t3_3i230d,reddit,relationships,[Update 2] I [18 M] want to ask out a girl [18...,"[Original](\n(Clarification on this one, I did...","[Original](\n(Clarification on this one, I did...",lead2,•summary is just an introduction from the orig...,True,7.0,1.0,7.0,1.0,"Girl ignored me again, I cease conversation. H..."


In [42]:
from rouge import Rouge

def compute_rouge_scores(data: pd.DataFrame, hyp_col: str, ref_col: str, name: str) -> pd.DataFrame:
    """
    Compute ROUGE score.
    
    """
    
    df = data.copy()
    
    # Initialize ROUGE scorer
    rouge = Rouge()
    
    # Calculate ROUGE scores for each row in the DataFrame
    scores = []
    for _, row in df.iterrows():
        if pd.notna(row[hyp_col]) and pd.notna(row[ref_col]):
            score = rouge.get_scores(row[hyp_col], row[ref_col], avg=True)
            scores.append(score)
        else:
            scores.append(None)

    # Extract ROUGE-1, ROUGE-2, and ROUGE-L scores
    df.loc[:, f"{name}_rouge_1_f"] = [
        score["rouge-1"]["f"] if score is not None else None for score in scores
    ]
    df.loc[:, f"{name}_rouge_2_f"] = [
        score["rouge-2"]["f"] if score is not None else None for score in scores
    ]
    df.loc[:, f"{name}_rouge_l_f"] = [
        score["rouge-l"]["f"] if score is not None else None for score in scores
    ]

    return df

In [46]:
# Initialize ROUGE scorer
rouge = Rouge()

In [47]:
rouge.get_scores(
    train_comparisons.summary_1[0], train_comparisons.ref_summary[0]
)

[{'rouge-1': {'r': 0.3888888888888889, 'p': 0.35, 'f': 0.3684210476454294},
  'rouge-2': {'r': 0.1111111111111111,
   'p': 0.09090909090909091,
   'f': 0.09999999505000023},
  'rouge-l': {'r': 0.2222222222222222, 'p': 0.2, 'f': 0.21052631080332423}}]

In [45]:
train_comparisons.summary_1[0]

'I have made sure my mother is comfortable with my boyfriend travelling on a trip and now my mother is mad because I booked it.'

In [48]:
compute_rouge_scores(train_comparisons.iloc[:4000], 'summary_1', 'ref_summary', 'm1')

Unnamed: 0,worker,batch,split,id,source,subsource,title,text,summary_0,policy_0,note_0,summary_1,policy_1,note_1,confidence,choice,ref_summary,m1_rouge_1_f,m1_rouge_2_f,m1_rouge_l_f
0,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,Mum is mad at me for not flying on my own trip...,sup1,,I have made sure my mother is comfortable with...,sup1,,,1,mum isn't speaking to me because I booked a fl...,0.368421,0.100000,0.210526
1,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,I have made sure my mother is comfortable with...,sup1,,mum isn't speaking to me because I booked a fl...,ref,,,1,mum isn't speaking to me because I booked a fl...,1.000000,1.000000,1.000000
2,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,mum isn't speaking to me because I booked a fl...,ref,,Mum thought I was going to road trip with my b...,sup1,,,0,mum isn't speaking to me because I booked a fl...,0.325581,0.000000,0.279070
3,qo6WIyEh27cwAjWpA3Q60J7NaDxzQJ,batch3,train,t3_34xale,reddit,relationships,Mother not speaking to me because of a trip ...,My boyfriend and I are long distance. We have ...,Mum thought I was going to road trip with my b...,sup1,,Mum is mad at me for not flying on my own trip...,sup1,,,0,mum isn't speaking to me because I booked a fl...,0.363636,0.181818,0.303030
4,LjvoXOAj5op3WqNnn5b7TZTG8mK7gM,batch3,train,t3_1zwek5,reddit,AskReddit,Can I sue my property management company and l...,My landlord left a falsified message taped to ...,My landlord is harassing me and my neighbours ...,sup1,,landlord pretended to be another tenant and wr...,ref,,,1,landlord pretended to be another tenant and wr...,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,RgH765FRFOQZNXPAK7ZzTlIAnj8UD2,batch3,train,t3_lg5fp,reddit,AskReddit,"Calling all therapists, care workers, psycholo...",I am about to start some voluntary work at an ...,Looking for advice and links on working with p...,sup1,,Need advice on what I should do to better help...,sup1,,,0,Starting voluntary work at art centre for peop...,0.375000,0.266667,0.312500
3996,RgH765FRFOQZNXPAK7ZzTlIAnj8UD2,batch3,train,t3_lg5fp,reddit,AskReddit,"Calling all therapists, care workers, psycholo...",I am about to start some voluntary work at an ...,Starting voluntary work at art centre for peop...,ref,,Basically anyone who works in a group setting ...,sup1,,,0,Starting voluntary work at art centre for peop...,0.344828,0.142857,0.275862
3997,RgH765FRFOQZNXPAK7ZzTlIAnj8UD2,batch3,train,t3_lg5fp,reddit,AskReddit,"Calling all therapists, care workers, psycholo...",I am about to start some voluntary work at an ...,Basically anyone who works in a group setting ...,sup1,,Looking for advice and links on working with p...,sup1,,,1,Starting voluntary work at art centre for peop...,0.538462,0.320000,0.461538
3998,RgH765FRFOQZNXPAK7ZzTlIAnj8UD2,batch3,train,t3_2tlc7y,reddit,relationships,I [23 M] am really happy with my long distance...,"I've been learning Spanish on my own, and, in ...","I'm really into a girl, but have no idea what ...",sup1,,Met girl online. Never met in real life. Super...,ref,,,1,Met girl online. Never met in real life. Super...,1.000000,1.000000,1.000000


In [50]:
start_time = time.time()
train_comparisons[:4000].apply(lambda x: rouge.get_scores(x['summary_0'], x['ref_summary']), axis=1)
print("--- %s seconds ---" % (time.time() - start_time))

AttributeError: 'float' object has no attribute 'split'

In [400]:
train_comparisons[:1].apply(lambda x: rouge.get_scores(x['summary_0'], x['ref_summary']), axis=1)[0]

[{'rouge-1': {'r': 0.3333333333333333, 'p': 0.4, 'f': 0.36363635867768596},
  'rouge-2': {'r': 0.16666666666666666, 'p': 0.2, 'f': 0.18181817685950424},
  'rouge-l': {'r': 0.2777777777777778,
   'p': 0.3333333333333333,
   'f': 0.30303029807162535}}]

In [362]:
train_comparisons[['summary_0', 'ref_summary']]

Unnamed: 0,summary_0,ref_summary
0,Mum is mad at me for not flying on my own trip...,mum isn't speaking to me because I booked a fl...
1,I have made sure my mother is comfortable with...,mum isn't speaking to me because I booked a fl...
2,mum isn't speaking to me because I booked a fl...,mum isn't speaking to me because I booked a fl...
3,Mum thought I was going to road trip with my b...,mum isn't speaking to me because I booked a fl...
4,My landlord is harassing me and my neighbours ...,landlord pretended to be another tenant and wr...
...,...,...
92820,Thought about trying to get out of work by bre...,Thought about trying to get out of work by bre...
92821,TIFU by accidentily spilling half a glass of w...,Thought about trying to get out of work by bre...
92822,TIFU by trying to get out of an assignment by ...,Thought about trying to get out of work by bre...
92823,Thought about trying to get out of work by bre...,Thought about trying to get out of work by bre...


In [None]:
# perplexity
# BLEU

In [41]:
for i, text in train_comparisons.groupby('text'):
    text

In [47]:
text

Unnamed: 0,worker,batch,split,id,source,subsource,title,text,summary_0,policy_0,note_0,summary_1,policy_1,note_1,confidence,choice
69096,ZzGCcAhvqF0HnKxNsUjtJFadcZdyZj,batch6,train,t3_3rj2k6,reddit,relationship_advice,"I (F29) own my own business. This is great, ex...",x-posted to /r/relationships\n\nI started my o...,"Owns company, makes a lot of money, great cow...",sup2_bo8_rm1,Quoted line is quite unclear.,I'm a woman over 30 who makes more than her m...,sup3_6b,What does the author want to get fixed?,6.0,1
69097,ZzGCcAhvqF0HnKxNsUjtJFadcZdyZj,batch6,train,t3_3rj2k6,reddit,relationship_advice,"I (F29) own my own business. This is great, ex...",x-posted to /r/relationships\n\nI started my o...,I'm a woman over 30 who makes more than her m...,sup3_6b,What does the author want to get fixed?,I (F29) own my own business and do very well ...,ref,OK,9.0,1
69098,ZzGCcAhvqF0HnKxNsUjtJFadcZdyZj,batch6,train,t3_3rj2k6,reddit,relationship_advice,"I (F29) own my own business. This is great, ex...",x-posted to /r/relationships\n\nI started my o...,"I make a lot of money, but I can't get guys t...",sup2,OK,I'm a woman over 30 who makes more than her m...,sup3_6b,What does the author want to get fixed?,4.0,0
69099,ZzGCcAhvqF0HnKxNsUjtJFadcZdyZj,batch6,train,t3_3rj2k6,reddit,relationship_advice,"I (F29) own my own business. This is great, ex...",x-posted to /r/relationships\n\nI started my o...,"I make a lot of money, but I can't get guys t...",sup2,OK,I (F29) own my own business and do very well ...,ref,OK,9.0,1
69100,ZzGCcAhvqF0HnKxNsUjtJFadcZdyZj,batch6,train,t3_3rj2k6,reddit,relationship_advice,"I (F29) own my own business. This is great, ex...",x-posted to /r/relationships\n\nI started my o...,"I make a lot of money, but I can't get guys t...",sup2,OK,"Owns company, makes a lot of money, great cow...",sup2_bo8_rm1,Quoted line is quite unclear.,4.0,0
69101,ZzGCcAhvqF0HnKxNsUjtJFadcZdyZj,batch6,train,t3_3rj2k6,reddit,relationship_advice,"I (F29) own my own business. This is great, ex...",x-posted to /r/relationships\n\nI started my o...,"Owns company, makes a lot of money, great cow...",sup2_bo8_rm1,Quoted line is quite unclear.,I (F29) own my own business and do very well ...,ref,OK,9.0,1
