# Task 1 (Fine-tuning on the CNN/DailyMail dataset)

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

In [None]:
# Make the project folder on the mounted Drive the working directory.
%cd /content/drive/MyDrive/THExt-ensemble-main

In [None]:
!pip install -r requirements.txt

In [None]:
# Download spaCy's en_core_web_lg model.
!python -m spacy download en_core_web_lg

In [None]:
from finetuning import finetuning
import pandas as pd
import rouge
import numpy as np
from datasets import load_dataset
from Thext import SentenceRankerPlus
from Thext import Highlighter
from Thext import RedundancyManager
import nltk
# Fetch the NLTK 'punkt' and 'stopwords' resources.
nltk.download('punkt')
nltk.download('stopwords')


In [None]:
# Run the project's fine-tuning routine on the CSV below.
# NOTE(review): "checkpoint" is presumably the output location, since the same
# name is used as model_name_or_path when the model is loaded — confirm in finetuning.py.
finetuning("dataset_fine_tuning_THExt.csv", "checkpoint")

Evaluate

In [7]:
def evaluate(text, hs,  sentences):
    """Score predicted highlight sentences against the reference highlights.

    Args:
        text: Source article; accepted by the signature but not used here.
        hs: Reference highlights, handed to the scorer unchanged.
        sentences: Iterable of predicted highlights; every item is converted
            with ``str`` and the pieces are joined with single spaces.

    Returns:
        Tuple ``(rouge-1 f, rouge-2 f, rouge-l f)`` read from the scorer output.
    """
    # The prediction is scored as one concatenated string.
    candidate = ' '.join(str(sentence) for sentence in sentences)

    scorer = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l'],
        limit_length=False,
        max_n=2,
        alpha=0.5,
        stemming=False,
    )
    result = scorer.get_scores(candidate, hs)

    return tuple(result[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l'))

In [None]:
# Load only the first 1000 validation articles of CNN/DailyMail (config "3.0.0")
# through the split slice and convert that subset to a DataFrame. This avoids
# turning the whole validation split into pandas rows just to keep 1000 of them.
data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:1000]").to_pandas()

In [9]:
# Fine-tuned weights: the same "checkpoint" name that was passed to finetuning().
model_name_or_path='checkpoint' 
# Model identifier passed to load_model as base_model_name.
base_model_name = "morenolq/thext-cs-scibert"

In [None]:
# Create the sentence ranker and load it from the configured base model name and
# checkpoint path.
# NOTE(review): the device is hard-coded to 'cuda', so this cell needs a GPU runtime.
sr = SentenceRankerPlus(device='cuda')
sr.load_model(base_model_name=base_model_name, model_name_or_path=model_name_or_path,device='cuda')
# The highlighter is built from the ranker plus a redundancy manager.
rm = RedundancyManager()
h = Highlighter(sr, redundancy_manager = rm)

In [11]:
# Per-article ROUGE-1 / ROUGE-2 / ROUGE-L F scores for Task 1; they start out empty.
r1_f = np.array([])
r2_f = np.array([])
rl_f = np.array([])

In [None]:
# Extract highlights for every article (NH = 3, relevance weight only) and score
# them against the reference highlights.
# Scores go into a fresh list and the result arrays are rebuilt from it, so
# re-running this cell never stacks duplicates onto earlier results.
# Do not name the summary `sum`: that would shadow the builtin for the rest of
# the notebook.
article_scores = []
for text, highlights in zip(data['article'], data['highlights']):
  predicted = h.get_highlights_simple(text, abstract = True, rel_w=1.0, pos_w=0.0, red_w=0.0, prefilter=False, NH = 3)
  article_scores.append(evaluate(text, highlights, sentences = predicted))

# One row per article with columns ROUGE-1 / ROUGE-2 / ROUGE-L F; the reshape keeps
# the (n, 3) layout even when no article was scored, so the unpacking still works.
r1_f, r2_f, rl_f = np.array(article_scores, dtype=float).reshape(-1, 3).T

In [None]:
# Report the mean F score over all scored articles, one line per ROUGE variant.
print(
    f"Avarage Rougue-1 f1 score : { np.average(r1_f) }",
    f"Avarage Rougue-2 f1 score : { np.average(r2_f) }",
    f"Avarage Rougue-l f1 score : { np.average(rl_f) }",
    sep="\n",
)

# Task 2 (Ensemble method)

In [14]:
from datasets import load_dataset
import pandas as pd
from utils import Ensemble
from datasets import load_dataset
import rouge
import pandas as pd
import numpy as np

Fit models

In [None]:
from sklearn.linear_model import LassoCV

# Three ensembles: two selected by name and one wrapping an explicit LassoCV
# estimator (cv=5, random_state=0).
random_ensamble = Ensemble("RandomForest")
sgd_ensamble = Ensemble("sgd")
lasso_ensamble = Ensemble(model = LassoCV(cv=5, random_state=0))

data = pd.read_csv("data_train.csv") # TODO: state in the README that the dataset must be downloaded from Drive and placed in this folder

# Feature columns and regression target (rouge_2f) taken from the training CSV.
X = data[['text_rank', 'lsa_score', 'tf_idf', 'relevance_score', 'thext_score', 'pos_i']]
y = data['rouge_2f']

# Train every ensemble on the same features and target.
random_ensamble.train(X,y)
sgd_ensamble.train(X,y)
lasso_ensamble.train(X,y)

# Save each ensemble under a short name; the same names are used with .load() further down.
random_ensamble.save("random")
sgd_ensamble.save("sgd")
lasso_ensamble.save("lasso")



Test models

In [16]:
# Uncomment to reuse previously saved ensembles instead of re-running the fit cell.
#random_ensamble.load("random")
#sgd_ensamble.load("sgd")
#lasso_ensamble.load("lasso")

# Load only the first 1000 validation articles of CNN/DailyMail (config "3.0.0")
# through the split slice and convert that subset to a DataFrame, instead of
# turning the whole validation split into pandas rows and then truncating it.
data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:1000]").to_pandas()



In [17]:
def evaluate(model, data):
    """Print the mean ROUGE-1, ROUGE-2 and ROUGE-L F scores of ``model`` on ``data``.

    Args:
        model: Object whose ``evaluate(text, highlights)`` returns three values,
            reported here as the ROUGE-1, ROUGE-2 and ROUGE-L F scores.
        data: DataFrame with ``article`` and ``highlights`` columns.

    Returns:
        None. The three averages are written to stdout.
    """
    rouge1_scores, rouge2_scores, rougel_scores = [], [], []

    # Walk both columns in parallel, one (article, reference) pair per row.
    for article, reference in zip(data['article'], data['highlights']):
        score_1, score_2, score_l = model.evaluate(article, reference)
        rouge1_scores.append(score_1)
        rouge2_scores.append(score_2)
        rougel_scores.append(score_l)

    print(
        f"Avarage Rougue-1 f1 score : { np.average(rouge1_scores) }",
        f"Avarage Rougue-2 f1 score : { np.average(rouge2_scores) }",
        f"Avarage Rougue-l f1 score : { np.average(rougel_scores) }",
        sep="\n",
    )

In [None]:
# Print the mean ROUGE F scores of the RandomForest ensemble on the validation sample.
evaluate(random_ensamble, data)

In [None]:
# Print the mean ROUGE F scores of the SGD ensemble on the validation sample.
evaluate(sgd_ensamble, data)

In [None]:
# Print the mean ROUGE F scores of the LassoCV ensemble on the validation sample.
evaluate(lasso_ensamble, data)

Independent set method

In [None]:
from utils import RedundancyIndipendentSet

In [None]:
# Restore the ensemble that was saved under the name "random".
random_ensamble.load("random")

# Load only the first 1000 validation articles of CNN/DailyMail (config "3.0.0")
# through the split slice and convert that subset to a DataFrame, instead of
# turning the whole validation split into pandas rows and then truncating it.
data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:1000]").to_pandas()

In [None]:
# Selector whose indipendent_set() is applied to the scored candidate sentences
# in the comparison loop.
r = RedundancyIndipendentSet()

# Per-article ROUGE F scores for the plain NH = 3 summaries ...
r1_f = np.array([])
r2_f = np.array([])
rl_f = np.array([])
# ... and for the sentences chosen by the independent-set method.
r1_f_ind = np.array([])
r2_f_ind = np.array([])
rl_f_ind = np.array([])

In [None]:
# For every article, score two selections made by the RandomForest ensemble:
# the plain NH = 3 summary, and the independent set picked from the NH = 5
# scored candidates.
# Scores go into fresh lists and the result arrays are rebuilt from them, so
# re-running this cell never stacks duplicates onto earlier results.
# Do not name the summary `sum`: that would shadow the builtin for the rest of
# the notebook.
plain_scores = []
independent_scores = []
for text, highlights in zip(data['article'], data['highlights']):
  top_sentences = random_ensamble.summary(text, NH = 3)
  scored_candidates = random_ensamble.summary(text, NH = 5, score=True)
  independent_sentences = r.indipendent_set(scored_candidates)

  independent_scores.append(random_ensamble.evaluate(text, highlights, sent = independent_sentences))
  plain_scores.append(random_ensamble.evaluate(text, highlights, sent = top_sentences))

# One row per article with columns ROUGE-1 / ROUGE-2 / ROUGE-L F; the reshape keeps
# the (n, 3) layout even when no article was scored, so the unpacking still works.
r1_f, r2_f, rl_f = np.array(plain_scores, dtype=float).reshape(-1, 3).T
r1_f_ind, r2_f_ind, rl_f_ind = np.array(independent_scores, dtype=float).reshape(-1, 3).T

In [None]:
# Report the mean F scores: first for the plain summaries, then for the
# independent-set selection.
print(
    f"\n\nAvarage Rougue-1 f1 score : { np.average(r1_f) }",
    f"Avarage Rougue-2 f1 score : { np.average(r2_f) }",
    f"Avarage Rougue-l f1 score : { np.average(rl_f) }",
    f"\n\nAvarage Rougue-1 f1 score indipendence method : { np.average(r1_f_ind) }",
    f"Avarage Rougue-2 f1 score  indipendence method : { np.average(r2_f_ind) }",
    f"Avarage Rougue-l f1 score indipendence method : { np.average(rl_f_ind) }",
    sep="\n",
)