In [None]:
# Mount Google Drive at /content/drive (Colab only); the OpenNMT checkpoint used
# later in this notebook is read from a path under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import spacy
# NOTE(review): in the visible notebook `nlp` is not used again before it is
# reassigned in the BERT setup cell — confirm whether this load is still needed.
nlp = spacy.load("zh_core_web_sm") # to split texts into sentences
import re
import os
import json

In [None]:
from fastHan import FastHan # to split sentences into words
# Shared word-segmentation model; the helper functions in this notebook call it
# as model(sentence, target = "CWS") and use the first element of the result.
model = FastHan(model_type = "large")

In [None]:
# Select the 'ctb' word-segmentation style on the shared fastHan model.
model.set_cws_style('ctb')

In [None]:
# Directory containing the source texts (one file per document).
# Empty placeholder: must be set before the loading cells are run.
PATH = ""
# Directory containing the reference summaries (one file per document).
# Empty placeholder: must be set before the loading cells are run.
SUMMPATH = ""

In [None]:
# Load every source text found directly under PATH, in sorted file-name order.
# Only regular files are read, so sub-directories do not make open() fail.
# NOTE(review): texts and summaries are later paired by position, which
# presumably relies on both directories sorting into the same order — confirm.
texts_pathes = sorted(
  name for name in os.listdir(PATH)
  if os.path.isfile(os.path.join(PATH, name))
)
texts = []
for text_name in texts_pathes:
  # The context manager closes each file handle as soon as it has been read.
  with open(os.path.join(PATH, text_name), "r", encoding = "utf-8") as text_file:
    texts.append(text_file.read())

In [None]:
# Load every reference summary found directly under SUMMPATH, in sorted
# file-name order. Only regular files are read, so sub-directories do not make
# open() fail.
summ_paths = sorted(
  name for name in os.listdir(SUMMPATH)
  if os.path.isfile(os.path.join(SUMMPATH, name))
)
summaries = []
for summary_name in summ_paths:
  # The context manager closes each file handle as soon as it has been read.
  with open(os.path.join(SUMMPATH, summary_name), "r", encoding = "utf-8") as summary_file:
    summaries.append(summary_file.read())

In [None]:
# Install the rouge-metric package that provides the PyRouge scorer used below.
!pip install rouge-metric

In [None]:
from rouge_metric import PyRouge

In [None]:
# Shared ROUGE scorer: ROUGE-N for n = 1, 2, 4, plus ROUGE-L, ROUGE-W
# (weight 1.2), ROUGE-S and ROUGE-SU with a skip gap of 4.
rouge = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=True,
                rouge_w_weight=1.2, rouge_s=True, rouge_su=True, skip_gap=4)

In [None]:
# Word-segment a reference summary so ROUGE can compare it token by token.
def preprocess_tokenize(text):
  """Split `text` into sentences on '。' and return them word-segmented.

  Each sentence (with its terminating '。') is segmented by the shared fastHan
  `model`; the words of a sentence are joined with single spaces and the
  sentences are joined with single spaces as well.

  Args:
    text: raw summary text.

  Returns:
    One space-separated string of tokens; "" when `text` has no content.
  """
  pieces = text.split('。')
  # Every piece except the last was followed by a '。' in the original text.
  sentences = [piece + '。' for piece in pieces[:-1]]
  # The last piece is non-blank only when the text does not end with '。'.
  # Keep it (terminated like the others) instead of silently dropping it;
  # a blank tail such as a trailing newline is still discarded.
  tail = pieces[-1].strip()
  if tail:
    sentences.append(tail + '。')
  tokenized_sentences = [" ".join(model(sentence, target = "CWS")[0]) for sentence in sentences]
  return " ".join(tokenized_sentences)

In [None]:
def evaluate_rouge(summary, prediction):
  """Score one system summary against one reference summary with ROUGE.

  Args:
    summary: raw reference summary; it is word-segmented here with
      preprocess_tokenize.
    prediction: system summary, already word-segmented by the caller.

  Returns:
    Whatever the shared `rouge` scorer returns for this single document.
  """
  tokenized_summary = preprocess_tokenize(summary)
  # PyRouge.evaluate takes (hypotheses, references). The system output is the
  # hypothesis and the gold summary is its single reference; passing them the
  # other way round exchanges the reported precision and recall.
  score = rouge.evaluate([prediction], [[tokenized_summary]])
  return score

In [None]:
# Score a whole list of system summaries against their reference summaries.
def evaluate_predictions(summaries, predictions):
  """Return one ROUGE result per (reference, prediction) pair, in order.

  Args:
    summaries: reference summaries, one per document.
    predictions: system summaries, aligned by position with `summaries`.

  Returns:
    A list with the evaluate_rouge result for every pair.

  Raises:
    ValueError: if the two inputs differ in length. zip() would silently drop
      the extra items and the scores would cover only part of the corpus.
  """
  summaries = list(summaries)
  predictions = list(predictions)
  if len(summaries) != len(predictions):
    raise ValueError(
      "summaries and predictions must have the same length, got %d and %d"
      % (len(summaries), len(predictions))
    )
  return [evaluate_rouge(summary, prediction)
          for summary, prediction in zip(summaries, predictions)]

In [None]:
# Average the per-document F-score, precision and recall of every ROUGE metric.
def avg_score(score_dicts_list):
  """Average a list of ROUGE result dicts.

  Args:
    score_dicts_list: list of dicts mapping a metric name to a dict with the
      keys 'f', 'p' and 'r'. The metric names are taken from the first item.

  Returns:
    A dict with the same metric names, each mapping to the arithmetic mean of
    'f', 'p' and 'r' over all items. An empty input yields an empty dict.
  """
  if not score_dicts_list:
    # Nothing to average; avoids indexing into an empty list.
    return {}
  count = len(score_dicts_list)
  averaged = {}
  for key in score_dicts_list[0].keys():
    averaged[key] = {
      part: sum(rouge_score[key][part] for rouge_score in score_dicts_list) / count
      for part in ('f', 'p', 'r')
    }
  return averaged

In [None]:
# Install textrank4zh, the first summarizer evaluated in this notebook.
!pip3 install textrank4zh

In [None]:
from textrank4zh import TextRank4Sentence

In [None]:
# Produce a TextRank4Zh summary in the same token format as the other systems.
def post_processed__TextRank4zh_text(text, n):
  """Summarize `text` with TextRank4Sentence and word-segment the result.

  Args:
    text: source document.
    n: number of key sentences requested from TextRank4Sentence.

  Returns:
    The key sentences, each segmented by the shared fastHan `model` with words
    joined by spaces; sentences are joined with "。" and a final "。" is added.
  """
  ranker = TextRank4Sentence()
  ranker.analyze(text)

  # NOTE(review): sentences are joined with a bare "。" while preprocess_tokenize
  # space-separates every token — confirm the scorer's tokenizer handles this.
  tokenized_sentences = [
    " ".join(model(item.sentence.strip("\n"), target = "CWS")[0])
    for item in ranker.get_key_sentences(num=n)
  ]
  return "。".join(tokenized_sentences) + "。"

In [None]:
# Build one TextRank4Zh summary (8 key sentences) for every source text.
TextRank4Zh_summaries = []
for text in texts:
  TextRank4Zh_summaries.append(post_processed__TextRank4zh_text(text, 8))

In [None]:
# One ROUGE result per document for the TextRank4Zh summaries.
textrank4zh_rouge_scores = evaluate_predictions(summaries, TextRank4Zh_summaries)

In [None]:
# Average the per-document TextRank4Zh scores for each ROUGE metric.
textrank4zh_avg_scores = avg_score(textrank4zh_rouge_scores)

In [None]:
# Save the averaged TextRank4Zh scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/textrank4zh_avg_scores.json', 'w') as res_json:
  json.dump(textrank4zh_avg_scores, res_json)

In [None]:
# Install macropodus, whose summarization() function is evaluated next.
!pip install macropodus

In [None]:
import macropodus

In [None]:
# Produce a Macropodus summary in the same token format as the other systems.
def post_processed__Macropodus_text(text, n, sum_type, model_type, type_l):
  """Summarize `text` with macropodus.summarization and word-segment the result.

  Args:
    text: source document.
    n: passed to macropodus as `num`.
    sum_type: passed to macropodus as `type_summarize`.
    model_type: passed through to macropodus unchanged.
    type_l: passed through to macropodus unchanged.

  Returns:
    The selected sentences, each segmented by the shared fastHan `model` with
    words joined by spaces; sentences are joined with "。" and a final "。" is
    added.
  """
  ranked = macropodus.summarization(text = text, type_summarize = sum_type, num = n, model_type = model_type, type_l = type_l)

  # Element 1 of each returned item is used as the sentence text
  # (presumably items are (score, sentence) pairs — TODO confirm).
  # NOTE(review): sentences are joined with a bare "。" while preprocess_tokenize
  # space-separates every token — confirm the scorer's tokenizer handles this.
  tokenized_sentences = [
    " ".join(model(item[1].strip("\n"), target = "CWS")[0])
    for item in ranked
  ]
  return "。".join(tokenized_sentences) + "。"

In [None]:
# Build one Macropodus LDA summary (8 sentences) for every source text.
Macropodus_LDA_summaries = []
for text in texts:
  # Summarize the document of the current iteration; the loop variable and the
  # argument must be the same name, otherwise a stale `text` left over from an
  # earlier cell is summarized on every pass.
  lda_summary = post_processed__Macropodus_text(text, 8, "lda", model_type = None, type_l = None)
  Macropodus_LDA_summaries.append(lda_summary)

In [None]:
# One ROUGE result per document for the Macropodus LDA summaries.
macropodus_lda_rouge_scores = evaluate_predictions(summaries, Macropodus_LDA_summaries)

In [None]:
# Average the per-document Macropodus LDA scores for each ROUGE metric.
macropodus_lda_avg_scores = avg_score(macropodus_lda_rouge_scores)

In [None]:
# Save the averaged Macropodus LDA scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/macropodus_lda_avg_scores.json', 'w') as res_json:
  json.dump(macropodus_lda_avg_scores, res_json)

In [None]:
# Build one Macropodus "text_pronouns" summary (8 sentences) per source text.
Macropodus_pronouns_summaries = [
  post_processed__Macropodus_text(document, 8, "text_pronouns", model_type = None, type_l = None)
  for document in texts
]

In [None]:
# One ROUGE result per document for the Macropodus text_pronouns summaries.
macropodus_pronouns_rouge_scores = evaluate_predictions(summaries, Macropodus_pronouns_summaries)

In [None]:
# Average the per-document text_pronouns scores for each ROUGE metric.
macropodus_pronouns_avg_scores = avg_score(macropodus_pronouns_rouge_scores)

In [None]:
# Save the averaged Macropodus text_pronouns scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/macropodus_pronouns_avg_scores.json', 'w') as res_json:
  json.dump(macropodus_pronouns_avg_scores, res_json)

In [None]:
# Build one Macropodus "text_teaser" summary (8 sentences) per source text.
Macropodus_text_teaser_summaries = [
  post_processed__Macropodus_text(document, 8, "text_teaser", model_type = "sklearn", type_l = None)
  for document in texts
]

In [None]:
# One ROUGE result per document for the Macropodus text_teaser summaries.
macropodus_text_teaser_rouge_scores = evaluate_predictions(summaries, Macropodus_text_teaser_summaries)

In [None]:
# Average the per-document text_teaser scores for each ROUGE metric.
macropodus_text_teaser_avg_scores = avg_score(macropodus_text_teaser_rouge_scores)

In [None]:
# Save the averaged Macropodus text_teaser scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/macropodus_text_teaser_avg_scores.json', 'w') as res_json:
  json.dump(macropodus_text_teaser_avg_scores, res_json)

In [None]:
# Build one Macropodus "word_sign" summary (8 sentences) per source text.
Macropodus_word_sign_summaries = [
  post_processed__Macropodus_text(document, 8, "word_sign", model_type = None, type_l = None)
  for document in texts
]

In [None]:
# One ROUGE result per document for the Macropodus word_sign summaries.
macropodus_word_sign_rouge_scores = evaluate_predictions(summaries, Macropodus_word_sign_summaries)

In [None]:
# Average the per-document word_sign scores for each ROUGE metric.
macropodus_word_sign_avg_scores = avg_score(macropodus_word_sign_rouge_scores)

In [None]:
# Save the averaged Macropodus word_sign scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/macropodus_word_sign_avg_scores.json', 'w') as res_json:
  json.dump(macropodus_word_sign_avg_scores, res_json)

In [None]:
# Build one Macropodus "mmr" summary (8 sentences) per source text.
Macropodus_mmr_summaries = [
  post_processed__Macropodus_text(document, 8, "mmr", model_type = None, type_l = None)
  for document in texts
]

In [None]:
# One ROUGE result per document for the Macropodus mmr summaries.
macropodus_mmr_rouge_scores = evaluate_predictions(summaries, Macropodus_mmr_summaries)

In [None]:
# Average the per-document mmr scores for each ROUGE metric.
macropodus_mmr_avg_scores = avg_score(macropodus_mmr_rouge_scores)

In [None]:
# Save the averaged Macropodus mmr scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/macropodus_mmr_avg_scores.json', 'w') as res_json:
  json.dump(macropodus_mmr_avg_scores, res_json)

In [None]:
# Build one Macropodus "lsi" summary (8 sentences) per source text.
Macropodus_lsi_summaries = [
  post_processed__Macropodus_text(document, 8, "lsi", model_type = None, type_l = None)
  for document in texts
]

In [None]:
# One ROUGE result per document for the Macropodus lsi summaries.
macropodus_lsi_rouge_scores = evaluate_predictions(summaries, Macropodus_lsi_summaries)

In [None]:
# Average the per-document lsi scores for each ROUGE metric.
macropodus_lsi_avg_scores = avg_score(macropodus_lsi_rouge_scores)

In [None]:
# Save the averaged Macropodus lsi scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/macropodus_lsi_avg_scores.json', 'w') as res_json:
  json.dump(macropodus_lsi_avg_scores, res_json)

In [None]:
# Build one Macropodus "nmf" summary (8 sentences) per source text.
Macropodus_nmf_summaries = [
  post_processed__Macropodus_text(document, 8, "nmf", model_type = None, type_l = None)
  for document in texts
]

In [None]:
# One ROUGE result per document for the Macropodus nmf summaries.
macropodus_nmf_rouge_scores = evaluate_predictions(summaries, Macropodus_nmf_summaries)

In [None]:
# Average the per-document nmf scores for each ROUGE metric.
macropodus_nmf_avg_scores = avg_score(macropodus_nmf_rouge_scores)

In [None]:
# Save the averaged Macropodus nmf scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/macropodus_nmf_avg_scores.json', 'w') as res_json:
  json.dump(macropodus_nmf_avg_scores, res_json)

In [None]:
# Install snownlp, whose SnowNLP(...).summary() is evaluated next.
!pip install snownlp

In [None]:
from snownlp import SnowNLP

In [None]:
# Produce a SnowNLP summary in the same token format as the other systems.
def post_processed_SnowNLP_text(text, n):
  """Summarize `text` with SnowNLP and word-segment the result.

  Args:
    text: source document.
    n: value passed to SnowNLP(...).summary().

  Returns:
    The summary sentences, each segmented by the shared fastHan `model` with
    words joined by spaces; sentences are joined with "。" and a final "。" is
    added.
  """
  key_sentences = SnowNLP(text).summary(n)

  tokenized_sentences = [
    " ".join(model(key_sentence.strip("\n"), target = "CWS")[0])
    for key_sentence in key_sentences
  ]
  return "。".join(tokenized_sentences) + "。"

In [None]:
# Build one SnowNLP summary (8 sentences) for every source text.
SnowNLP_summaries = [post_processed_SnowNLP_text(document, 8) for document in texts]

In [None]:
# One ROUGE result per document for the SnowNLP summaries.
snownlp_rouge_scores = evaluate_predictions(summaries, SnowNLP_summaries)

In [None]:
# Average the per-document SnowNLP scores for each ROUGE metric.
snownlp_avg_scores = avg_score(snownlp_rouge_scores)

In [None]:
# Save the averaged SnowNLP scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/snownlp_avg_scores.json', 'w') as res_json:
  json.dump(snownlp_avg_scores, res_json)

In [None]:
# Build the Lead-3 baseline through Macropodus ("lead3" with type_l = "mix").
Lead3_baselines = [
  post_processed__Macropodus_text(document, 8, "lead3", model_type = None, type_l ="mix")
  for document in texts
]
print(Lead3_baselines)

In [None]:
# One ROUGE result per document for the Lead-3 baseline.
lead3_rouge_scores = evaluate_predictions(summaries, Lead3_baselines)

In [None]:
# Average the per-document Lead-3 scores for each ROUGE metric.
lead3_avg_scores = avg_score(lead3_rouge_scores)

In [None]:
# Save the averaged Lead-3 scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/lead3_avg_scores.json', 'w') as res_json:
  json.dump(lead3_avg_scores, res_json)

# OpenNMT-py

In [None]:
# Install OpenNMT-py from GitHub, pinned to commit 585499a450; it provides the
# onmt_translate command used below.
!pip install git+https://github.com/OpenNMT/OpenNMT-py@585499a450

In [None]:
def preprocess_opennmt(text):
  """Turn `text` into a single line of space-separated characters.

  Newlines are replaced by a space first; then every character of the result
  (including those spaces) is separated from its neighbour by one space.
  """
  single_line = text.replace("\n", " ")
  return ' '.join(single_line)

In [None]:
# Character-split version of every source text, in the same order as `texts`.
processed_for_onmt = [preprocess_opennmt(document) for document in texts]

In [None]:
# Directory that receives one source file per document, named <index>.txt.
# It is the directory the onmt_translate cell reads its -src files from.
ONMT_PATH = "/content/onmt_data"
os.makedirs(ONMT_PATH, exist_ok = True)
for index, text in enumerate(processed_for_onmt):
  # Explicit UTF-8 keeps the Chinese text independent of the platform default.
  with open(os.path.join(ONMT_PATH, str(index)) + '.txt', "w", encoding = "utf-8") as f:
    f.write(text)

In [None]:
# to apply onmt model for all preprocessed texts
for index, text in enumerate(os.listdir(ONMT_PATH)):
  !onmt_translate -model /content/drive/MyDrive/магистратура/db_chinese_text_summarization/chinese_opennmt_lcsts_acc_56.86_ppl_10.97_e11.pt -src /content/onmt_data/{text} -output /content/onmt_data/summaries/{index}_summary.txt -verbose

In [None]:
# Directory onmt_translate writes its <index>_summary.txt outputs to.
ONMT_SUMMARIES_PATH = "/content/onmt_data/summaries"

# Read the predictions in numeric index order so they line up by position with
# the reference summaries; os.listdir() order is arbitrary, and a plain string
# sort would place "10_summary.txt" before "2_summary.txt".
onmt_summary_files = [
  name for name in os.listdir(ONMT_SUMMARIES_PATH)
  if re.fullmatch(r"\d+_summary\.txt", name)
]
onmt_summary_files.sort(key = lambda name: int(name.split("_")[0]))

onmt_preds = []
for summary_file in onmt_summary_files:
  with open(os.path.join(ONMT_SUMMARIES_PATH, summary_file), "r", encoding = "utf-8") as pred:
    onmt_preds.append(pred.read())

In [None]:
# Bring an OpenNMT prediction into the same token format as the other systems.
def post_process_onmt(text):
  """Word-segment one OpenNMT prediction.

  Leading and trailing newlines are stripped, the text is segmented by the
  shared fastHan `model`, the words are joined with spaces and a final "。" is
  appended.
  """
  segmented = model(text.strip("\n"), target = "CWS")
  return " ".join(segmented[0]) + "。"

In [None]:
# Word-segmented OpenNMT summaries, in the same order as `onmt_preds`.
onmt_summaries = [post_process_onmt(raw_pred) for raw_pred in onmt_preds]

In [None]:
# One ROUGE result per document for the OpenNMT summaries.
onmt_rouge_scores = evaluate_predictions(summaries, onmt_summaries)

In [None]:
# Average the per-document OpenNMT scores for each ROUGE metric.
onmt_avg_scores = avg_score(onmt_rouge_scores)

In [None]:
# Save the averaged OpenNMT scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/onmt_avg_scores.json', 'w') as res_json:
  json.dump(onmt_avg_scores, res_json)

In [None]:
# Install the packages needed for the BERT extractive summarizer.
# NOTE(review): spaCy was already imported at the top of the notebook, so the
# pinned spacy==2.3.1 presumably only takes effect after a runtime restart —
# confirm.
!pip install bert-extractive-summarizer
!pip install spacy==2.3.1
!pip install transformers
!pip install neuralcoref

In [None]:
# Install sentencepiece (presumably required by the transformers tokenizer — TODO confirm).
!pip install sentencepiece

In [None]:
# Download the spaCy model package zh_core_web_lg, which is imported below.
!python -m spacy download zh_core_web_lg

In [None]:
import spacy
import zh_core_web_lg
import neuralcoref

In [None]:
# Load the large Chinese spaCy pipeline and attach neuralcoref to it.
# NOTE(review): `nlp` is not passed to the Summarizer built below — confirm
# whether this pipeline is still needed.
nlp = zh_core_web_lg.load()
neuralcoref.add_to_pipe(nlp)

from summarizer import Summarizer
from summarizer.sentence_handler import SentenceHandler
from spacy.lang.zh import Chinese
# NOTE(review): the star import hides where AutoConfig / AutoTokenizer /
# AutoModel come from; consider importing those three names explicitly.
from transformers import *

# Load model, model config and tokenizer via Transformers
modelName = "bert-base-chinese" 
custom_config = AutoConfig.from_pretrained(modelName)
# The model is configured to return its hidden states.
custom_config.output_hidden_states=True
custom_tokenizer = AutoTokenizer.from_pretrained(modelName)
custom_model = AutoModel.from_pretrained(modelName, config=custom_config)

# Extractive summarizer built on the custom model and tokenizer, with a
# sentence handler created for the spaCy Chinese language class.
bert_model = Summarizer(
    custom_model=custom_model, 
    custom_tokenizer=custom_tokenizer,
    sentence_handler = SentenceHandler(language=Chinese)
    )

In [None]:
# Bring a BERT extractive summary into the same token format as the other systems.
def post_process_bert_extractive(text):
  """Word-segment a raw summary returned by the BERT extractive summarizer.

  Newlines are replaced by spaces, the text is split on "。", and every
  non-blank sentence is segmented by the shared fastHan `model` with its words
  joined by spaces.

  Args:
    text: raw summary text.

  Returns:
    The segmented sentences joined with "。" plus a final "。". When `text` has
    no non-blank sentence the result is just "。".
  """
  text = re.sub("\n", " ", text)
  tokenized_sentences = []
  for sent in text.split("。"):
    # A text ending in "。" leaves an empty last piece after the split. Skip
    # blank pieces so they are not sent to the model and do not produce a
    # doubled "。" at the end of the result.
    if not sent.strip():
      continue
    tokenized_sentences.append(" ".join(model(sent, target = "CWS")[0]))

  return "。".join(tokenized_sentences) + "。"

In [None]:
# Build one BERT extractive summary (8 sentences) for every source text,
# printing the document index as a progress indicator.
bert_extractive_summaries = []
for position, document in enumerate(texts):
  print(position)
  extracted = bert_model(document, num_sentences = 8)
  bert_extractive_summaries.append(post_process_bert_extractive(extracted))

In [None]:
# One ROUGE result per document for the BERT extractive summaries.
bert_extractive_rouge_scores = evaluate_predictions(summaries, bert_extractive_summaries)

In [None]:
# Average the per-document BERT extractive scores for each ROUGE metric.
bert_extractive_avg_scores = avg_score(bert_extractive_rouge_scores)

In [None]:
# Save the averaged BERT extractive scores as JSON.
# open(..., 'w') does not create missing directories, so create it first.
os.makedirs('/content/rouge_results', exist_ok = True)
with open('/content/rouge_results/bert_extractive_avg_scores.json', 'w') as res_json:
  json.dump(bert_extractive_avg_scores, res_json)