# TDDE16 - Project 2021

By Thomas Guillaume (thogu155)

## Subject: Automatic Text Summarization with extractive methods

In [111]:
# Import
import numpy as np
import pandas as pd
import spacy
import re
import ast  # for literal string parsing

In [112]:
NUM_DEV_TEXT = 10
NUM_TEXT = 500
MIN_SENTENCES_BY_TEXT = 10
DEV_MODE = False

In [113]:
# Load Spacy
nlp = spacy.load('en_core_web_sm')

### Step 0 - Get brut data, clean and get sentences 

In [114]:
# To get full dataset

# Open dataset file
with open('datasets/dataset-wikihow-brut.csv', 'r') as source:
    df = pd.read_csv(source)
    if DEV_MODE:
        df = df[:NUM_DEV_TEXT]
    else:
        df = df[:NUM_TEXT]
    df = df.dropna()

In [115]:
def clean_text(text):
    text = text.replace("\n", "")
    text = text.strip()
    text = text.replace(".,", ". ")
    text = text.replace(".;", ". ")
    text = text.replace(".;,", ". ")
    return text

In [116]:
def get_sentences(text: str):
    text = nlp(text)
    sentences = list(text.sents)
    sentences = list(map(lambda sentence: sentence.as_doc().text.strip(), sentences))
    return sentences

In [117]:
def find_sentences(df: pd.DataFrame):
    df["sentences"] = ''
    for idx, row in df.iterrows():
        row['plot'] = clean_text(row['plot'])
        row['summary'] = clean_text(row['summary'])
        df.at[idx, 'sentences'] = get_sentences(row['plot'])
    return df

Filter data by number of sentences in text because we want to remove small text.

In [118]:
df = find_sentences(df)

In [119]:
def analyse_dataset(df):
    total_sentences = 0
    total_doc_less_10_sentences = 0
    max_sentences = 0 
    min_sentences = 100000
    for idx, row in df.iterrows():
        length = len(row['sentences'])
        total_sentences += length
        if length > max_sentences:
            max_sentences = length
        if length < min_sentences:
            min_sentences = length
        if length > 10:
            total_doc_less_10_sentences = total_doc_less_10_sentences + 1
    average_sentences = round(total_sentences / len(df))
    data = {
             "Total numb er of sentences": [total_sentences], 
             "number of docs with more than 10 sentences": [total_doc_less_10_sentences],
             "avg sentences/doc": [average_sentences],
             "max sentences/doc": [max_sentences],
             "min sentences/doc": [min_sentences]
           }
    return pd.DataFrame(data)

In [120]:
analyse_dataset(df)

Unnamed: 0,Total numb er of sentences,number of docs with more than 10 sentences,avg sentences/doc,max sentences/doc,min sentences/doc
0,19970,452,40,208,1


In [121]:
if DEV_MODE:
    df.to_csv(r'./datasets/dataset-dev.csv', index=False)
else:
    df.to_csv(r'./datasets/dataset.csv', index=False)
    analyse_dataset(df).to_csv(r'./datasets/dataset-analyse.csv', index=False)

# Use data already filter

In [122]:
filename = 'dataset-dev' if DEV_MODE else 'dataset'

with open(f'datasets/{filename}.csv', 'r') as source:
    df = pd.read_csv(source)
    # convert back the list of sentences from string representation in csv file
    for i, row in df.iterrows():
        df.at[i, 'sentences'] = ast.literal_eval(row['sentences'])

#### Remove docs with less than 10 sentences

In [123]:
def remove_small_docs(df):
    """
        filter to get only plot with more than 10 sentences
    """
    return df[df.apply(lambda x: len(x['sentences']) > MIN_SENTENCES_BY_TEXT, axis=1)]

In [124]:
df = remove_small_docs(df)

### Step 1 - Input matrix creation

In [125]:
def preprocess(text):
    # tokenization
    doc = nlp(text)
    # stop word removal
    doc = [token for token in doc if not token.is_stop]
    # lemmatization
    doc = [token.lemma_ for token in doc if token.lemma_.isalpha()]
    return doc 

In [126]:
def log_entropy_vectorizer(docs, tokenizer=preprocess):
    """
    perform log entropy vectorizer as stated in 
    ref: https://en.wikipedia.org/wiki/Latent_semantic_indexing
        g_i = 1 + sum (p_ij * log(p_ij))/log(n)
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from scipy.sparse import spdiags
    X = CountVectorizer(tokenizer=preprocess).fit_transform(docs)
    n_samples, n_features = X.shape
    gf = np.array(X.sum(axis=0)).ravel() # count total number of words
    P = (X * spdiags(1./gf, diags=0, m=n_features, n=n_features)) # probability matrix
    p = P.data
    P.data = 1 + (p * np.log(p) / np.log(n_samples))
    g = np.array(P.sum(axis=0)).ravel()
    X.data = np.log(1 + X.data)
    G = spdiags(g, diags=0, m=n_features, n=n_features)
    return np.array((X * G).todense())

In [127]:
def root_type_vectorizer(docs, tokenizer=preprocess):
    """
        Assign frequency if word is a noun and 0 otherwise
    """
    from sklearn.feature_extraction.text import CountVectorizer
    
    vectorizer = CountVectorizer(tokenizer=preprocess)
    X = np.array(vectorizer.fit_transform(docs).todense())
    n_samples, n_features = X.shape
    
    def check_noun(feature):
        doc = nlp(feature)
        return doc[0].tag_ == 'NNP'
    
    features_names = np.array(list(map(check_noun, vectorizer.get_feature_names_out())))
    for sampleIdx in range(0, n_samples):
        for featureIdx in range(0, n_features):
            if not features_names[featureIdx]:
                X[sampleIdx][featureIdx] = 0
    return X

#### Create TF-IDF matrix

In [128]:
def create_matrix(sentences: list, method_cell = "tf-idf"):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    
    if method_cell == "tf-idf":
        matrix = np.array(TfidfVectorizer(tokenizer=preprocess).fit_transform(sentences).T.todense())
    elif method_cell == "freq":
        matrix = np.array(CountVectorizer(tokenizer=preprocess).fit_transform(sentences).T.todense())
    elif method_cell == "binary":
        matrix = np.array(CountVectorizer(tokenizer=preprocess, binary=True).fit_transform(sentences).T.todense())
    elif method_cell == "log":
        matrix = log_entropy_vectorizer(sentences, tokenizer=preprocess).T
    elif method_cell == "root":
        matrix = root_type_vectorizer(sentences, tokenizer=preprocess).T
    
    return matrix

### Step 2 - Compute  Singular Value Decomposition

Singular Value Decomposition: SVD is an algebraic method that can model relationships among words/phrases and sen-tences. In this method, the given input matrixAis decomposed into three new matrices as follows:

In [129]:
def compute_SVD(matrix, num_concepts=5):
    from numpy.linalg import svd as singular_value_decomposition
    u, sigma, vt = singular_value_decomposition(matrix, full_matrices=False)
    return (u, sigma, vt)

### Step 3 - Sentence selection approaches

In [130]:
def baseline_strategy(sentences, num_sentences):
    """
        Pick the n first sentences to create the summary
    """
    best_sentences = []
    for idx in range(0, num_sentences):
            best_sentences.append(sentences[idx])
    return best_sentences

In [131]:
def gongliu_strategy(sentences, num_sentences, vt):
    """
        Gong and Liu strategy.
        Sentences are chosen until a predefined number of sentences have been collected.
    """
    best_sentences_idx = []
    best_sentences = []
    v = vt[:num_sentences]
    # for each topic (row)
    for idx, row in enumerate(v):
        # get the sentence with highest score (max value in the row => get the index to retrieve the sentence)
        best_sentences_idx.append(row.argmax())
    best_sentences_idx.sort()
    
    for idx in best_sentences_idx:
        best_sentences.append(sentences[idx])
    
    return best_sentences

In [132]:
def sj_strategy(sentences, num_sentences, vt, sigma):
    """
        Steinberger and Jezek strategy
    """
    best_sentences = []
    v = vt.T # get transpose of vt
    # for each sentences compute the length s
    saliency_vec = np.array(np.dot(np.square(sigma), np.square(v))).flatten()
    best_sentences_idx = saliency_vec.argsort()[-num_sentences:][::-1]
    # Return the sentences in the order in which they appear in the document
    best_sentences_idx.sort()
    for idx in best_sentences_idx:
        best_sentences.append(sentences[idx])
    return best_sentences

In [133]:
def murray_strategy(sentences, num_sentences, vt, sigma):
    """
        Murray et al. strategy.
    """
    best_sentences_idx = []
    best_sentences = []
    
    # if sigma equal to empty vector => gongliu strategy
    if sigma.sum() == 0:
        # for each concepts (row)
        for row in vt[:num_sentences]:
            # get the sentence with highest score (max value in the row => get the index to retrieve the sentence)
            best_sentences.append(sentences[row.argmax()])
        return best_sentences
  
    # get percentage
    num_sentences_by_topic = round((sigma.max() / sigma.sum()) * 100)
    
    i = 0 # count max sentences to pick by topic
    for row in vt[:num_sentences]:
        # for each topic calculate the n best sentences to pick where n = num_sentences_by_topic
        best_sentences_by_topic = np.array(row).flatten().argsort()[-num_sentences_by_topic:][::-1]
        
        # append to best_sentences_idx and check if summary is finish
        for sentence_idx in best_sentences_by_topic:
            best_sentences_idx.append(sentence_idx)
            i = i + 1
            if i == num_sentences:
                break
        if i == num_sentences:
                break
    
    # Return the sentences in the order in which they appear in the document
    best_sentences_idx.sort()
    # find sentences and build summary
    for idx in best_sentences_idx:
        best_sentences.append(sentences[idx])
    return best_sentences

In [134]:
def cross_strategy(sentences, num_sentences, vt, sigma):
    """
        Cross strategy by Ozsoy et al.
    """
    best_sentences = []
    
    topic_averages = np.array(vt.mean(axis=1)).flatten()
    vt = np.array(vt)
    # Set sentences whose scores fall below the topic average to zero
    # This removes less related sentences from each concept
    for topic_ndx, topic_avg in enumerate(topic_averages):
        vt[topic_ndx, vt[topic_ndx, :] <= topic_avg] = 0
    v = vt.T # get transpose of vt
    # for each sentences compute the length s
    saliency_vec = np.array(np.dot(np.square(sigma), np.square(v))).flatten()
    best_sentences_idx = saliency_vec.argsort()[-num_sentences:][::-1]
    # Return the sentences in the order in which they appear in the document
    best_sentences_idx.sort()
    for idx in best_sentences_idx:
        best_sentences.append(sentences[idx])
    return best_sentences    

In [135]:
def topic_strategy(sentences, num_sentences, vt, sigma):
    """
        Topic strategy by Ozsoy et al.
    """
    best_sentences = []
    
    vt = np.array(vt)
    topic_averages = np.array(vt.mean(axis=1)).flatten()
    # Set sentences whose scores fall below the topic average to zero
    # This removes less related sentences from each concept
    for topic_ndx, topic_avg in enumerate(topic_averages):
        vt[topic_ndx, vt[topic_ndx, :] <= topic_avg] = 0

    def find_common_sentences_score(topic1_idx, topic2_idx):
        sentences1 = vt[topic1_idx]
        sentences2 = vt[topic2_idx]
        sumSentences = 0
        for idx, s in enumerate(sentences1):
            if s != 0 and sentences2[idx] != 0:
                sumSentences += (s + sentences2[idx])
        return sumSentences

    # create topic x topic matrix
    topic_matrix = np.zeros(shape=(len(topic_averages),len(topic_averages)))
    for topic_ndx, topic_avg in enumerate(topic_averages):
        for t in range(0, len(topic_averages)):
            topic_matrix[topic_ndx][t] = find_common_sentences_score(topic_ndx, t)

    strength = np.array(topic_matrix.sum(axis=1)).flatten()

    main_topic_index = strength.argmax()

    best_sentences_idx = vt[main_topic_index].argsort()[-num_sentences:][::-1]
    # Return the sentences in the order in which they appear in the document
    best_sentences_idx.sort()
    for idx in best_sentences_idx:
        best_sentences.append(sentences[idx])
    return best_sentences

In [136]:
def select_sentences(sentences: list, u: np.matrix, sigma: np.ndarray, vt: np.matrix, num_sentences=5, strategy = "gongliu"):
    """
        Select sentences by using a strategy
    """
    # Baseline strategy
    if strategy == "baseline":
        return baseline_strategy(sentences, num_sentences)
    # Gong and Liu strategy
    elif strategy == "gongliu":
        return gongliu_strategy(sentences, num_sentences, vt)
    # Steinberger and Jezek strategy
    elif strategy == "sj":
        return sj_strategy(sentences, num_sentences, vt, sigma)
    # Murray et al.
    elif strategy == "mrc":
        return murray_strategy(sentences, num_sentences, vt, sigma)
    elif strategy == "cross":
        return cross_strategy(sentences, num_sentences, vt, sigma)
    elif strategy == "topic":
        return topic_strategy(sentences, num_sentences, vt, sigma)
        
    return best_sentences

### Summarize

In [137]:
def summarize_text(text, strategy="gongliu", method_cell="tf-idf", num_sentences=5):
    # get all sentences
    sentences = text['sentences']
    # build matrix td-idf
    matrix = create_matrix(sentences, method_cell=method_cell)
    # compute SVD
    u, sigma, vt = compute_SVD(matrix)
    # select best sentences
    best_sentences = select_sentences(sentences=sentences, u=u, sigma=sigma, vt=vt, strategy=strategy, num_sentences=num_sentences)
    summary = " ".join(best_sentences)
    """
    print(f"====== Original plot of movie {movie['title']} in {movie['year']} =====")
    print(movie['plot'])
    print("====== End of original text =====")



    print(f"\n========= Summary of movie {movie['title']} in {movie['year']} =========")
    print(summary)
    print("========= End of summary =========")
    
    print(f"\n========= Best Summary of movie {movie['title']} in {movie['year']} =========")
    print(movie['summary'])
    print("========= End of summary =========")
    """
    
    from rouge import Rouge
    rouge = Rouge()
    score = rouge.get_scores(summary, text['summary'])
   
    return (summary, score[0])
    
    

In [138]:
text = df.iloc[0]
strategy = "cross"
method_cell = "freq"
num_sentences = len(get_sentences(text['summary']))
summary, score = summarize_text(text=text, strategy=strategy, method_cell=method_cell, num_sentences=num_sentences)

### Evaluation

#### ROUGE

In [139]:
def evaluate_dataset(df, strategy="gongliu", method_cell="tf-idf"):
    scores = []
    
    # recall, precision, f1-score
    rouge1 = [0, 0, 0]
    rouge2 = [0, 0, 0]
    rougeL = [0, 0, 0]
    
    for index, row in df.iterrows():
        # print(strategy, method_cell, index, movie)
        summary, score = summarize_text(text=row, strategy=strategy, method_cell=method_cell)
        
        rouge1[0] += round(score['rouge-1']['r'], 3)
        rouge1[1] += round(score['rouge-1']['p'], 3)
        rouge1[2] += round(score['rouge-1']['f'], 3)
        
        rouge2[0] += round(score['rouge-2']['r'], 3)
        rouge2[1] += round(score['rouge-2']['p'], 3)
        rouge2[2] += round(score['rouge-2']['f'], 3)
        
        rougeL[0] += round(score['rouge-l']['r'], 3)
        rougeL[1] += round(score['rouge-l']['p'], 3)
        rougeL[2] += round(score['rouge-l']['f'], 3)

        
    for i in range(0, 3):
        rouge1[i] /= len(df);
        rouge2[i] /= len(df);
        rougeL[i] /= len(df);
        
    result_info = {
                    "strategy": strategy,
                    "method_cell": method_cell, 
                    "ROUGE-1 P": rouge1[0],
                    "ROUGE-1 R": rouge1[1],
                    "ROUGE-1 F1": rouge1[2],
                    "ROUGE-2 P": rouge2[0],
                    "ROUGE-2 R": rouge2[1],
                    "ROUGE-2 F1": rouge2[2],
                    "ROUGE-L P": rougeL[0],
                    "ROUGE-L R": rougeL[1],
                    "ROUGE-L F1": rougeL[2],
                  }
    
    return (strategy, method_cell, result_info)
    

In [140]:
method_cells = ['freq', 'binary', 'tf-idf', 'log', 'root']
strategies = ['baseline', "gongliu", "sj", "mrc", "cross", "topic"]

In [141]:
list_df_rouge1_P = []
list_df_rouge1_R = []
list_df_rouge1_F1 = []

list_df_rouge2_P = []
list_df_rouge2_R = []
list_df_rouge2_F1 = []

list_df_rougeL_P = []
list_df_rougeL_R = []
list_df_rougeL_F1 = []

for strategy in strategies:
    
    rouge1_P = []
    rouge1_R = []
    rouge1_F1 = []

    rouge2_P = []
    rouge2_R = []
    rouge2_F1 = []

    rougeL_P = []
    rougeL_R = []
    rougeL_F1 = []
    
    for method_cell in method_cells:
        strategy, method_cell, result_info = evaluate_dataset(df=df, strategy=strategy, method_cell=method_cell)        
        rouge1_P.append(result_info["ROUGE-1 P"])
        rouge1_R.append(result_info["ROUGE-1 R"])
        rouge1_F1.append(result_info["ROUGE-1 F1"])
        
        rouge2_P.append(result_info["ROUGE-2 P"])
        rouge2_R.append(result_info["ROUGE-2 R"])
        rouge2_F1.append(result_info["ROUGE-2 F1"])
        
        rougeL_P.append(result_info["ROUGE-L P"])
        rougeL_R.append(result_info["ROUGE-L R"])
        rougeL_F1.append(result_info["ROUGE-L F1"])
        print(f"finish strategy = {strategy} with method = {method_cell}")
        
    list_df_rouge1_P.append(pd.DataFrame(rouge1_P, index=method_cells, columns=[strategy]))          
    list_df_rouge1_R.append(pd.DataFrame(rouge1_R, index=method_cells, columns=[strategy]))
    list_df_rouge1_F1.append(pd.DataFrame(rouge1_F1, index=method_cells, columns=[strategy]))

    list_df_rouge2_P.append(pd.DataFrame(rouge2_P, index=method_cells, columns=[strategy]))          
    list_df_rouge2_R.append(pd.DataFrame(rouge2_R, index=method_cells, columns=[strategy]))
    list_df_rouge2_F1.append(pd.DataFrame(rouge2_F1, index=method_cells, columns=[strategy]))
                             
    list_df_rougeL_P.append(pd.DataFrame(rougeL_P, index=method_cells, columns=[strategy]))         
    list_df_rougeL_R.append(pd.DataFrame(rougeL_R, index=method_cells, columns=[strategy]))
    list_df_rougeL_F1.append(pd.DataFrame(rougeL_F1, index=method_cells, columns=[strategy]))
                             

finish strategy = baseline with method = freq
finish strategy = baseline with method = binary
finish strategy = baseline with method = tf-idf
finish strategy = baseline with method = log
finish strategy = baseline with method = root
finish strategy = gongliu with method = freq
finish strategy = gongliu with method = binary
finish strategy = gongliu with method = tf-idf
finish strategy = gongliu with method = log
finish strategy = gongliu with method = root
finish strategy = sj with method = freq
finish strategy = sj with method = binary
finish strategy = sj with method = tf-idf
finish strategy = sj with method = log
finish strategy = sj with method = root
finish strategy = mrc with method = freq
finish strategy = mrc with method = binary
finish strategy = mrc with method = tf-idf
finish strategy = mrc with method = log
finish strategy = mrc with method = root
finish strategy = cross with method = freq
finish strategy = cross with method = binary
finish strategy = cross with method = tf

In [142]:
df_result_rouge1_P= pd.concat(list_df_rouge1_P, axis=1)
df_result_rouge1_R = pd.concat(list_df_rouge1_R, axis=1)
df_result_rouge1_F1 = pd.concat(list_df_rouge1_F1, axis=1)

df_result_rouge2_P= pd.concat(list_df_rouge2_P, axis=1)
df_result_rouge2_R = pd.concat(list_df_rouge2_R, axis=1)
df_result_rouge2_F1 = pd.concat(list_df_rouge2_F1, axis=1)

df_result_rougeL_P= pd.concat(list_df_rougeL_P, axis=1)
df_result_rougeL_R = pd.concat(list_df_rougeL_R, axis=1)
df_result_rougeL_F1 = pd.concat(list_df_rougeL_F1, axis=1)

In [146]:
prefix = "_dev" if DEV_MODE else ""
folder_prefix = "/dev/" if DEV_MODE else "/"
df_result_rouge1_P.to_csv(fr'./results{folder_prefix}rouge1/rouge1_P{prefix}.csv', index=False)
df_result_rouge1_R.to_csv(fr'./results{folder_prefix}rouge1/rouge1_R{prefix}.csv', index=False)
df_result_rouge1_F1.to_csv(fr'./results{folder_prefix}rouge1/rouge1_F1{prefix}.csv', index=False)

df_result_rouge2_P.to_csv(fr'./results{folder_prefix}rouge2/rouge2_P{prefix}.csv', index=False)
df_result_rouge2_R.to_csv(fr'./results{folder_prefix}rouge2/rouge2_R{prefix}.csv', index=False)
df_result_rouge2_F1.to_csv(fr'./results{folder_prefix}rouge2/rouge2_F1{prefix}.csv', index=False)

df_result_rougeL_P.to_csv(fr'./results{folder_prefix}rougeL/rougeL_P{prefix}.csv', index=False)
df_result_rougeL_R.to_csv(fr'./results{folder_prefix}rougeL/rougeL_R{prefix}.csv', index=False)
df_result_rougeL_F1.to_csv(fr'./results{folder_prefix}rougeL/rougeL_F1{prefix}.csv', index=False)

In [160]:
df_result_rouge1_P.round(3)

Unnamed: 0,baseline,gongliu,sj,mrc,cross,topic
freq,0.29,0.311,0.291,0.235,0.295,0.312
binary,0.29,0.311,0.294,0.238,0.289,0.307
tf-idf,0.29,0.284,0.29,0.239,0.29,0.305
log,0.29,0.308,0.295,0.199,0.295,0.291
root,0.29,0.248,0.297,0.289,0.297,0.294


In [163]:
df_result_rouge1_R.round(3)

Unnamed: 0,baseline,gongliu,sj,mrc,cross,topic
freq,0.174,0.163,0.17,0.16,0.173,0.169
binary,0.174,0.158,0.172,0.16,0.176,0.169
tf-idf,0.174,0.193,0.173,0.176,0.174,0.183
log,0.174,0.173,0.176,0.159,0.177,0.171
root,0.174,0.191,0.172,0.173,0.173,0.172


In [165]:
df_result_rouge1_F1.round(3)

Unnamed: 0,baseline,gongliu,sj,mrc,cross,topic
freq,0.196,0.195,0.195,0.17,0.198,0.2
binary,0.196,0.193,0.196,0.172,0.197,0.199
tf-idf,0.196,0.205,0.197,0.181,0.197,0.206
log,0.196,0.202,0.199,0.154,0.2,0.196
root,0.196,0.186,0.197,0.195,0.198,0.197


In [167]:
df_result_rouge2_P.round(3)

Unnamed: 0,baseline,gongliu,sj,mrc,cross,topic
freq,0.066,0.064,0.061,0.036,0.062,0.07
binary,0.066,0.061,0.06,0.037,0.059,0.063
tf-idf,0.066,0.06,0.06,0.042,0.062,0.068
log,0.066,0.065,0.063,0.024,0.059,0.056
root,0.066,0.05,0.063,0.06,0.063,0.06


In [171]:
df_result_rouge2_R.round(3)

Unnamed: 0,baseline,gongliu,sj,mrc,cross,topic
freq,0.032,0.029,0.032,0.022,0.032,0.032
binary,0.032,0.028,0.031,0.022,0.033,0.03
tf-idf,0.032,0.036,0.031,0.028,0.032,0.035
log,0.032,0.032,0.032,0.017,0.032,0.029
root,0.032,0.032,0.031,0.03,0.031,0.031


In [173]:
df_result_rouge2_F1.round(3)

Unnamed: 0,baseline,gongliu,sj,mrc,cross,topic
freq,0.037,0.035,0.037,0.024,0.037,0.039
binary,0.037,0.034,0.036,0.024,0.037,0.036
tf-idf,0.037,0.039,0.036,0.029,0.037,0.04
log,0.037,0.038,0.037,0.017,0.036,0.033
root,0.037,0.032,0.036,0.034,0.037,0.036


In [153]:
df_result_rougeL_P

Unnamed: 0,baseline,gongliu,sj,mrc,cross,topic
freq,0.273843,0.29242,0.276429,0.222896,0.278226,0.295312
binary,0.273843,0.2914,0.277133,0.224819,0.272544,0.290449
tf-idf,0.273843,0.268617,0.273513,0.226527,0.273195,0.286726
log,0.273843,0.289918,0.277746,0.18904,0.278934,0.27465
root,0.273843,0.233454,0.280018,0.274232,0.280739,0.278049


In [154]:
df_result_rougeL_R

Unnamed: 0,baseline,gongliu,sj,mrc,cross,topic
freq,0.162538,0.152314,0.159854,0.150872,0.161954,0.157644
binary,0.162538,0.146491,0.160697,0.150644,0.164852,0.157982
tf-idf,0.162538,0.181582,0.16098,0.165582,0.162279,0.17048
log,0.162538,0.161577,0.164239,0.15129,0.165473,0.159717
root,0.162538,0.178387,0.160316,0.162058,0.16127,0.161173


In [178]:
df_result_rougeL_F1.round(3)

Unnamed: 0,baseline,gongliu,sj,mrc,cross,topic
freq,0.184,0.182,0.184,0.161,0.186,0.188
binary,0.184,0.179,0.184,0.162,0.185,0.186
tf-idf,0.184,0.193,0.184,0.17,0.184,0.193
log,0.184,0.189,0.187,0.146,0.188,0.183
root,0.184,0.174,0.184,0.183,0.186,0.185


In [176]:
df_result_rougeL_F1.mean().round(3)

baseline    0.184
gongliu     0.183
sj          0.185
mrc         0.165
cross       0.186
topic       0.187
dtype: float64