In [297]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import pickle
from tensorflow.keras.optimizers import Adadelta
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, concatenate, Dropout, Input

In [298]:
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Flatten, Dropout, InputLayer, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import model_from_json
import numpy as np
import pandas as pd

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

from sklearn.manifold import TSNE
import plotly.express as px

In [299]:
import sys
sys.path.insert(1, '/scratch/cinthiasouza/mv-text-summarizer')

from src import summarization
import numpy as np
import re
import pickle

import itertools

In [300]:
def remove_ascii(text):
    
    return re.sub(r'[^\x00-\x7F]+', '', str(text))


In [320]:
def pipeline_summarization(
    features, scores, references, predictions, section, name_models,
    summ_items, k=3, sort_scores=True, proba=False, ascending=False):
    
    X_test = features
    y_test = scores
    
    #X_test['sentences'] = X_test['sentences'].str
    vfunc = np.vectorize(remove_ascii)
    X_test['sentences'] = vfunc(X_test['sentences'])
    #X_test['sentences'] = X_test['sentences'].apply(remove_ascii)
    
    df_proba = summarization.create_df(name_models, X_test, y_test['rouge_1'], predictions, section, proba=proba)
    
    if proba:
        df = summarization.binarize_proba(df_proba.copy(), name_models, k, sort_scores, ascending)
    else:
        df = df_proba.copy()
        
    summaries = summarization.create_summaries(df, references, summ_items, name_models)
    
    result = summarization.evaluate_summaries(summaries, name_models)
    
    return df_proba, df, summaries, result

# MV

In [302]:
def load_mlp(path_to_save, name_model, section):
    
    json_file = open('{}/{}_{}.json'.format(path_to_save, name_model, section), 'r')
    model = json_file.read()
    json_file.close()
    model = model_from_json(model)
    model.load_weights('{}/{}_{}.h5'.format(path_to_save, name_model, section))
    print("Loaded model from disk")
    model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(
                            learning_rate=0.001), metrics=[keras.metrics.Precision()])
    
    return model

In [303]:
def load_models(dataset, sections, name_models, columns, path_to_save, embed_path):
     
    predictions_proba = {}
    models = {}

    for section in sections:

        aux = {}
        aux_models = {}
        for name_model in name_models:

            model = load_mlp(path_to_save, name_model, section)
            
            if (name_model == 'mv_mlp_bert') or (name_model == 'mv_mlp'):
                X_test_embed = pd.read_csv("{}_{}_test.csv".format(embed_path, section))

                y_test_embed = X_test_embed['label']
                X_test_embed = X_test_embed[columns]
            
            X_test_features = dataset[section][1]
            y_test_features = to_categorical(dataset[section][3])

            if (name_model == 'mv_mlp_bert') or (name_model == 'mv_mlp'):
                y_pred = model.predict([X_test_embed, X_test_features])
            elif name_model == "mlp":
                y_pred = model.predict(X_test_features)

            aux[name_model] = y_pred
            aux_models[name_model] = model

        predictions_proba[section]= aux
        models[section] = aux_models
        
    return predictions_proba, models

In [315]:
X_test_embed = pd.read_csv("dataset/embed_bert_{}_test.csv".format( section))

y_test_embed = X_test_embed['label']
X_test_embed = X_test_embed[columns]

X_test_features = dataset[section][1]
#y_test_features = to_categorical(dataset[section][3])

In [313]:
folder_to_save = 'models_v1'
path_to_save = "/scratch/cinthiasouza/mv-text-summarizer/notebook/{}".format(folder_to_save)

sections=['introduction', 'materials', 'conclusion']
name_models = ['mv_mlp_bert']

In [306]:
summ_items = pd.read_csv("dataset/indices_summ.csv")['summ']

In [308]:
features_intro = pd.read_csv('dataset/dataset_introduction.csv')
features_mat = pd.read_csv('dataset/dataset_materials.csv')
features_conc = pd.read_csv('dataset/dataset_conclusion.csv')

In [309]:
references_df= pd.read_csv('dataset/references_df.csv')

In [310]:
columns = list(range(0, 383))
columns = list(map(str, columns))

In [311]:
with open('dataset/dataset_{}.pkl'.format('features'), 'rb') as fp:
    dataset = pickle.load(fp)

In [317]:
embed_path = 'dataset/embed_bert'
predictions_proba, models = load_models(dataset, sections, name_models, columns, path_to_save, embed_path)

Loaded model from disk
Loaded model from disk
Loaded model from disk


In [321]:
def create_summaries(df, references, articles, name_models):

    df_summaries = pd.DataFrame()

    
    for name_model in name_models:
        
        summaries = []
        articles_list = []

        for article in articles:
        
            try: 
                text = df.loc[df['articles'] == article]
                summaries.append(' '.join(text.loc[text[name_model] == 1]['sentences'].values))
                articles_list.append(article)
            except TypeError:
                summaries.append("")
        
        
        df_summaries[name_model] = summaries
    
    df_summaries['articles'] = articles
    
    df_summaries = df_summaries.merge(references, on='articles')
    
    return  df_summaries


In [319]:
def binarize_proba(df, name_models, k=3, sort_scores=True, ascending=False):
    
    grouped_df = df.groupby('articles')
    len(grouped_df)
    
    for name_model in name_models:

        labels = []
        j = 0
        for idx, group in grouped_df:

            labels.append(summarization.create_label(group.reset_index(drop=True), name_model, k, sort_scores, ascending))    
      
        merged = list(itertools.chain(*labels))
        df[name_model] = merged
        
    return df

In [191]:
import itertools

In [322]:
section = 'introduction'

df = dataset[section][5].reset_index(drop=True)
features_intro = df[['sentences', 'articles']]
scores_intro = pd.DataFrame()
scores_intro['rouge_1'] = df['rouge_1']

proba_intro_ex1, df_intro_ex1, summaries_intro_ex1, result_intro_ex1 = pipeline_summarization(
    features_intro, scores_intro, references_df, predictions_proba, section, name_models,
    summ_items, sort_scores=True, proba=True, ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['sentences'] = vfunc(X_test['sentences'])


In [323]:
section = 'materials'

df = dataset[section][5].reset_index(drop=True)
features_mat = df[['sentences', 'articles']]
scores_mat = pd.DataFrame()
scores_mat['rouge_1'] = df['rouge_1']

proba_mat_ex1, df_mat_ex1, summaries_mat_ex1, result_mat_ex1 = pipeline_summarization(
    features_mat, scores_mat, references_df, predictions_proba, section, name_models,
    summ_items, sort_scores=True, proba=True, ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['sentences'] = vfunc(X_test['sentences'])


In [324]:
section = 'conclusion'

df = dataset[section][5].reset_index(drop=True)
features_conc = df[['sentences', 'articles']]
scores_conc = pd.DataFrame()
scores_conc['rouge_1'] = df['rouge_1']

df_proba_conc_ex1, df_conc_ex1, summaries_conc_ex1, result_conc_ex1 = pipeline_summarization(
    features_conc, scores_conc, references_df, predictions_proba, section, name_models,
    summ_items, sort_scores=True, proba=True, ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['sentences'] = vfunc(X_test['sentences'])


In [325]:
summaries_intro_ex1.describe()

Unnamed: 0,mv_mlp_bert_r1,mv_mlp_bert_r2,mv_mlp_bert_rl
count,171.0,171.0,171.0
mean,0.262703,0.099953,0.171134
std,0.08191,0.058081,0.060759
min,0.071429,0.0,0.038647
25%,0.215716,0.053909,0.12965
50%,0.257426,0.093897,0.168831
75%,0.31894,0.139535,0.208396
max,0.471545,0.271429,0.354839


In [326]:
summaries_mat_ex1.describe()

Unnamed: 0,mv_mlp_bert_r1,mv_mlp_bert_r2,mv_mlp_bert_rl
count,171.0,171.0,171.0
mean,0.205834,0.09812,0.15274
std,0.138334,0.137775,0.132108
min,0.0,0.0,0.0
25%,0.110601,0.018738,0.073903
50%,0.163743,0.047431,0.114754
75%,0.269312,0.107018,0.176408
max,0.818182,0.815385,0.818182


In [327]:
summaries_conc_ex1.describe()

Unnamed: 0,mv_mlp_bert_r1,mv_mlp_bert_r2,mv_mlp_bert_rl
count,171.0,171.0,171.0
mean,0.27816,0.128414,0.190985
std,0.105228,0.114344,0.102635
min,0.035398,0.0,0.017699
25%,0.211465,0.05276,0.120606
50%,0.27451,0.092715,0.164179
75%,0.335768,0.160317,0.220789
max,0.578431,0.574257,0.578431


In [328]:
summaries_comb_ex1 = summarization.combine_three_summ(summaries_intro_ex1, summaries_mat_ex1, summaries_conc_ex1, references_df, name_models)

In [329]:
result_comb_ex1 = summarization.evaluate_summaries(summaries_comb_ex1, name_models)

In [330]:
result_comb_ex1.describe()

Unnamed: 0,mv_mlp_bert_r1,mv_mlp_bert_r2,mv_mlp_bert_rl
count,171.0,171.0,171.0
mean,0.38052,0.176018,0.230729
std,0.092544,0.101517,0.082924
min,0.106904,0.013423,0.066815
25%,0.310167,0.096272,0.166786
50%,0.378641,0.150685,0.213018
75%,0.443461,0.252142,0.283366
max,0.583691,0.476596,0.490706
