In [1]:
import sys
import glob, os
import json
import pickle
import numpy as np
import pandas as pd
sys.path.insert(1, '/scratch/cinthiasouza/mv-text-summarizer')


In [2]:
%load_ext autoreload
%autoreload 2

from src import main_create_dataset
from src import preprocess
from src import extract_features
from src import tokenizer
from src import create_features_df
from src import transform_data
from src import loader
from src import utils

In [10]:
def create_label(df):
    
    aux = []
    grouped = df.groupby('articles')

    for idx, group in grouped:
    
        label = [0 for i in range(len(group))]
        group = group.reset_index(drop=True)
        j = 0
        for index, row in group.sort_values('rouge_1', ascending=False).iterrows():
            
            label[index] = 1
            j +=1

            if j==3:
                break
        group['label'] = label
        aux.append(group)
        
    
    return pd.concat(aux)

In [9]:
def convert_to_list(x):
    
    return x.strip('][').split(', ')

In [3]:
data_path = 'result_plosonev2'

In [4]:
path_base = f'/scratch/cinthiasouza/mv-text-summarizer/{data_path}'

In [5]:
path_to_read= f'/scratch/cinthiasouza/mv-text-summarizer/{data_path}'

In [6]:
path_to_save= f'/scratch/cinthiasouza/mv-text-summarizer/{data_path}'

In [7]:
sections = ['introduction', 'materials', 'conclusion']

In [11]:
for section in sections:

    paths_features = glob.glob(f'{path_base}/{section}/features_*.csv')

    
    #dfs_features = [pd.read_csv(i) for i in paths_features]
    
    dfs_features = []
    
    for i in paths_features:
        try:
            dfs_features.append(pd.read_csv(i))
        except pd.errors.EmptyDataError:
            pass

    features = pd.concat(dfs_features).reset_index(drop=True)
    
    features['rouges'].apply(convert_to_list)
    rouge_scores = pd.DataFrame(features['rouges'].apply(convert_to_list).tolist(), columns=['rouge_1','rouge_2', 'rouge_l'])
    features = pd.concat([features, rouge_scores], axis=1)
    
    features = create_label(features)

    features.to_csv(f"{path_base}/dataset_{section}.csv", index=False)

In [14]:
import random

dataset_intro = pd.read_csv(f'{path_base}/dataset_introduction.csv')
dataset_materials = pd.read_csv(f'{path_base}/dataset_materials.csv')
dataset_conclusion = pd.read_csv(f'{path_base}/dataset_conclusion.csv')

intro = pd.unique(dataset_intro['articles'])
mat = pd.unique(dataset_materials['articles'])
conc = pd.unique(dataset_conclusion['articles'])

comuns = list((set(intro) & set(mat)) & set(conc))

valid_len = int(len(comuns)*0.2)
summ_items = random.sample(comuns, valid_len)

df = pd.DataFrame({'summ': summ_items})
df.to_csv(f"/scratch/cinthiasouza/mv-text-summarizer/{data_path}/indices_summ.csv", index=False)

In [15]:
train_columns = ['lsa_rank', 'text_rank', 'lex_rank', 'count_postag',
       'count_ner', 'sentence_len', 'count_one_gram', 'count_two_gram',
       'count_three_gram', 'tfisf', 'position_score', 'dist_centroid']
    
under_columns = ['sentences', 'articles', 'lsa_rank', 'text_rank', 'lex_rank', 'count_postag',
       'count_ner', 'sentence_len', 'count_one_gram', 'count_two_gram',
       'count_three_gram', 'tfisf', 'position_score', 'dist_centroid', 'rouge_1', 'label']

In [16]:
dataset = main_create_dataset.main(
    train_columns, under_columns, sections, summ_items=summ_items, path_to_read=f'/scratch/cinthiasouza/mv-text-summarizer/{data_path}', 
    path_to_save=f'/scratch/cinthiasouza/mv-text-summarizer/{data_path}', 
    name_csv="features", format_dataset=True, verbose=True)

Preparando dataset para os classificadores
Treinamento dos modelos
Duration: 0:01:37.596720


In [36]:
with open('{}/dataset_{}.pkl'.format(path_to_save, 'features'), 'wb') as fp:
    pickle.dump(dataset, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Create Sentence Embeddings

In [37]:
with open('{}/dataset_{}.pkl'.format(path_to_read,'features'), 'rb') as fp:
    dataset = pickle.load(fp)

In [18]:
import spacy
nlp_md = spacy.load('en_core_web_md')

In [19]:
def create_embeedings(sentence):
    
    doc = nlp_md(sentence)
    
    return doc.vector

In [20]:
bases = ['X_train_nf', 'X_test_nf']

In [None]:
columns = [str(i) for i in range(300)]

for section in sections:
    for base in bases:

        embeddings_df = pd.DataFrame()
        embeddings_df['sentences'] = dataset[section][base]['sentences']
        embeddings_df['articles'] = dataset[section][base]['articles']
        embeddings_df['label'] = dataset[section][base]['label']

        embeddings_df ['result'] = embeddings_df['sentences'].apply(create_embeedings)

        embeddings_df[columns] = pd.DataFrame(embeddings_df.result.tolist(), index= embeddings_df.index)

        embeddings_df = embeddings_df.drop('result', axis = 1)

        dataset[section][base.replace("nf", 'embedd')]=embeddings_df
        
with open('{}/dataset_{}.pkl'.format(path_to_save, 'features'), 'wb') as fp:
    pickle.dump(dataset, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Create References DF

In [26]:
paths = glob.glob("/scratch/cinthiasouza/plosone/train/*.txt")

In [None]:
summ_items = pd.read_csv(f"/scratch/cinthiasouza/mv-text-summarizer/{data_path}/indices_summ.csv")['summ'].tolist()

In [27]:
summ_items = [str(i).replace("\n", "")for i in summ_items]

In [28]:
len(pd.unique(summ_items))

2608

In [29]:
codes = []
references = []

for path in paths:
    
    print(path)
    
    with open(path, 'r') as f:
        files = f.readlines()

    for file in files:

        data = json.loads(file)
        code = data.get('id') 
        if str(code).replace("\n", "") in summ_items:
            print(code)
            codes.append(code)
            references.append(data.get('abstract'))
                              
references_df = pd.DataFrame()
references_df['articles'] = codes
references_df['references'] = references
#references_df['references'] = references_df['references'].replace("<S>", "")
#references_df['references'] = references_df['references'].replace("</S>", "")


/scratch/cinthiasouza/plosone/train/batch_40000.txt
19041499
26497624
30337765
24174707
17676506
28339306
23733444
31992494
30633112
29471284
29351998
26523321
22517310
28868528
20159126
31823248
29634661
20046966
21798716
25652454
24817258
29224160
20953316
23366153
26780348
26990822
30344458
30859172
21036445
19404729
24771634
21787822
33716358
21601569
24211337
20194449
19361544
24476472
22998693
26558723
21954911
29721937
30072533
33214798
29938249
19823856
25308853
26038953
22367405
31477784
19524612
19084033
26617474
31715388
28503002
23041620
18956493
19570621
21994068
20360595
24670844
25432173
31262666
24468022
28452675
29565241
18804043
25285389
25355423
23483333
24980246
23879685
20123335
29056813
23907443
26232892
21128132
26791259
21703824
19255771
26740727
23452680
32807654
25382888
26279500
19533772
20976812
30537073
25588153
23399963
24470430
27327779
25040290
33010726
26891053
19124030
20045711
25295329
22546549
20051930
18848604
27798610
18801474
29046597
20730025
214

20116205
29996679
25304379
32383076
24692227
20493953
/scratch/cinthiasouza/plosone/train/batch_100000.txt
28825859
22495215
21460583
20186126
31833918
27871075
20433625
26900038
20605212
22495800
32062441
23816386
25549701
28433422
28661969
21446723
22424019
21062645
24963894
25582580
29214512
21661002
24311820
19716629
29096528
26308239
25194229
20838777
28930784
24269133
23114061
21094944
24685842
30586682
19057421
20207026
19637103
27271189
21132074
23379262
30882533
23300226
21088255
27922568
22641389
24870023
21132060
21838506
24631141
29063601
27084025
18687054
27047654
27000328
33564752
21862928
23995490
28792228
22000068
27909986
23705761
31111121
26317766
33108344
29782238
24832961
19120088
29623007
19690147
24374535
22042372
27647671
20889937
28383199
30358386
30593645
21360723
25797633
18957678
26484395
32910021
31887305
19702760
27722021
24242196
19101235
22882812
31768417
29369415
25644748
23031747
19661794
30954728
20625536
20186699
20851170
21649680
24518259
30350640
26

25914199
26530227
26505234
19931436
23055238
30543547
24893105
27002509
23219480
27457252
19833435
29962510
19329104
30335669
20731335
20564074
32153312
28567714
23374414
26141454
25689301
26384973
28818439
28567574
23630220
24478380
26362355
20804841
19130463
23721899
26213891
29570159
19086303
22425605
29968601
20101071
21391238
32645578
29797556
31424102
23396964
30218781
20223598
30349112
26091896
30529736
22290556
21427355
31121171
19833158
25818173
27523440
32524616
19527169
21648016
26866901
20642376
25200001
29053449
27917018
19739925
32343516
31632509
21516303
29040167
20478742
31951026
19474685
21866412
21421630
30919006
30394058
23972695
20167528
24101059
24084219
26193424
25189518
23380347
26162857
20571136
30395555
21736869
22915265
29239900
24870580
27542703
27330196
27747258
21821293
18689622
18672562
30218303
25468647
28166114
25207497
21466807
28153571
29056872
25419002
31551665
20482945
31431381
24983707
23478805
30069744
28626278
30216275
19928778
20505085
28765196
2

In [31]:

references_df.to_csv(f"/scratch/cinthiasouza/mv-text-summarizer/{data_path}/references_df.csv", index=False)