<a href="https://colab.research.google.com/github/Atfssene/FRASA/blob/main/Text_Summarization_Model_FRASA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Summarization Model

In this notebook, we build a model for the text summarization task. TextRank and SumBasic are used as feature extractors: they turn each sentence into weights that are fed to a neural network. Let's start!

## Import library

In [362]:
!pip install Sastrawi
# !pip install fasttext



In [363]:
# Import library
import ast
import re

import pandas as pd
import numpy as np
import networkx as nx
import tensorflow as tf
import nltk
nltk.download('punkt')

from tensorflow.keras.optimizers import RMSprop
from nltk.tokenize import sent_tokenize, word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

# For pre trained text embedding from FastText
# import gzip
# import fasttext
# import fasttext.util

# Build the Sastrawi stop-word list once; Process() drops every token found in `stop_words`.
factory = StopWordRemoverFactory()
stop_words = factory.get_stop_words()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read data

In [364]:
# Download the train/test splits.
train = tf.keras.utils.get_file('train.csv', 'https://raw.githubusercontent.com/Atfssene/FRASA/main/Text%20Summarization/train.csv')
test = tf.keras.utils.get_file('test.csv', 'https://raw.githubusercontent.com/Atfssene/FRASA/main/Text%20Summarization/test.csv')

# `labels` holds the text of a Python list, so it needs a converter.
# ast.literal_eval only accepts literals, whereas eval() would execute any
# expression found in the downloaded file.
# dtype is given only for the text columns: passing a dtype AND a converter
# for the same column makes pandas emit a ParserWarning and ignore the dtype.
text_dtypes = {'paragraphs': object, 'summary': object}
label_converter = {'labels': ast.literal_eval}

df_train = pd.read_csv(train, dtype=text_dtypes, converters=label_converter)
df_test = pd.read_csv(test, dtype=text_dtypes, converters=label_converter)
df_train.info()
# df_test.info()

  after removing the cwd from sys.path.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15012 entries, 0 to 15011
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   labels      15012 non-null  object
 1   paragraphs  15012 non-null  object
 2   summary     15012 non-null  object
dtypes: object(3)
memory usage: 352.0+ KB


  """


## TextRank

Load the pretrained word embeddings

In [365]:
# Load the pretrained word vectors into a {token: float32 vector} dict.
# Each data row is "<token> <EMBEDDING_DIM floats>". fastText .vec files
# normally start with a "<vocab_size> <dim>" header line, which must not be
# stored as if it were a word vector.
EMBEDDING_DIM = 300
word_embeddings = {}
with open('/content/drive/MyDrive/model_summarization/cc.id.vec', encoding='utf-8') as file:
  for line in file:
      values = line.split()
      # Skip the header line and any row that does not carry exactly
      # EMBEDDING_DIM coefficients.
      if len(values) != EMBEDDING_DIM + 1:
          continue
      word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

len(word_embeddings)
# <output> 400001

# Sort key for the ranked lists built by TextRank / SumBasic
def sorting(e):
  """Return the element at index 2 of ``e``.

  In the ranked rows ``[weight, rank, sentence_order, sentence]`` this is the
  sentence order, so sorting with this key restores document order.
  """
  third_field = e[2]
  return third_field

TextRank Algorithm

In [366]:
def TextRank(sentences, processed):
    """Score the sentences of one document with TextRank.

    Every cleaned sentence is embedded as the (approximately) mean vector of
    its words, a cosine-similarity graph is built over the sentences, and
    PageRank is run on that graph.

    Parameters
    ----------
    sentences : list of str
        Raw sentences in document order.
    processed : list of str
        Cleaned text of each sentence, aligned index-by-index with
        ``sentences``.

    Returns
    -------
    tuple (TR_weight, TR_order)
        Two lists in document order: the PageRank score of each sentence and
        its 1-based rank by score (1 = highest score).
    """
    n = len(sentences)
    if n == 0:
        # No sentences: nothing to rank.
        return [], []

    embedding_dim = 300

    sentence_vectors = []
    for text in processed:
        words = text.split()
        if words:
            # The +0.001 in the denominator is kept from the original formula.
            v = sum([word_embeddings.get(w, np.zeros((embedding_dim,))) for w in words]) / (len(words) + 0.001)
        else:
            # Empty (or whitespace-only) sentence -> zero vector.
            v = np.zeros((embedding_dim,))
        sentence_vectors.append(v)

    # One pairwise call instead of n*n single-pair calls; self-similarity is
    # zeroed so the graph has no self-loops.
    sim_mat = np.asarray(cosine_similarity(np.vstack(sentence_vectors[:n])), dtype=float)
    np.fill_diagonal(sim_mat, 0.0)

    nx_graph = nx.from_numpy_array(sim_mat)
    # nx.pagerank_numpy was removed in NetworkX 3.0; use it when present
    # (same results as before) and fall back to nx.pagerank otherwise.
    pagerank = getattr(nx, 'pagerank_numpy', None) or nx.pagerank
    scores = pagerank(nx_graph)

    # Rank by score, highest first; ties go to the later sentence, as the
    # original list comparison on [score, position, sentence] did.
    ranked = sorted(range(n), key=lambda i: (scores[i], i), reverse=True)
    rank_of = {idx: rank for rank, idx in enumerate(ranked, start=1)}

    # list(TextRank weight, TextRank order, sentence order, sentence) => text_rank
    text_rank = [[scores[i], rank_of[i], i + 1, s] for i, s in enumerate(sentences)]

    TR_weight = [row[0] for row in text_rank]
    TR_order = [row[1] for row in text_rank]
    # Only the two lists (TextRank weights, TextRank order) are returned
    return TR_weight, TR_order

Example result from variable text_rank:


```
[0.05728266277281181, 7, 1, 'Jakarta, CNN Indonesia - - Dokter Ryan Thamrin, yang terkenal lewat acara Dokter Oz Indonesia, meninggal dunia pada Jumat (4 / 8) dini hari.']
[0.060217967742173646, 2, 2, 'Dokter Lula Kamal yang merupakan selebriti sekaligus rekan kerja Ryan menyebut kawannya itu sudah sakit sejak setahun yang lalu.']
[0.06045704497658339, 1, 3, 'Lula menuturkan, sakit itu membuat Ryan mesti vakum dari semua kegiatannya, termasuk menjadi pembawa acara Dokter Oz Indonesia.']
[0.05209530959721178, 15, 4, 'Kondisi itu membuat Ryan harus kembali ke kampung halamannya di Pekanbaru, Riau untuk menjalani istirahat. "']
[0.05840185989845859, 4, 5, 'Setahu saya dia orangnya sehat, tapi tahun lalu saya dengar dia sakit.']
[0.05637943589724592, 10, 6, '( Karena) sakitnya, ia langsung pulang ke Pekanbaru, jadi kami yang mau jenguk juga susah.']
[0.05646634343163575, 9, 7, 'Barangkali mau istirahat, ya betul juga, kalau di Jakarta susah isirahatnya, " kata Lula kepada CNNIndonesia.com, Jumat (4 / 8).']
[0.058732876811454365, 3, 8, 'Lula yang mengenal Ryan sejak sebelum aktif berkarier di televisi mengaku belum sempat membesuk Ryan lantaran lokasi yang jauh.']
[0.055390279489910994, 12, 9, 'Dia juga tak tahu penyakit apa yang diderita Ryan. "']
[0.053688773306564144, 14, 10, 'Itu saya enggak tahu, belum sempat jenguk dan enggak selamanya bisa dijenguk juga.']
[0.05747455457427858, 6, 11, 'Enggak tahu berat sekali apa bagaimana, " tutur Ryan.']
[0.055102006046349405, 13, 12, 'Walau sudah setahun menderita sakit, Lula tak mengetahui apa penyebab pasti kematian Dr Oz Indonesia itu.']
[0.05763968547747263, 5, 13, 'Meski demikian, ia mendengar beberapa kabar yang menyebut bahwa penyebab Ryan meninggal adalah karena jatuh di kamar mandi.']
[0.05690594567038913, 8, 14, '“ Saya tidak tahu, barangkali penyakit yang dulu sama yang sekarang berbeda, atau penyebab kematiannya beda dari penyakit sebelumnya.']
[0.048397823134637225, 18, 15, 'Kita kan enggak bisa mengambil kesimpulan, " kata Lula.']
[0.056112156670864166, 11, 16, 'Ryan Thamrin terkenal sebagai dokter yang rutin membagikan tips dan informasi kesehatan lewat tayangan Dokter Oz Indonesia.']
[0.04974842217789095, 16, 17, 'Ryan menempuh Pendidikan Dokter pada tahun 2002 di Fakultas Kedokteran Universitas Gadjah Mada.']
[0.049506852324067645, 17, 18, 'Dia kemudian melanjutkan pendidikan Klinis Kesehatan Reproduksi dan Penyakit Menular Seksual di Mahachulalongkornrajavidyalaya University, Bangkok, Thailand pada 2004.']
```



## SumBasic

In [417]:
def SumBasic(sentences, processed):
  """Score the sentences of one document by summed relative word frequency.

  Word counts are taken over all cleaned sentences and divided by the highest
  count; a sentence's weight is the sum of the relative frequencies of its
  tokens. A sentence without tokens gets the small constant 0.0000001.

  Parameters
  ----------
  sentences : list of str
      Raw sentences in document order.
  processed : list of str
      Cleaned text of each sentence, aligned index-by-index with
      ``sentences``.

  Returns
  -------
  tuple (SB_weight, SB_order)
      Two lists in document order: the weight of each sentence and its
      1-based rank by weight (1 = highest weight).
  """
  # Tokenize every cleaned sentence once; reused for counting and scoring.
  tokenized = [word_tokenize(text) if len(text) != 0 else [] for text in processed]

  # Count the words over the whole document
  frequency = {}
  for tokens in tokenized:
    for word in tokens:
      frequency[word] = frequency.get(word, 0) + 1

  # Normalise by the most frequent word. Guarded because max() raises on an
  # empty vocabulary (document with no tokens at all).
  if frequency:
    max_fre = max(frequency.values())
    for word in frequency:
      frequency[word] = frequency[word] / max_fre

  # Score the weight for every sentence
  scores = {}
  for i, tokens in enumerate(tokenized):
    if tokens:
      total = frequency[tokens[0]]
      for word in tokens[1:]:
        total += frequency[word]
      scores[i] = total
    else:
      scores[i] = 0.0000001

  # Rank by weight, highest first; ties go to the later sentence, as the
  # original list comparison on [score, position, sentence] did.
  ranked = sorted(range(len(sentences)), key=lambda i: (scores[i], i), reverse=True)
  rank_of = {idx: rank for rank, idx in enumerate(ranked, start=1)}

  # list(SumBasic weight, SumBasic order, sentence order, sentence) => sum_bas
  sum_bas = [[scores[i], rank_of[i], i + 1, s] for i, s in enumerate(sentences)]

  SB_weight = [row[0] for row in sum_bas]
  SB_order = [row[1] for row in sum_bas]
  # Only the two lists (SumBasic weights, SumBasic order) are returned
  return SB_weight, SB_order

Example result from variable sum_bas :


```
[4.727272727272726, 1, 0, 'Jakarta, CNN Indonesia - - Dokter Ryan Thamrin, yang terkenal lewat acara Dokter Oz Indonesia, meninggal dunia pada Jumat (4 / 8) dini hari.']
[3.909090909090908, 4, 1, 'Dokter Lula Kamal yang merupakan selebriti sekaligus rekan kerja Ryan menyebut kawannya itu sudah sakit sejak setahun yang lalu.']
[4.09090909090909, 2, 2, 'Lula menuturkan, sakit itu membuat Ryan mesti vakum dari semua kegiatannya, termasuk menjadi pembawa acara Dokter Oz Indonesia.']
[1.9999999999999998, 12, 3, 'Kondisi itu membuat Ryan harus kembali ke kampung halamannya di Pekanbaru, Riau untuk menjalani istirahat. "']
[1.0909090909090908, 17, 4, 'Setahu saya dia orangnya sehat, tapi tahun lalu saya dengar dia sakit.']
[0.9090909090909092, 18, 5, '( Karena) sakitnya, ia langsung pulang ke Pekanbaru, jadi kami yang mau jenguk juga susah.']
[2.0, 11, 6, 'Barangkali mau istirahat, ya betul juga, kalau di Jakarta susah isirahatnya, " kata Lula kepada CNNIndonesia.com, Jumat (4 / 8).']
[3.7272727272727266, 5, 7, 'Lula yang mengenal Ryan sejak sebelum aktif berkarier di televisi mengaku belum sempat membesuk Ryan lantaran lokasi yang jauh.']
[1.8181818181818183, 13, 8, 'Dia juga tak tahu penyakit apa yang diderita Ryan. "']
[1.6363636363636362, 14, 9, 'Itu saya enggak tahu, belum sempat jenguk dan enggak selamanya bisa dijenguk juga.']
[2.090909090909091, 9, 10, 'Enggak tahu berat sekali apa bagaimana, " tutur Ryan.']
[2.0909090909090904, 10, 11, 'Walau sudah setahun menderita sakit, Lula tak mengetahui apa penyebab pasti kematian Dr Oz Indonesia itu.']
[2.2727272727272725, 7, 12, 'Meski demikian, ia mendengar beberapa kabar yang menyebut bahwa penyebab Ryan meninggal adalah karena jatuh di kamar mandi.']
[2.181818181818181, 8, 13, '“ Saya tidak tahu, barangkali penyakit yang dulu sama yang sekarang berbeda, atau penyebab kematiannya beda dari penyakit sebelumnya.']
[1.2727272727272727, 16, 14, 'Kita kan enggak bisa mengambil kesimpulan, " kata Lula.']
[3.9090909090909083, 3, 15, 'Ryan Thamrin terkenal sebagai dokter yang rutin membagikan tips dan informasi kesehatan lewat tayangan Dokter Oz Indonesia.']
[2.545454545454545, 6, 16, 'Ryan menempuh Pendidikan Dokter pada tahun 2002 di Fakultas Kedokteran Universitas Gadjah Mada.']
[1.636363636363636, 15, 17, 'Dia kemudian melanjutkan pendidikan Klinis Kesehatan Reproduksi dan Penyakit Menular Seksual di Mahachulalongkornrajavidyalaya University, Bangkok, Thailand pada 2004.']
```



## Main Process

In [368]:
# Row-wise feature extraction for a [labels, paragraphs, summary] row,
# meant to be used with DataFrame.apply(..., axis=1).

def _split_and_clean(paragraphs):
  """Split text into sentences and return (raw sentences, cleaned sentences)."""
  # The text is sentence-tokenized, then every piece is tokenized again and
  # the result flattened (same two-pass split as before, so sentence counts
  # stay aligned with the labels).
  sentences = [piece
               for sentence in sent_tokenize(paragraphs)
               for piece in sent_tokenize(sentence)]

  stop_set = set(stop_words)  # set membership instead of scanning the list
  processed = []
  for text in sentences:
    text = text.lower()
    text = re.sub(r"[^a-zA-Z]", " ", text)     # keep letters only
    text = re.sub(r"\b\w{1,3}\b"," ",text)     # drop words of 1-3 characters
    text = " ".join([word for word in text.split() if word not in stop_set])
    processed.append(text)
  return sentences, processed


def Process(rows, axis):
  """Return one feature of a data row, selected by ``axis``.

  Parameters
  ----------
  rows : mapping with the keys 'labels' and 'paragraphs'
      One row of the dataset.
  axis : int
      0 = golden labels converted from False/True to 0/1
      1 = TextRank weights
      2 = TextRank order
      3 = SumBasic weights
      4 = SumBasic order
      5 = raw sentences
      6 = cleaned sentences

  Returns
  -------
  list
      The selected feature, one entry per sentence.

  Raises
  ------
  ValueError
      If ``axis`` is not one of the values above (previously ``None`` was
      returned silently).
  """
  if axis == 0:
    # Labels need no text processing. `==` (not `is`) is intentional so that
    # 0/1 and numpy booleans are converted too; anything else is skipped.
    labels = []
    for label in rows['labels']:
      if label == True:
        labels.append(1)
      elif label == False:
        labels.append(0)
    return labels

  if axis not in (1, 2, 3, 4, 5, 6):
    raise ValueError("axis must be an integer from 0 to 6, got %r" % (axis,))

  sentences, processed = _split_and_clean(rows['paragraphs'])

  if axis == 1:
    # Calling TextRank
    textrank, _ = TextRank(sentences, processed)
    return textrank
  elif axis == 2:
    # Calling TextRank
    _, textrank_order = TextRank(sentences, processed)
    return textrank_order
  elif axis == 3:
    # Calling SumBasic
    sumbasic, _ = SumBasic(sentences, processed)
    return sumbasic
  elif axis == 4:
    # Calling SumBasic
    _, sumbasic_order = SumBasic(sentences, processed)
    return sumbasic_order
  elif axis == 5:
    return sentences
  return processed


Pre-processing of the raw text for feature extraction follows these rules:
1. Split the paragraphs into sentences.
2. Lowercase all letters.
3. Replace every non-alphabetic character (punctuation, digits, symbols) with a space.
4. Remove words of three characters or fewer.
5. Remove stopwords.

In [369]:
# Run the main process
# TextRank and SumBasic each return (weights, order). They are called once per
# row here instead of once per output column, which avoids running each
# algorithm twice on every document.
def _extract_row_features(row):
  """Return (labels, TextRank, TextRank_order, SumBasic, SumBasic_order) for one row."""
  sentences = Process(row, 5)
  processed = Process(row, 6)
  textrank, textrank_order = TextRank(sentences, processed)
  sumbasic, sumbasic_order = SumBasic(sentences, processed)
  return Process(row, 0), textrank, textrank_order, sumbasic, sumbasic_order

_extracted_columns = ['labels', 'TextRank', 'TextRank_order', 'SumBasic', 'SumBasic_order']
_extracted_rows = [_extract_row_features(row) for _, row in df_train.iterrows()]

for _position, _column in enumerate(_extracted_columns):
  # Wrap in a Series so every per-row list is stored as a single cell.
  df_train[_column] = pd.Series([features[_position] for features in _extracted_rows],
                                index=df_train.index, dtype=object)

In [421]:
# NOTE(review): this repeats the SumBasic assignment from the previous cell.
# It looks like a re-run after SumBasic was redefined (that cell has a later
# execution count) — confirm whether it is still needed on a fresh run.
df_train['SumBasic'] = df_train.apply(lambda row: Process(row, 3), axis=1)

*Estimated running time until this cell: 55m 40s*

Save the feature extracted data

In [422]:
# Columns that make up the extracted-feature dataset (also reused when flattening).
columns = ["TextRank", "TextRank_order", "SumBasic", "SumBasic_order", "labels"]

# Write only those columns, with their names as the header and no index column.
df_train[columns].to_csv('extracted_train.csv', index=False)

In [423]:
# Reload the extracted features from the path they were written to.
# The list-valued cells were stored as text, so each column is parsed back
# with ast.literal_eval, which accepts literals only (unlike eval).
list_columns = ['labels', 'TextRank', 'TextRank_order', 'SumBasic', 'SumBasic_order']
train_df = pd.read_csv('extracted_train.csv',
                       converters={name: ast.literal_eval for name in list_columns})

train_df.head()

Unnamed: 0,TextRank,TextRank_order,SumBasic,SumBasic_order,labels
0,"[0.05728266277281181, 0.060217967742173646, 0....","[7, 2, 1, 15, 4, 10, 9, 3, 12, 14, 6, 13, 5, 8...","[4.727272727272726, 3.909090909090908, 4.09090...","[1, 4, 2, 12, 17, 18, 11, 5, 13, 14, 9, 10, 7,...","[False, True, True, True, False, False, False,..."
1,"[0.026138283329021877, 0.02563616928768256, 0....","[10, 17, 22, 21, 16, 6, 37, 20, 15, 2, 24, 31,...","[2.2999999999999994, 1.6500000000000001, 0.850...","[15, 19, 33, 20, 14, 13, 2, 18, 23, 6, 11, 38,...","[False, False, False, False, False, True, True..."
2,"[0.04179513271518677, 0.0421326008108034, 0.04...","[14, 10, 19, 5, 22, 6, 12, 8, 13, 1, 16, 17, 9...","[5.111111111111111, 2.9444444444444438, 5.4999...","[5, 12, 3, 1, 22, 8, 9, 6, 13, 10, 7, 24, 19, ...","[True, True, False, False, False, False, False..."
3,"[0.09797460546421852, 0.10349622916295231, 0.0...","[6, 3, 8, 7, 9, 1, 2, 10, 4, 5, 11]","[7.800000000000002, 6.6000000000000005, 1.0, 1...","[1, 3, 10, 8, 6, 2, 4, 9, 5, 7, 11]","[True, True, False, False, False, True, False,..."
4,"[0.07450063348091279, 0.08897369766666585, 0.0...","[10, 1, 7, 12, 6, 4, 5, 8, 11, 9, 2, 3, 13]","[1.9230769230769231, 3.846153846153846, 2.1538...","[8, 4, 7, 12, 9, 5, 2, 6, 11, 10, 1, 3, 13]","[False, True, True, True, True, False, False, ..."


Normalize the weights before feeding them into the neural network

In [424]:
def normalization(list_weight):
  """Normalize one document's weights as a single vector.

  The weights are laid out as one row and passed to
  ``preprocessing.normalize`` with its default settings; the result is
  returned as a flat list.
  """
  row_vector = np.array(list_weight).reshape(1, -1)
  unit_row = preprocessing.normalize(row_vector)
  return list(unit_row.ravel())

In [425]:
# Normalize the two weight columns document by document.
for weight_column in ('TextRank', 'SumBasic'):
  train_df[weight_column] = train_df[weight_column].map(normalization)

train_df.head()

Unnamed: 0,TextRank,TextRank_order,SumBasic,SumBasic_order,labels
0,"[0.24255901797208415, 0.2549883404992296, 0.25...","[7, 2, 1, 15, 4, 10, 9, 3, 12, 14, 6, 13, 5, 8...","[0.41623982322717157, 0.3441983153609304, 0.36...","[1, 4, 2, 12, 17, 18, 11, 5, 13, 14, 9, 10, 7,...","[False, True, True, True, False, False, False,..."
1,"[0.16484753481259254, 0.16168082868780004, 0.1...","[10, 17, 22, 21, 16, 6, 37, 20, 15, 2, 24, 31,...","[0.16815219798411393, 0.12063092464077743, 0.0...","[15, 19, 33, 20, 14, 13, 2, 18, 23, 6, 11, 38,...","[False, False, False, False, False, True, True..."
2,"[0.20445256790497116, 0.2061033873725153, 0.19...","[14, 10, 19, 5, 22, 6, 12, 8, 13, 1, 16, 17, 9...","[0.2935987466856812, 0.1691384084167511, 0.315...","[5, 12, 3, 1, 22, 8, 9, 6, 13, 10, 7, 24, 19, ...","[True, True, False, False, False, False, False..."
3,"[0.3137515673717481, 0.33143388496523324, 0.30...","[6, 3, 8, 7, 9, 1, 2, 10, 4, 5, 11]","[0.5004114211179124, 0.4234250486382335, 0.064...","[1, 3, 10, 8, 6, 2, 4, 9, 5, 7, 11]","[True, True, False, False, False, True, False,..."
4,"[0.2604685931716319, 0.31106921884700495, 0.29...","[10, 1, 7, 12, 6, 4, 5, 8, 11, 9, 2, 3, 13]","[0.11590855399509058, 0.23181710799018113, 0.1...","[8, 4, 7, 12, 9, 5, 2, 6, 11, 10, 1, 3, 13]","[False, True, True, True, True, False, False, ..."


In [426]:
# Flatten the per-document lists so that every sentence becomes one row.

flat_df = pd.DataFrame(columns=columns)
for column in columns:
  # Explode just this column; the index is renumbered from 0.
  flat_df[column] = train_df[column].explode(ignore_index=True)
flat_df

Unnamed: 0,TextRank,TextRank_order,SumBasic,SumBasic_order,labels
0,0.242559,7,0.41624,1,False
1,0.254988,2,0.344198,4,True
2,0.256001,1,0.360208,2,True
3,0.220594,15,0.176101,12,True
4,0.247298,4,0.0960553,17,False
...,...,...,...,...,...
268914,0.208422,2,0.307507,3,False
268915,0.194041,15,0.159056,15,False
268916,0.203506,6,0.243885,5,False
268917,0.189631,16,0.0477167,25,False


In [427]:
# Summary of the flattened frame; the output below reports count/unique/top/freq for every column.
flat_df.describe()

Unnamed: 0,TextRank,TextRank_order,SumBasic,SumBasic_order,labels
count,268919.0,268919,268919.0,268919,268919
unique,267459.0,76,231904.0,76,2
top,0.031662,1,0.008289,1,False
freq,14.0,15012,14.0,15012,218673


In [461]:
import altair as alt

# Plot the last 5000 sentences (not a random sample): SumBasic rank against
# TextRank rank, coloured by label.
sample = flat_df.tail(5000)
# sample = flat_df.sample(n=5000, random_state=2020)
scatter = alt.Chart(sample).mark_circle()
scatter.encode(x='SumBasic_order', y='TextRank_order', color='labels')

## Neural Network

In [462]:
# Small fully connected classifier: 4 input features -> 4 -> 8 (relu) -> 1 (sigmoid).
# The sigmoid output is trained with binary cross-entropy against the 0/1 label.
model = tf.keras.models.Sequential([tf.keras.layers.Dense(4, input_shape=[4]),
                                    tf.keras.layers.Dense(8, activation='relu'),
                                    tf.keras.layers.Dense(1, activation='sigmoid')
                                    ])

# `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
model.compile(optimizer = RMSprop(learning_rate=0.001),
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_8 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 9         
Total params: 69
Trainable params: 69
Non-trainable params: 0
_________________________________________________________________


In [474]:
# Encode the target as integers: 1 where the label equals True, 0 for anything else.
flat_df['labels'] = flat_df['labels'].map(lambda label: int(label == True))

In [478]:
target_column = 'labels'

feature_columns = ['TextRank', 'TextRank_order', 'SumBasic', 'SumBasic_order']

# Feature matrix as float32 and the target vector, both as numpy arrays.
x_train = flat_df[feature_columns].to_numpy().astype(np.float32)
y_train = flat_df[target_column].to_numpy()

# Quick sanity check of types, shapes and the second sample.
for summary in (type(x_train), x_train.shape, x_train[1], x_train[1].shape,
                type(y_train), y_train.shape, y_train[1]):
  print(summary)

<class 'numpy.ndarray'>
(268919, 4)
[0.25498834 2.         0.34419832 4.        ]
(4,)
<class 'numpy.ndarray'>
(268919,)
1


In [479]:
# Training stops early once the training accuracy reaches this value.
DESIRED_ACCURACY = 0.99
class myCallback(tf.keras.callbacks.Callback):
    """Keras callback that stops training when accuracy reaches DESIRED_ACCURACY."""

    def on_epoch_end(self, epochs, logs=None) :
        """Check the 'accuracy' entry of ``logs`` after each epoch.

        ``logs`` defaults to None rather than a shared mutable dict; a missing
        dict or a missing 'accuracy' entry leaves training untouched.
        """
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy >= DESIRED_ACCURACY:
            # The message is derived from the threshold so the two cannot disagree.
            print(f'\nReached {DESIRED_ACCURACY:.1%} accuracy so cancelling training!')
            self.model.stop_training = True

callbacks = myCallback()


history = model.fit(x_train, y_train, epochs=100, callbacks=[callbacks])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78