# Text Summarization Using the TextRank Algorithm

# Importing the libraries

In [9]:
# Standard library
import re

# Third-party
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize

# Fetch the Punkt sentence-tokenizer data (only needs to succeed once per environment).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Uploading Data

In [10]:
# Upload the CSV file through the Colab file-upload helper.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import files
# `uploaded` is not read again in the cells shown here; later cells open the saved file by name.
uploaded = files.upload()

Saving skripsi_pendahuluan.csv to skripsi_pendahuluan.csv


In [11]:
# Sanity check that the uploaded file can be opened.
# Note: this prints the file object's repr (name, mode, encoding), not the file's contents.
with open('skripsi_pendahuluan.csv') as f:
    print(f)

<_io.TextIOWrapper name='skripsi_pendahuluan.csv' mode='r' encoding='UTF-8'>


In [12]:
import io  # NOTE(review): not used by any cell shown here
# Load the dataset: fields are separated by ';' and the first column becomes the row index.
# NOTE(review): the file is decoded as latin-1 here — confirm this matches how the CSV was actually saved.
df = pd.read_csv('skripsi_pendahuluan.csv', index_col=0, sep=';', encoding='latin-1')


In [13]:
# Preview the first rows of the loaded frame to confirm the columns parsed as expected.
df.head()

Unnamed: 0_level_0,skripsi_judul,skripsi_isi,skripsi_link
skripsi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,SISTEM BERBASIS PENGETAHUAN UNTUK MENGIDENTIFI...,Burung kenari (Serinus Canaria) adalah salah s...,https://eprints.utdi.ac.id/7326/
2,\t\nPEMROGRAMAN APLIKASI MOBILE BERBASIS ANDR...,Office of International Affairs (OIA) atau Kan...,https://eprints.utdi.ac.id/7287/
3,APLIKASI PENGENALAN HAMA DAN PENYAKIT PADA TAN...,Pertanian merupakan kegiatan pemanfaatan sumbe...,https://eprints.utdi.ac.id/7301/
4,PEMETAAN LAHAN PERTANIAN TANAMAN ORGANIK DI WI...,Struktur tanah daerah pegunungan di kecamatan ...,https://eprints.utdi.ac.id/7242/
5,Tehnologi Platfrom Virtualisasi Untuk Aplikasi...,Pada jaman sekarang kebutuhan internet merupak...,https://eprints.utdi.ac.id/7274/


# Preprocessing the Data

In [14]:
# Split the text of each document into sentences (one list of sentences per row).
sentences = [sent_tokenize(text) for text in df['skripsi_isi']]

In [15]:
# Flatten the per-document lists into one list of sentences.
# Caution: running this cell a second time iterates over the sentence strings
# themselves and would break every sentence into single characters.
flat_sentences = []
for doc_sentences in sentences:
  flat_sentences.extend(doc_sentences)
sentences = flat_sentences

In [16]:
# Remove punctuation, digits and special characters: every character that is
# not an ASCII letter is replaced by a space.
# regex=True must be explicit: without it, newer pandas versions treat the
# pattern as a literal string and nothing would be replaced.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# Lowercase every sentence; the result is a plain Python list of strings.
clean_sentences = [s.lower() for s in clean_sentences]

  clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")


In [17]:
# Fetch the NLTK stopwords corpus (only needs to succeed once per environment).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
from nltk.corpus import stopwords
# Indonesian stopword list from NLTK; read by remove_stopwords.
stop_words = stopwords.words('indonesian')

In [19]:
# function to remove stopwords
def remove_stopwords(sen, stopword_list=None):
  """Return the tokens of `sen` that are not stopwords, joined by single spaces.

  Args:
    sen: iterable of string tokens (e.g. the result of `sentence.split()`).
    stopword_list: optional iterable of words to drop. When omitted, the
      notebook-level `stop_words` list is used, as before.

  Returns:
    A single string with the remaining tokens separated by one space; an
    empty string when nothing remains.
  """
  if stopword_list is None:
    stopword_list = stop_words
  # A set makes each membership test constant-time instead of a list scan.
  excluded = set(stopword_list)
  return " ".join(token for token in sen if token not in excluded)

# for i in sen:
#   if i not in stop_words:
#     sen_new += i

In [20]:
# remove stopwords from the sentences
# Tokenise each cleaned sentence on whitespace and drop its stopwords.
clean_sentences = [remove_stopwords(sentence.split()) for sentence in clean_sentences]

# Downloading the word embeddings

In [21]:
# download pretrained GloVe word embeddings
! wget http://nlp.stanford.edu/data/glove.6B.zip

--2023-01-13 13:08:44--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-01-13 13:08:44--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-01-13 13:08:45--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [22]:
! unzip glove*.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


# Extracting word vectors

In [23]:
# Extract word vectors: map each token in the GloVe file to its coefficient
# vector (float32), e.g. {"fox": array([0.82, ...]), ...}.
word_embeddings = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # First field is the token, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

# Formation of sentence vectors

In [24]:
# Build one 100-dimensional vector per cleaned sentence: the sum of its word
# vectors divided by (word count + 0.001). Words missing from the embedding
# table contribute a zero vector; empty sentences become a zero vector.
sentence_vectors = []
for cleaned in clean_sentences:
  if len(cleaned) == 0:
    vec = np.zeros((100,))
  else:
    tokens = cleaned.split()
    token_vectors = [word_embeddings.get(tok, np.zeros((100,))) for tok in tokens]
    vec = sum(token_vectors)/(len(tokens)+0.001)
  sentence_vectors.append(vec)

In [25]:
# Number of sentence vectors built (one per entry in clean_sentences).
len(sentence_vectors)

114

# Finding similarities by using cosine similarity
The next step is to find similarities among the sentences. We will use cosine similarity to find similarity between a pair of sentences. Let's create an empty similarity matrix for this task and populate it with cosine similarities of the sentences.

In [26]:
# Square similarity matrix, one row and one column per sentence, initialised to zero.
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Compute all pairwise cosine similarities in one vectorised call instead of
# one cosine_similarity call per (i, j) pair.
# Rows are stacked as float64 so the matrix dtype matches the np.zeros default.
sim_mat = cosine_similarity(np.vstack(sentence_vectors).astype(np.float64))
# A sentence is not compared with itself: keep the diagonal at zero.
np.fill_diagonal(sim_mat, 0.0)

# Forming Graph from similarity Matrix

In [30]:
import networkx as nx

# Build a weighted graph whose nodes are sentence indices and whose edge
# weights are the pairwise similarities.
nx_graph = nx.from_numpy_array(sim_mat)
# nx.pagerank_numpy is deprecated and was removed in NetworkX 3.0; nx.pagerank
# is the supported entry point and likewise returns a {node: score} dict.
scores = nx.pagerank(nx_graph)

  scores = nx.pagerank_numpy(nx_graph)


# Sorting and printing Summary

In [31]:
# Pair every sentence with its PageRank score, then order from highest to lowest score.
scored_sentences = [(scores[idx], sentence) for idx, sentence in enumerate(sentences)]
ranked_sentences = sorted(scored_sentences, reverse=True)

In [34]:
# Specify number of sentences to form the summary
sn = 10

# Generate summary: print the top-ranked sentences, numbered from 0.
# Capped at the number of ranked sentences so a corpus with fewer than `sn`
# sentences does not raise IndexError.
for i in range(min(sn, len(ranked_sentences))):
  print(f"{i}.{ranked_sentences[i][1]}")

0.Ilmu itu meliputi tanaman dan cara perawatannya, lahan, pengairan, dan bahkan
metode pemasaran hasil pertanian.
1.Pracaya
(1999: 1) menyatakan untuk bisa berhasil petani perlu tahu ilmu bercocok tanam.
2.Yang terakhir,
metode pemasaran hasil pertanian, faktor ini menjadi indikator apakah sebuah
proses kegiatan pertanian ini berhasil atau tidak.
3.Saat hasil melebihi biaya
perawatan, kegiatan pertanian dikatakan berhasil, begitu pula sebaliknya.
4.Sistem pertanian organik adalah suatu sistem
produksi pertanian dimana bahan organik, baik makhluk hidup maupun yang
sudah mati, merupakan faktor penting dalam proses produksi.
5.Berdasarkan permasalahan tersebut dibutuhkan aplikasi yang bertujuan
untuk mempermudah pembeli hasil panen atau yang disebut sebagai pengepul
dalam melihat lahan pertanian organik yang berada dikecamatan Kokap.
6.Tak ayal saat ini sangat banyak hobis kicauan yang
berbondong-bondong mencari dan merawat calon-calon burung kenari yang akan
diperlombakan.
7.Dari situlah

# End