# Text Summarization Menggunakan Metode Maximum Marginal Relevance (MMR)


Link repo github kode: https://github.com/fajri91/Text-Summarization-MMR

In [None]:
# Ketik kode disini
!pip install sastrawi

# Import modul (tools)

In [None]:
# Ketik kode disini
import re
import requests
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import operator

# Tahap Stemming pada teks berbahasa Indonesia

In [None]:
# Build the Sastrawi stemmer once at module level; cleanData() reuses this
# single `stemmer` instance for every sentence it processes.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Load stopwords list dari repo github, yang akan dipakai untuk menghapus stopwords yang ada di teks

url: https://raw.githubusercontent.com/Wayan123/Sentiment-Analysis/main/stopwordlist.txt

In [None]:
# Ketik kode disini
# Stopword list documented for this notebook (one word per line / whitespace separated).
STOPWORDS_URL = (
  "https://raw.githubusercontent.com/Wayan123/Sentiment-Analysis/main/stopwordlist.txt"
)


def load_stopWords(url=STOPWORDS_URL, timeout=30):
  """Download the stopword list and return it as a list of `str` words.

  Args:
    url: Location of a plain-text stopword file whose words are separated
      by whitespace. Defaults to the stopword list referenced by this notebook.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    list[str]: The stopwords, in file order.

  Raises:
    requests.HTTPError: If the server answers with an error status, so an
      error page is never silently used as a stopword list.
  """
  response = requests.get(url, timeout=timeout)
  response.raise_for_status()
  # Use the decoded text, not `.content`: `.content` is bytes, and bytes
  # tokens never compare equal to the `str` words tested in cleanData().
  return response.text.split()

stopwords = load_stopWords()

# Kita bisa melihat list stopwords dengan run stopwords

In [None]:
# Show the loaded stopword list as the notebook cell output
stopwords

## Membuat fungsi untuk stemmer

In [None]:
# Ketik kode di sini
def cleanData(sentence):
  """Stem a sentence and remove its stopwords.

  The sentence is passed through the module-level `stemmer`, split on
  whitespace, and every token found in the module-level `stopwords`
  collection is discarded.

  Args:
    sentence: The raw sentence text.

  Returns:
    str: The surviving stemmed tokens joined by single spaces (an empty
    string when nothing survives).
  """
  stemmed_tokens = stemmer.stem(sentence).split()
  kept_tokens = [token for token in stemmed_tokens if token not in stopwords]
  return " ".join(kept_tokens)

## Membuat fungsi untuk vektor kata

In [None]:
# Ketik kode di sini
def getVectorSpace(cleanSet):
  """Return the vocabulary of a collection of cleaned sentences.

  Args:
    cleanSet: Iterable of sentence strings whose words are separated by
      whitespace.

  Returns:
    A dict keys view holding each distinct word once, in order of first
    appearance.
  """
  vocab = {}
  for data in cleanSet:
    for word in data.split():
      # Key on the word itself; keying on `data` would collect whole
      # sentences instead of building a word vocabulary.
      vocab[word] = 0
  return vocab.keys()

## Membuat fungsi untuk menghitung cosine similarity

In [None]:
# Fungsi untuk menghitung cosine similarity
def calculateSimilarity(sentence, doc):
  """Return the cosine similarity between a sentence and a group of sentences.

  All sentences in `doc` are merged into one pseudo-document; both it and
  `sentence` are turned into word-count vectors over their shared
  vocabulary and compared with cosine similarity.

  Args:
    sentence: A cleaned sentence (words separated by whitespace).
    doc: A list of cleaned sentences to compare against.

  Returns:
    The cosine similarity score, or 0 when `doc` is empty or when neither
    side contains any word.
  """
  if not doc:
    return 0

  # A dict is used as an insertion-ordered set of words.
  # The sentence must be split first: iterating the string directly yields
  # single characters, which would leave the sentence's own words out of
  # the vocabulary.
  vocab = dict.fromkeys(sentence.split())
  for text in doc:
    vocab.update(dict.fromkeys(text.split()))

  # Nothing to compare (only empty strings); avoid building a vectorizer
  # with an empty vocabulary.
  if not vocab:
    return 0

  cv = CountVectorizer(vocabulary=list(vocab))
  # The vocabulary is fixed, so both texts can be vectorized in one pass:
  # row 0 is the merged document, row 1 is the sentence.
  vectors = cv.fit_transform([' '.join(doc), sentence])
  return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]


# Load raw data teks dari repo github

url: https://raw.githubusercontent.com/fajri91/Text-Summarization-MMR/master/news_data4.txt

In [None]:
# Load the raw news text to summarize (the dataset documented for this notebook)
r = requests.get(
  'https://raw.githubusercontent.com/fajri91/Text-Summarization-MMR/master/news_data4.txt',
  timeout=30,
)
# Fail loudly on an HTTP error instead of summarizing an error page
r.raise_for_status()
r.encoding = r.apparent_encoding
# splitlines() also handles '\r\n' endings, so no stray '\r' stays in the text
texts = r.text.splitlines()

In [None]:
# Show the text that will be summarized as the notebook cell output
texts


In [None]:
# Split the text into sentences and keep three aligned views of them:
#   sentences          - original sentence fragments, in document order
#   clean              - the stemmed / stopword-free form of each fragment
#   originalSentenceOf - cleaned form -> original fragment (last one wins
#                        when two fragments clean to the same string)
sentences = []
clean = []
originalSentenceOf = {}

for line in texts:
  for part in line.split('.'):
    # A trailing period or a blank line produces empty fragments; they are
    # not sentences and would otherwise inflate the summary size budget.
    if not part.strip():
      continue
    cl = cleanData(part)
    sentences.append(part)
    clean.append(cl)
    originalSentenceOf[cl] = part
# Unique cleaned sentences, used as the comparison document when scoring
setClean = set(clean)

## Menghitung nilai cosine similarity

In [None]:
# Score each cleaned sentence by its similarity to all the other unique
# cleaned sentences of the document (the sentence itself is excluded).
scores = {}
for data in clean:
  scores[data] = calculateSimilarity(data, list(setClean - {data}))

In [None]:
# Build the summary with Maximum Marginal Relevance (MMR).
# Budget: 20% of the sentence count. `n` is a float and the loop runs while it
# is positive, so a fractional budget is effectively rounded up.
n = 20 * len(sentences) / 100
# Weight between relevance to the document and redundancy with the summary
alpha = 0.5
summarySet = []
while n > 0:
  # MMR score of every sentence that has not been selected yet
  mmr = {}
  for sentence in scores.keys():
    if sentence not in summarySet:
      mmr[sentence] = alpha * scores[sentence] - (1 - alpha) * calculateSimilarity(sentence, summarySet)

  # Every sentence is already in the summary: nothing left to pick
  if not mmr:
    break

  # Pick once per round, only after ALL candidates have been scored, so the
  # same sentence is never appended twice and `n` counts selected sentences.
  selected = max(mmr.items(), key=operator.itemgetter(1))[0]
  summarySet.append(selected)
  n -= 1

## Menampilkan hasil Summary dari hasil perhitungan Algoritma MMR

In [None]:
# Print the summary: the original text of each selected sentence, in
# selection order, with leading spaces removed.
print('\nSummary (hasil teks yang diringkas):\n')
for sentence in summarySet:
  original_text = originalSentenceOf[sentence]
  print(original_text.lstrip(' '))
print('')

## Menampilkan perbandingan teks summary dengan teks asli sebelum diringkas. Setiap kalimat yang dijadikan summary akan di-highlight kuning.

In [None]:
from termcolor import colored

# Print the full original text, highlighting in yellow every sentence that
# was selected for the summary.
print ('==========================================================================')
print ('\nOriginal Passages (Teks Asli):\n')

# Set membership keeps the per-sentence lookup cheap
selected_sentences = set(summarySet)

# `sentences` and `clean` are filled in lockstep, so pairing them prints each
# sentence's own original text even when two sentences share a cleaned form.
for original, cleaned in zip(sentences, clean):
  text = original.lstrip(' ')
  if cleaned in selected_sentences:
    print (colored(text, 'yellow'))
  else:
    print (text)