In [37]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
stop_words = set(stopwords.words('indonesian'))

def preprocess_text(text):
    sentences = sent_tokenize(text)
    return sentences

def calculate_word_frequency(sentences):
    word_freq = FreqDist()
    for sentence in sentences:
        words = word_tokenize(sentence)
        for word in words:
            if word not in stop_words:
                word_freq[word] += 1
    return word_freq

def calculate_sentence_scores(word_freq, sentences):
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        words = word_tokenize(sentence)
        score = 0
        for word in words:
            if word in word_freq:
                score += word_freq[word]
        sentence_scores[i] = score
    return sentence_scores

def calculate_similarity_matrix(sentences):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j])

    return similarity_matrix

def sentence_similarity(sent1, sent2):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([sent1, sent2])
    similarity = cosine_similarity(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1))
    return similarity[0][0]

def apply_text_rank(similarity_matrix, num_iters=100, d=0.85):
    scores = np.ones(len(similarity_matrix)) / len(similarity_matrix)

    for _ in range(num_iters):
        scores = (1 - d) + d * np.dot(similarity_matrix.T, scores)

    return scores

def generate_summary(sentences, scores, N=3):
    top_sentences = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:N]
    summary = [sentences[i] for i in top_sentences]
    return ' '.join(summary)

def main(text):

    sentences = preprocess_text(text)

    # Extractive summarization using word frequency
    word_freq = calculate_word_frequency(sentences)
    sentence_scores_freq = calculate_sentence_scores(word_freq, sentences)

    # TextRank-based summarization
    similarity_matrix = calculate_similarity_matrix(sentences)
    scores_text_rank = apply_text_rank(similarity_matrix)

    # Combine scores from both methods
    combined_scores = [0.5 * scores_text_rank[i] + 0.5 * sentence_scores_freq[i] for i in range(len(sentences))]

    # Generate the final summary
    N = 3  # Change N to the desired number of summary sentences
    final_summary = generate_summary(sentences, combined_scores, N)

    return final_summary

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
import pandas as pd

filename = '/content/drive/MyDrive/CAPSTONE/Dataset/summarization/BANDUNG_altero-bistronomie-cipaganti.csv'
filename2 ='/content/drive/MyDrive/CAPSTONE/Dataset/summarization/BANDUNG_akasya-teras-riau.csv'
filename3 = '/content/drive/MyDrive/CAPSTONE/Dataset/summarization/BANDUNG_sambal-lalap-wong-solo-dipatiukur.csv'

In [39]:
import csv

def combine(file_name):
    with open(file_name, 'r') as file_csv:
        reader = csv.reader(file_csv)
        column = [row[2] for row in reader]
    pg = ' '.join(column)
    return pg

text1 = combine(filename)
text2 = combine(filename2)
text3 = combine(filename3)
print(text1)

top-part ini sih bener bener diatas level bandung makanan makanannya berani untuk beda semuanya lekoh dan sangat sangat otentik untuk harga memang lumayan menguras kantong kalau beli dry agednya tapi worth it banget diajak boss makan di resto ini karena kalau beli sendiri kayanya gak kuat hehejujur ini enak sekali dagingnya empuk dan lucu nya di lihatkan dulu mentahnya seperti apa Altero Bistronomie• Seafood Capellini 85kRasanya ini unik banget, dia bumbunya pake sedikit kuah kaldu dari kerang, diatasnya ada cumi yang digoreng tepung, empuk gak alot sama sekali dan gak amis, pastanya pake capellini tipis gitu mirip angel hair• Pan-Seared Dumplings 50kIni dumplingsnya garing banget malah kaya yang di fried gitu, isian dumplingsnya lumayan tebel dan enak• The Rabbit 45kBasically ini jus wortel yang dicampur sama nanas, rasanyaa ok• Kombucha (free)First time cobain kombucha rasanya ternyata enak ya, asem seger, lumayan strong sih asemnyaMereka punya lahan parkir lumayan luas, aksesnya jug

In [40]:
summary1 = main(text1)
summary2 = main(text2)
summary3 = main(text3)
print(summary1)

Tempatnya sendiri ada indoor dan semi outdoor area, ada foyer-nya juga buat nunggu, beneran serasa lagi table manner-an.Karena dateng pas panas-panasnya siang bolong jadi nyoba menu yang seger-seger, ada rekomendasi salad yang Apple Blaukraut Salad (65K), sesuai namanya ini perpaduan apple dan red cabbage, tapi kalau dari tampilannya lebih dominan di green apple dan red apple, blaukraut-nya hanya sebagian kecil, ada lemon zest, cheese, mayo dan vinaigrette dressing-nya juga jadi ada hint rasa asem dan creamy-nya, fresh dan seger-nya pun ngena.Dessert pilih rekomendasi mereka yang West Java Kopi Susu (65K), sejenis sponge cake dengan tekstur yang super duper lembut, chocolate tuile yang gurih dan disajiin dengan espresso sauce yang bitter, unik banget si rasanya, bittersweet-nya balace pas dan melted banget di mulut.Minumannya sendiri nyoba Golden Fleece (50K) yang disajiin di gelas tiki, ini juga salah satu speciality rekomendasi mereka, perpaduan non-alcoholic gold rum, orange juice d

In [41]:
def count_words(text):
    number = len(text.split())
    return number

count_text1 = count_words(text1) - 1
count_text2 = count_words(text2) - 1
count_text3 = count_words(text3) - 1

count_summary1 = count_words(summary1)
count_summary2 = count_words(summary2)
count_summary3 = count_words(summary3)

In [43]:
text_counts = [count_text1, count_text2, count_text3]
summary_counts = [count_summary1, count_summary2, count_summary3]

df = pd.DataFrame({
    'number of text words': text_counts,
    'number of summary words': summary_counts
})

df

Unnamed: 0,number of text words,number of summary words
0,2401,369
1,1663,105
2,1289,141
