In [1]:
import pandas as pd
import re
import nltk
import numpy as np
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models, the 'stopwords' corpus and the 'wordnet' corpus.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))  # Convert to a set for faster membership lookups
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
txt_file_path = "./text.txt"  # Replace with the path to your own TXT file
# Read the whole file into a single string, decoding it as UTF-8.
with open(txt_file_path, "r", encoding="utf-8") as file:
    text = file.read()

In [4]:
# Print a header line followed by the complete, unmodified input text.
print("Dữ liệu gốc:")
print(text)  # display the original data

Dữ liệu gốc:
The mechanism of pattern recognition in the brain is
little known, and it seems to be almost impossible to
reveal it only by conventional physiological experiments. So, we take a slightly different approach to this
problem. If we could make a neural network model
which has the same capability for pattern recognition
as a human being, it would give us a powerful clue to
the understanding of the neural mechanism in the
brain. In this paper, we discuss how to synthesize a
neural network model in order to endow it an ability of
pattern recognition like a human being.
Several models were proposed with this intention
(Rosenblatt, 1962; Kabrisky, 1966; Giebel, 1971;
Fukushima, 1975). The response of most of these
models, however, was severely affected by the shift in
position and/or by the distortion in shape of the input
patterns. Hence, their ability for pattern recognition
was not so high.
In this paper, we propose an improved neural
network model. The structure of this networ

In [5]:
# Adjusted stopword list: NLTK's English stopwords with "not" and "without"
# removed from the set, so those two words are kept during preprocessing.
custom_stopwords = set(stopwords.words('english')) - {"not", "without"}

# Initialise the lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Split a text into sentences and normalise each sentence.

    Steps:
    - lower-case the whole text and split it into sentences with `sent_tokenize`
    - tokenise each sentence with `word_tokenize`
    - keep only tokens that are purely alphabetic and not in `custom_stopwords`
    - lemmatise every kept token with the module-level `lemmatizer`

    Returns a list with one string per detected sentence, the kept tokens joined
    by single spaces. A sentence with no surviving tokens yields an empty string.
    """
    def _normalise(sentence):
        # Collect the lemmatised form of every token that passes both filters.
        kept = []
        for token in word_tokenize(sentence):
            if not token.isalpha():
                continue  # drops punctuation, numbers and mixed tokens
            if token in custom_stopwords:
                continue
            kept.append(lemmatizer.lemmatize(token))
        return " ".join(kept)

    return [_normalise(sentence) for sentence in sent_tokenize(text.lower())]
# Process the text to obtain the list of sentences.
# Note: these are the cleaned (lower-cased, filtered, lemmatised) strings
# returned by preprocess_text, not the original sentences.
sentences = preprocess_text(text)

In [6]:
# Inspect the preprocessed sentences (prints the entire list).
print(sentences)

['mechanism pattern recognition brain little known seems almost impossible reveal conventional physiological experiment', 'take slightly different approach problem', 'could make neural network model capability pattern recognition human would give u powerful clue understanding neural mechanism brain', 'paper discus synthesize neural network model order endow ability pattern recognition like human', 'several model proposed intention rosenblatt kabrisky giebel fukushima', 'response model however severely affected shift position distortion shape input pattern', 'hence ability pattern recognition not high', 'paper propose improved neural network model', 'structure network suggested visual nervous system vertebrate', 'network learning without teacher acquires ability recognize stimulus pattern based geometrical similarity gestalt shape without affected position small distortion shape', 'network given nickname neocognitron l extention cognitron also multilayered neural network model proposed 

In [7]:
# Load the GloVe embeddings into a dict mapping word -> float32 vector.
# Each line is split on whitespace: the first field is the word and the
# remaining fields are parsed as the vector components.
# (presumably 100 components per word, judging by the file name - confirm)
word_embeddings = {}
with open("../glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [8]:
# Convert a sentence into a vector
def sentence_to_vector(sentence, embedding_dict, dim=100):
    """
    Convert a sentence into a vector by averaging the vectors of its words.

    Parameters:
    - sentence: string whose whitespace-separated tokens are looked up.
    - embedding_dict: mapping from word to a 1-D vector of length `dim`.
    - dim: embedding dimensionality, used for the zero vectors.

    Returns a 1-D array of length `dim`. Words missing from `embedding_dict`
    contribute a zero vector but still count toward the number of words.
    A sentence with no words returns an all-zero vector.
    """
    words = sentence.split()
    if not words:
        return np.zeros((dim,))
    # One shared zero vector for unknown words; it is only read, never mutated,
    # because sum() builds a new array on the first addition.
    unknown = np.zeros((dim,))
    # Divide by the exact word count: the empty case is handled above, so no
    # fudge term is needed and the result is the true arithmetic mean.
    return sum(embedding_dict.get(w, unknown) for w in words) / len(words)


In [9]:
# Convert the list of sentences into vectors
sentence_vectors = [sentence_to_vector(s, word_embeddings) for s in sentences]

# Build the similarity matrix: entry [i][j] is the cosine similarity between
# sentence i and sentence j.
if sentence_vectors:
    # A single vectorized call computes all pairs at once and works for any
    # embedding dimensionality (no hard-coded reshape to 100 columns).
    sim_mat = cosine_similarity(np.vstack(sentence_vectors).astype(np.float64))
    # A sentence is not compared with itself; keep the diagonal at zero so the
    # graph built from this matrix gets no self-loops.
    np.fill_diagonal(sim_mat, 0.0)
else:
    # No sentences: keep an empty square matrix instead of failing in vstack.
    sim_mat = np.zeros((0, 0))

In [10]:
# Build the graph from the similarity matrix and compute PageRank scores
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

# Rank the sentences by PageRank score, highest first
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

# Maximum number of sentences to include in the summary
summary_length = 10
# Slice instead of indexing by position, so a text with fewer than
# summary_length sentences yields a shorter summary rather than an IndexError.
summary = "\n".join(sentence for _, sentence in ranked_sentences[:summary_length])

In [11]:
# Print the summary: a header line, then the selected sentences
# (one per line, as joined into `summary`).
print("=== TÓM TẮT VĂN BẢN ===")
print(summary)

=== TÓM TẮT VĂN BẢN ===
completion selforganization response cell deepest layer network dependent upon shape stimulus pattern not affected position pattern presented
extended hierarchy model cell highest stage supposed respond specific stimulus pattern without affected position size stimulus
would not however completely deny hierarchy model consider hierarchy model represents main stream information flow visual system
network learning without teacher acquires ability recognize stimulus pattern based geometrical similarity gestalt shape without affected position small distortion shape
could make neural network model capability pattern recognition human would give u powerful clue understanding neural mechanism brain
also ability unsupervised learning not need teacher process selforganization needed present set stimulus pattern repeatedly input layer network
incidentally conventional cognitron also ability recognize pattern response dependent upon position stimulus pattern
hierarchy cell 