In [2]:
fileList = ['25.txt','25a.txt','26.txt','26a.txt','27.txt','27a.txt','28.txt','28a.txt']

In [6]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
import string
from collections import defaultdict

def preprocess_text(text):
    # Tokenize text into sentences and words
    sentences = sent_tokenize(text)
    words = [word.lower() for sentence in sentences for word in word_tokenize(sentence)]

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words and word not in string.punctuation]

    # Stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    return words

def create_graph(words):
    # Build word graph using co-occurrence within a window of words
    graph = defaultdict(lambda: defaultdict(int))
    window_size = 2

    for i in range(len(words) - window_size):
        for j in range(1, window_size + 1):
            if words[i] != words[i + j]:
                graph[words[i]][words[i + j]] += 1
                graph[words[i + j]][words[i]] += 1

    return graph

def calculate_page_rank(graph, damping=0.85, max_iterations=100, tolerance=1e-6):
    # Initialize page rank scores
    page_rank = defaultdict(float)
    node_list = list(graph.keys())
    num_nodes = len(node_list)
    initial_value = 1 / num_nodes

    for node in node_list:
        page_rank[node] = initial_value

    # Calculate page rank iteratively
    for _ in range(max_iterations):
        new_page_rank = defaultdict(float)
        total_diff = 0

        for node in node_list:
            new_rank = (1 - damping) / num_nodes
            for neighbor in graph[node]:
                new_rank += damping * (page_rank[neighbor] / sum(graph[neighbor].values()))
            new_page_rank[node] = new_rank

        # Check convergence
        for node in node_list:
            total_diff += abs(new_page_rank[node] - page_rank[node])
        if total_diff < tolerance:
            break

        page_rank = new_page_rank

    return page_rank

def extract_top_keywords(page_rank, n=10):
    # Get top n keywords based on page rank scores
    sorted_page_rank = sorted(page_rank.items(), key=lambda x: x[1], reverse=True)
    top_keywords = [keyword for keyword, score in sorted_page_rank[:n]]
    return top_keywords

def textrank_keyword_extraction(text_path):
    # Read text from file
    with open(text_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Preprocess text
    words = preprocess_text(text)

    # Build word graph
    graph = create_graph(words)

    # Calculate page rank scores
    page_rank = calculate_page_rank(graph)

    # Extract top keywords
    top_keywords = extract_top_keywords(page_rank)

    return top_keywords
nltk.download('punkt')
nltk.download('stopwords')
# Example usage
text_file_path = "keywordExtraction/" + fileList[0]
top_keywords = textrank_keyword_extraction(text_file_path)
print("Top Keywords:", top_keywords)


Top Keywords: ['de', 'în', 'dreptului', 'şi', 'ca', 'est', 'nu', 'care', 'juridică', 'juridic']


[nltk_data] Downloading package punkt to /home/cosmin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/cosmin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
import spacy
import pytextrank

def getTexts(fileList):
    texts = []
    for file in fileList:
        with open("keywordExtraction/" + file, "r", encoding="utf-8") as f:
            texts.append(f.read())
    return texts


texts = getTexts(fileList)

nlp = spacy.load("en_core_web_sm")
nlpro = spacy.load("ro_core_news_sm")
nlp.add_pipe("textrank")
nlpro.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x7f97415b63a0>

In [20]:
for text in texts:
    doc = nlpro(text)
    for phrase in doc._.phrases[:10]:
        print(phrase.text)
    print('------------------')

Dreptul
răspunderii
Responsabilitatea socială
ramură
autorului
oameni
------------------
Operei Române
Compania Opera Română
fondatorul Operei Române
Opera Națională București
Muzeul Operei
profesorul român
română
Clădirea
clădirii
mari
------------------
prognosticul sever
Hiv
copii
copilului
acelaşi grad de Hiv
770-2240
Adams-Chapman
Autorul
Hemoragia
Patra
------------------
Campionatul Mondial
Campionatului Mondial
campionatul mondial
Campionat Mondial FIFA
Campionat Mondial
Statele Unite
Uruguay
Estadio Centenario
Uruguayului
Franța
------------------
om
omului
V. Buhvalov
Pascali
autorul
laborator
Dediu
1996
2002
2006
------------------
Galaxia Andromeda
Andromeda
Calea Lactee
Astronomul Charles Messier
Căii Lactee
obiectele Messier
Catalogul Messier
Grupul Local
astronomului
Sistemului Solar
------------------
Richard Bach
imagini
Oamenii
scrie Gabriel Albu
„Conştiinţa
mereu
------------------
motorul Otto
Pistonul
pistonul
motorul
arborelui cotit
timpi separați
încărcăturii
tim

In [22]:
import textrank

textrank.extract_key_phrases(texts[0])

{'Abordând',
 'Disciplinele',
 'Elementele constitutive',
 'Responsabilitatea',
 'autorului',
 'caracteristică',
 'categoria',
 'cercetările',
 'conceptul responsabilităţii',
 'condiţiilor răspunderii',
 'consecinţele nerespectării',
 'considerat',
 'conştiinţa destinatarilor',
 'determinat',
 'dezaprobării',
 'dreptului',
 'existând',
 'indiferent',
 'interesul',
 'manifestare',
 'manifestă',
 'necesitatea conturării conceptului',
 'obligaţie',
 'obligaţii',
 'parlamentului',
 'patrimoniu',
 'periculoasă',
 'persoanei',
 'principal',
 'principiu',
 'protectiv-represiv',
 'reducţionistă',
 'responsabilitate',
 'responsabilitatea',
 'responsabilitatea religioasă',
 'răspunderea',
 'răspunderea administrativă',
 'răspunderea constituţională',
 'răspunderea disciplinară',
 'răspunderii',
 'scopurile',
 'sentimentul responsabilităţii',
 'sistemului',
 'specifică',
 'specifică dreptului',
 'stătătoare',
 'subiective',
 'subiectul',
 'săvârşit',
 'tradiţional',
 'întotdeauna',
 'întrunind co

In [35]:
# clean up script
prefixList = ['25','25a','26','26a','27','27a','28','28a']
# for each file of type prefixList(i)_Voinopol.txt, load the words in a list and only keep the first 10; the format of the file is { 'word', 'word', 'word', ... }
for prefix in prefixList:
    with open(prefix + "_Voinopol.txt", "r", encoding="utf-8") as f:
        
        # words = f.read().split(', ')
        # transform text from { 'word', 'word', 'word', ... } to ['word', 'word', 'word', ...]
        words = f.read()[1:-1].split(', ')  
        # remove '
        words = [word[1:-1] for word in words]
        # print in format [ "word" ], [ "word" ], [ "word" ], ...
        with open(prefix + "_Voinopol_cleaned.txt", "w", encoding="utf-8") as f:
            for word in words[:10]:
                f.write('\n[ \n"' + word + '" \n],')

    # print('------------------')