In [109]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
import operator
import statistics
# Tokens ignored when counting words: NLTK's Spanish stop-word list plus
# every character in string.punctuation.
stop_words = set(stopwords.words('spanish') + list(punctuation))
from nltk.stem.porter import PorterStemmer
# Single stemmer instance shared by the word-frequency functions.
# NOTE(review): the Porter algorithm was designed for English, while the
# stop-word list above is Spanish — confirm this pairing is intentional.
ps = PorterStemmer()

def getFile(fname):
    """Return the full contents of a UTF-8 text file.

    Args:
        fname: Path of the file to read.

    Returns:
        The file contents as a single string, or None if opening or
        reading the file raised an exception (the error is printed
        instead of being propagated).
    """
    try:
        with open(fname, 'r', encoding='utf-8') as source:
            contents = source.read()
    except Exception as err:
        # Best-effort read: report the problem and let the caller see None.
        print(f"Error reading the file: {err}")
        return None
    else:
        return contents

        
def clean(txt):
    """Split *txt* into sentences and normalise each one.

    Within every sentence, any character that is not alphanumeric, not
    whitespace and not in the allowed symbol set is replaced by a space;
    afterwards every run of whitespace is collapsed to a single space and
    leading/trailing whitespace is dropped.

    Args:
        txt: Raw text to process.

    Returns:
        A list with one cleaned string per sentence found by
        ``sent_tokenize``, in the original order.
    """
    allowed_symbols = "!@#$%^&*()-_=+[]{}|;:'\",.<>/?`~"

    def is_kept(ch):
        return ch.isalnum() or ch.isspace() or ch in allowed_symbols

    cleaned = []
    for sentence in sent_tokenize(txt):
        filtered = ''.join(ch if is_kept(ch) else ' ' for ch in sentence)
        # split() + join collapses whitespace runs and trims both ends.
        cleaned.append(' '.join(filtered.split()))
    return cleaned

# TODO: merge the word count, word frequency and per-sentence TF steps

def countWordSentence(sentences):
    """Count the whitespace-separated words of every sentence.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list of dicts ``{'sentence_id': i, 'num_words': count}`` where
        ``i`` is the 1-based position of the sentence and ``count`` is the
        number of items produced by ``str.split()``.
    """
    return [
        {'sentence_id': position, 'num_words': len(sentence.split())}
        for position, sentence in enumerate(sentences, start=1)
    ]

def create_freq_dict(sents):
    """Build a stemmed word-frequency table for every sentence.

    Each sentence is tokenised with ``word_tokenize``; every token is
    lower-cased and stemmed with the module-level stemmer ``ps``, and the
    resulting stem is counted unless it appears in ``stop_words`` (the
    stop-word check is applied to the stemmed form).

    Args:
        sents: Iterable of sentence strings.

    Returns:
        A list with exactly one dict per input sentence, in order:
        ``{'sentence_id': i, 'wordsfreq': {stem: count, ...}}`` with ``i``
        being the 1-based sentence position. A sentence without any
        countable word yields an empty ``wordsfreq`` dict.
    """
    freqDict_list = []
    for sentence_id, sent in enumerate(sents, start=1):
        freq_dict = {}
        for token in word_tokenize(sent):
            stem = ps.stem(token.lower())
            if stem not in stop_words:
                freq_dict[stem] = freq_dict.get(stem, 0) + 1
        # Always emit an entry for this sentence. Building the entry only
        # when a countable word was seen would re-append the previous
        # sentence's entry (or fail on the first sentence) and break the
        # one-entry-per-sentence alignment the scoring functions rely on.
        freqDict_list.append({'sentence_id': sentence_id, 'wordsfreq': freq_dict})
    return freqDict_list


def computeTF(doc_info, freqDict_list):
    """Compute the term frequency of every word within its sentence.

    Args:
        doc_info: List of ``{'sentence_id', 'num_words'}`` dicts, indexed
            so that the entry for sentence ``i`` sits at position ``i - 1``.
        freqDict_list: List of ``{'sentence_id', 'wordsfreq'}`` dicts.

    Returns:
        A flat list of ``{'sentence_id', 'TF_score', 'word'}`` dicts, one
        per (sentence, word) pair, where ``TF_score`` is the word's count
        divided by the sentence's ``num_words``.
    """
    return [
        {
            'sentence_id': entry['sentence_id'],
            'TF_score': occurrences / doc_info[entry['sentence_id'] - 1]['num_words'],
            'word': term,
        }
        for entry in freqDict_list
        for term, occurrences in entry['wordsfreq'].items()
    ]


import math 
def computeIDF(doc_info, freqDict_list):
    """Compute the inverse document frequency of every word per sentence.

    Each sentence is treated as a document:

        idf = ln(number of sentences / number of sentences containing the word)

    Args:
        doc_info: List with one entry per sentence; only its length is
            used (as the total number of sentences).
        freqDict_list: List of ``{'sentence_id', 'wordsfreq'}`` dicts.

    Returns:
        A flat list of ``{'sentence_id', 'IDF_score', 'word'}`` dicts, one
        per (entry, word) pair. ``sentence_id`` is the 1-based position of
        the entry inside ``freqDict_list``.
    """
    total_sentences = len(doc_info)

    # Count, once, in how many entries each word occurs instead of
    # rescanning the whole list for every single word occurrence.
    doc_freq = {}
    for entry in freqDict_list:
        for word in entry['wordsfreq']:
            doc_freq[word] = doc_freq.get(word, 0) + 1

    IDF_scores = []
    for position, entry in enumerate(freqDict_list, start=1):
        for word in entry['wordsfreq']:
            IDF_scores.append({
                'sentence_id': position,
                'IDF_score': math.log(total_sentences / doc_freq[word]),
                'word': word,
            })
    return IDF_scores

def computeTFIDF(TF_scores, IDF_scores):
    """Combine TF and IDF scores into TF-IDF scores.

    The TF-IDF of a word in a sentence is the product of the TF and IDF
    entries that share the same ``sentence_id`` and ``word``.

    Args:
        TF_scores: List of ``{'sentence_id', 'TF_score', 'word'}`` dicts.
        IDF_scores: List of ``{'sentence_id', 'IDF_score', 'word'}`` dicts.

    Returns:
        A list of ``{'sentence_id', 'TFIDF_score', 'word'}`` dicts in the
        order of ``IDF_scores``. IDF entries without a matching TF entry
        are skipped.
    """
    # Index TF entries by (sentence, word) so each IDF entry is resolved
    # with one lookup; on duplicate keys the last TF entry wins.
    tf_by_key = {(tf['sentence_id'], tf['word']): tf for tf in TF_scores}

    TFIDF_scores = []
    for idf in IDF_scores:
        tf = tf_by_key.get((idf['sentence_id'], idf['word']))
        if tf is None:
            # No TF counterpart: emit nothing rather than repeating the
            # result computed for an earlier entry.
            continue
        TFIDF_scores.append({
            'sentence_id': tf['sentence_id'],
            'TFIDF_score': idf['IDF_score'] * tf['TF_score'],
            'word': tf['word'],
        })
    return TFIDF_scores

# End of the steps that are to be merged.

def global_frequency(txtClean):
    """Count how often every stemmed word occurs in the whole text.

    The cleaned sentences are joined with single spaces and tokenised
    with ``word_tokenize``; each token is lower-cased and stemmed with
    ``ps``, and stems found in ``stop_words`` are left out.

    Args:
        txtClean: List of cleaned sentence strings.

    Returns:
        A dict mapping each remaining stem to its number of occurrences.
    """
    counts = {}
    for token in word_tokenize(' '.join(txtClean)):
        stem = ps.stem(token.lower())
        if stem in stop_words:
            continue
        counts[stem] = counts.get(stem, 0) + 1
    return counts


def get_keywords(txtClean, n):
    """Return the ``n`` most frequent words of the whole text.

    Args:
        txtClean: List of cleaned sentence strings.
        n: Maximum number of keywords to return.

    Returns:
        A list of at most ``n`` words taken from ``global_frequency``,
        ordered from most to least frequent. Fewer than ``n`` words are
        returned when the text does not contain that many distinct words.
    """
    freq_table = global_frequency(txtClean)
    # Highest counts first, so the slice below really picks the most
    # frequent words; an ascending sort would pick the rarest ones.
    ranked = sorted(freq_table.items(), key=operator.itemgetter(1), reverse=True)
    # Slicing (instead of indexing 0..n-1) tolerates n larger than the
    # vocabulary; max() keeps a negative n from slicing off the tail.
    return [word for word, _count in ranked[:max(n, 0)]]

def weigh_keywords(txtClean, TFIDF_scores, n):
    """Double the TF-IDF score of every entry whose word is a keyword.

    Args:
        txtClean: List of cleaned sentence strings, used to pick keywords.
        TFIDF_scores: List of ``{'sentence_id', 'TFIDF_score', 'word'}``
            dicts; the dicts are modified in place.
        n: Number of keywords requested from ``get_keywords``.

    Returns:
        The same ``TFIDF_scores`` list that was passed in.
    """
    boosted_words = get_keywords(txtClean, n)
    for entry in TFIDF_scores:
        if entry['word'] not in boosted_words:
            continue
        entry['TFIDF_score'] *= 2
    return TFIDF_scores

def get_sent_score(TFIDF_scores, text_sents, doc_info):
    """Score every sentence by summing the TF-IDF scores of its words.

    Args:
        TFIDF_scores: List of ``{'sentence_id', 'TFIDF_score', 'word'}``
            dicts.
        text_sents: List of sentence strings; the sentence with id ``i``
            is read from position ``i - 1``.
        doc_info: List of dicts carrying a ``'sentence_id'`` key, one per
            sentence to score.

    Returns:
        A list with one ``{'sentence_id', 'sent_score', 'sentence'}`` dict
        per entry of ``doc_info``, in the same order. A sentence without
        any TF-IDF entry gets a score of 0.
    """
    # One pass over the scores, accumulating per sentence in the order
    # the entries appear, instead of rescanning the list per sentence.
    totals = {}
    for entry in TFIDF_scores:
        sentence_id = entry['sentence_id']
        totals[sentence_id] = totals.get(sentence_id, 0) + entry['TFIDF_score']

    return [
        {
            'sentence_id': doc['sentence_id'],
            'sent_score': totals.get(doc['sentence_id'], 0),
            'sentence': text_sents[doc['sentence_id'] - 1],
        }
        for doc in doc_info
    ]

def get_summary(sentence_info):
    """Build the summary from the sentences scoring at least the average.

    Args:
        sentence_info: List of ``{'sentence_id', 'sent_score', 'sentence'}``
            dicts.

    Returns:
        The selected sentences, in their original order, joined with
        newlines. An empty string is returned for empty input.
    """
    if not sentence_info:
        # Nothing to average over; avoids a division by zero.
        return ''

    # Plain left-to-right accumulation of the scores.
    total = 0
    for info in sentence_info:
        total += info['sent_score']
    avg = total / len(sentence_info)

    # The threshold is the plain average. No standard deviation is
    # computed because the threshold does not use one, and computing it
    # would fail for a single-sentence input.
    return '\n'.join(
        info['sentence'] for info in sentence_info if info['sent_score'] >= avg
    )

In [127]:
def resumir(file, n, num_keywords=7):
    """Summarise a text file by repeated TF-IDF sentence extraction.

    The file is read once; then, ``n`` times, the current text is split
    into sentences, every sentence is scored with keyword-weighted TF-IDF
    and the sentences selected by ``get_summary`` become the text for the
    next pass. Intermediate results are printed on every pass: the raw
    TF-IDF scores, the sentence scores, the summary, and the character
    lengths of the input text and of the summary.

    Args:
        file: Path of the text file to summarise.
        n: Number of summarisation passes.
        num_keywords: How many of the most frequent words get their
            TF-IDF score doubled on each pass.

    Returns:
        None. All results are printed.
    """
    texto = getFile(file)
    if texto is None:
        # getFile already reported the error; nothing to summarise.
        return

    for _ in range(n):
        txtClean = clean(texto)
        if not txtClean:
            # No sentences left to score, so further passes are pointless.
            break
        txtDic = countWordSentence(txtClean)
        freqDict = create_freq_dict(txtClean)
        TF = computeTF(txtDic, freqDict)
        IDF = computeIDF(txtDic, freqDict)
        TFIDF_scores = computeTFIDF(TF, IDF)
        print(TFIDF_scores)
        TFIDF_scores = weigh_keywords(txtClean, TFIDF_scores, num_keywords)
        sentence_info = get_sent_score(TFIDF_scores, txtClean, txtDic)
        print(sentence_info)
        summary = get_summary(sentence_info)
        print(summary)
        print(len(texto))
        print(len(summary))
        # The summary becomes the input of the next pass.
        texto = summary
# Run two summarisation passes over 'texto.txt'; resumir prints its
# intermediate and final results instead of returning them.
resumir('texto.txt', 2)

[{'sentence_id': 1, 'TFIDF_score': 0.12161938431595708, 'word': 'futuro'}, {'sentence_id': 1, 'TFIDF_score': 0.039607735005571534, 'word': 'ia'}, {'sentence_id': 1, 'TFIDF_score': 0.0999585349234588, 'word': 'hacia'}, {'sentence_id': 1, 'TFIDF_score': 0.1588683488226007, 'word': 'inteligencia'}, {'sentence_id': 1, 'TFIDF_score': 0.0999585349234588, 'word': 'artificial'}, {'sentence_id': 1, 'TFIDF_score': 0.12161938431595708, 'word': 'realment'}, {'sentence_id': 1, 'TFIDF_score': 0.049663600159893156, 'word': 'inteligent'}, {'sentence_id': 1, 'TFIDF_score': 0.12161938431595708, 'word': 'ramón'}, {'sentence_id': 1, 'TFIDF_score': 0.12161938431595708, 'word': 'lópez'}, {'sentence_id': 1, 'TFIDF_score': 0.12161938431595708, 'word': 'mántara'}, {'sentence_id': 1, 'TFIDF_score': 0.12161938431595708, 'word': 'instituto'}, {'sentence_id': 1, 'TFIDF_score': 0.12161938431595708, 'word': 'investigación'}, {'sentence_id': 1, 'TFIDF_score': 0.1999170698469176, 'word': 'artifici'}, {'sentence_id': 1