In [109]:
# Tokenizers (sentence and word level) and the stop-word corpus come from NLTK.
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
import operator
import statistics
# Tokens to discard when counting words: the Spanish stop-word list plus every
# ASCII punctuation character (word_tokenize emits punctuation as tokens).
stop_words = set(stopwords.words('spanish') + list(punctuation))
from nltk.stem.porter import PorterStemmer
# Shared stemmer instance used by the frequency functions below.
# NOTE(review): the Porter algorithm was designed for English while the stop
# words here are Spanish; a Spanish-aware stemmer may be a better fit - confirm.
ps = PorterStemmer()

def getFile(fname):
    """Read the UTF-8 text file *fname* and return its whole content.

    This is a best-effort reader: any exception raised while opening or
    reading the file is reported on stdout and the function returns None
    instead of propagating the error.
    """
    try:
        handle = open(fname, 'r', encoding='utf-8')
        with handle:
            return handle.read()
    except Exception as err:
        print(f"Error reading the file: {err}")
    return None

        
def clean(txt):
    """Split *txt* into sentences and normalise each of them.

    Inside every sentence, any character that is not alphanumeric, not
    whitespace and not one of the listed ASCII punctuation marks is replaced
    by a space. Runs of whitespace are then collapsed to a single space and
    leading/trailing whitespace is removed.

    Returns the list of normalised sentences in their original order.
    """
    allowed_punct = "!@#$%^&*()-_=+[]{}|;:'\",.<>/?`~"

    def _keep(ch):
        # Characters that survive the filter unchanged.
        return ch.isalnum() or ch.isspace() or ch in allowed_punct

    cleaned = []
    for sentence in sent_tokenize(txt):
        filtered = ''.join(ch if _keep(ch) else ' ' for ch in sentence)
        # split() + join collapses whitespace runs and trims both ends at once.
        cleaned.append(' '.join(filtered.split()))
    return cleaned

# TODO: merge the word count, word frequency and per-sentence TF steps into one

def countWordSentence(sentences):
    """Return the word count of every sentence.

    Words are the whitespace-separated pieces of the sentence. The result
    holds one dict per sentence, ``{'sentence_id': i, 'num_words': count}``,
    where ``i`` is the 1-based position of the sentence in *sentences*.
    """
    return [
        {'sentence_id': position, 'num_words': len(sentence.split())}
        for position, sentence in enumerate(sentences, start=1)
    ]

def create_freq_dict(sents):
    """Build one stem-frequency table per sentence.

    Every sentence is tokenised with ``word_tokenize``; each token is
    lower-cased, dropped if it is a stop word, and otherwise stemmed with
    ``ps`` and counted.

    Returns a list with exactly one entry per input sentence, in order:
    ``{'sentence_id': i, 'wordsfreq': {stem: count}}`` where ``i`` is the
    1-based sentence position. A sentence with no countable words gets an
    empty ``wordsfreq`` dict, so ids always line up with sentence positions.
    """
    freqDict_list = []
    for sentence_id, sent in enumerate(sents, start=1):
        freq_dict = {}
        for word in word_tokenize(sent):
            word = word.lower()
            # Test the raw token first: stemming can alter a Spanish stop word
            # (e.g. 'sobre' becomes 'sobr') so that it no longer matches the list.
            if word in stop_words:
                continue
            stem = ps.stem(word)
            # Still reject stems that coincide with a stop word or punctuation.
            if stem in stop_words:
                continue
            freq_dict[stem] = freq_dict.get(stem, 0) + 1
        # Append unconditionally so a sentence made only of stop words does not
        # re-emit the previous sentence's entry (or fail on the first sentence).
        freqDict_list.append({'sentence_id': sentence_id, 'wordsfreq': freq_dict})
    return freqDict_list


def computeTF(doc_info, freqDict_list):
    """Compute the term frequency of every word inside its own sentence.

    For each word of each entry in *freqDict_list* the score is the word's
    count divided by the ``num_words`` of the matching sentence, looked up in
    *doc_info* at index ``sentence_id - 1``.

    Returns a flat list of
    ``{'sentence_id': ..., 'TF_score': ..., 'word': ...}`` dicts.
    """
    return [
        {
            'sentence_id': entry['sentence_id'],
            'TF_score': occurrences / doc_info[entry['sentence_id'] - 1]['num_words'],
            'word': term,
        }
        for entry in freqDict_list
        for term, occurrences in entry['wordsfreq'].items()
    ]


import math 
def computeIDF(doc_info, freqDict_list):
    """
    Compute the inverse document frequency of every word, treating each
    sentence as a document:

        idf = ln(total number of docs / number of docs with term in it)

    The total number of documents is ``len(doc_info)``. One result dict,
    ``{'sentence_id': ..., 'IDF_score': ..., 'word': ...}``, is produced per
    (sentence, word) pair; ``sentence_id`` is the 1-based position of the
    entry inside *freqDict_list*.
    """
    total_docs = len(doc_info)

    # Count once in how many sentences each word appears, instead of
    # rescanning the whole list for every (sentence, word) pair.
    doc_freq = {}
    for entry in freqDict_list:
        for term in entry['wordsfreq']:
            doc_freq[term] = doc_freq.get(term, 0) + 1

    IDF_scores = []
    for position, entry in enumerate(freqDict_list, start=1):
        for term in entry['wordsfreq']:
            IDF_scores.append({
                'sentence_id': position,
                'IDF_score': math.log(total_docs / doc_freq[term]),
                'word': term,
            })
    return IDF_scores

def computeTFIDF(TF_scores, IDF_scores):
    """
    Compute TF-IDF by multiplying the corresponding TF and IDF values of
    each term.

    Entries are paired on ``(sentence_id, word)``. The result follows the
    order of *IDF_scores* and contains one
    ``{'sentence_id': ..., 'TFIDF_score': ..., 'word': ...}`` dict per IDF
    entry that has a matching TF entry; IDF entries without a match are
    skipped.
    """
    # Index the TF values by (sentence, word) so each IDF entry is resolved
    # with one lookup instead of a scan over the whole TF list.
    tf_by_key = {
        (tf['sentence_id'], tf['word']): tf['TF_score'] for tf in TF_scores
    }

    TFIDF_scores = []
    for idf in IDF_scores:
        key = (idf['sentence_id'], idf['word'])
        if key not in tf_by_key:
            # Nothing to multiply with; never re-emit a previous result.
            continue
        TFIDF_scores.append({
            'sentence_id': idf['sentence_id'],
            'TFIDF_score': idf['IDF_score'] * tf_by_key[key],
            'word': idf['word'],
        })
    return TFIDF_scores

# End of the section to be merged (see the TODO above countWordSentence)

def global_frequency(txtClean):
    """
    Return a dictionary with the frequency count of every word in the text.

    *txtClean* is the list of cleaned sentences; they are joined with spaces
    and tokenised with ``word_tokenize``. Each token is lower-cased, dropped
    if it is a stop word, and otherwise stemmed with ``ps``; the returned
    dict maps each stem to its number of occurrences.
    """
    freq_table = {}
    text = ' '.join(txtClean)  # join the cleaned sentences to get the text
    for word in word_tokenize(text):
        word = word.lower()
        # Test the raw token first: stemming can alter a Spanish stop word
        # (e.g. 'sobre' becomes 'sobr') so that it no longer matches the list.
        if word in stop_words:
            continue
        stem = ps.stem(word)
        # Still reject stems that coincide with a stop word or punctuation.
        if stem in stop_words:
            continue
        freq_table[stem] = freq_table.get(stem, 0) + 1
    return freq_table


def get_keywords(txtClean, n):
    """
    Return the *n* most frequently occurring words of the whole text.

    Frequencies come from ``global_frequency(txtClean)``. Words with equal
    counts keep the order in which they first appear in the text. When the
    text has fewer than *n* distinct words, all of them are returned.
    """
    freq_table = global_frequency(txtClean)
    # Sort in descending order of frequency so the most frequent words come
    # first (an ascending sort would select the rarest words instead).
    freq_table_sorted = sorted(
        freq_table.items(), key=operator.itemgetter(1), reverse=True
    )
    # Slicing tolerates n larger than the vocabulary; max() keeps a negative
    # n meaning "no keywords" rather than "all but the last".
    return [word for word, _count in freq_table_sorted[:max(n, 0)]]

def weigh_keywords(txtClean, TFIDF_scores, n):
    """
    Double the TF-IDF score of every entry whose word is a keyword.

    The keywords are obtained from ``get_keywords(txtClean, n)``. The dicts
    inside *TFIDF_scores* are modified in place and the same list object is
    returned.
    """
    boosted = set(get_keywords(txtClean, n))
    for entry in TFIDF_scores:
        if entry['word'] not in boosted:
            continue
        entry['TFIDF_score'] = entry['TFIDF_score'] * 2
    return TFIDF_scores

def get_sent_score(TFIDF_scores, text_sents, doc_info):
    """
    Return the score of each sentence in a list.

    The score of a sentence is the sum of the TF-IDF scores of the entries
    in *TFIDF_scores* that carry its ``sentence_id``; a sentence without any
    entry scores 0.

    The result has one dict per element of *doc_info*, in the same order:
    ``{'sentence_id': ..., 'sent_score': ..., 'sentence': ...}`` where the
    sentence text is ``text_sents[sentence_id - 1]``.
    """
    # Accumulate the totals in one pass over the scores rather than
    # rescanning the full score list for every sentence.
    totals = {}
    for entry in TFIDF_scores:
        sentence_id = entry['sentence_id']
        totals[sentence_id] = totals.get(sentence_id, 0) + entry['TFIDF_score']

    return [
        {
            'sentence_id': doc['sentence_id'],
            'sent_score': totals.get(doc['sentence_id'], 0),
            'sentence': text_sents[doc['sentence_id'] - 1],
        }
        for doc in doc_info
    ]

def get_summary(sentence_info):
    """
    Build the summary from the scored sentences.

    A sentence is kept when its ``sent_score`` is greater than or equal to
    the average score of all sentences. The kept sentences are returned in
    their original order, joined by newlines. An empty *sentence_info*
    yields an empty string.
    """
    if not sentence_info:
        # Nothing to average over; avoid dividing by zero.
        return ''

    scores = [info['sent_score'] for info in sentence_info]
    avg = sum(scores) / len(scores)  # average sentence score

    # The selection threshold is the plain average. No standard deviation is
    # computed because the threshold does not use it, and statistics.stdev
    # rejects inputs with fewer than two values (single-sentence texts).
    return '\n'.join(
        info['sentence'] for info in sentence_info if info['sent_score'] >= avg
    )

In [148]:
def resumir(file, n, num_keywords=7):
    """Summarize the text stored in *file* by repeated TF-IDF extraction.

    The text is summarized *n* times in a row, each pass taking the summary
    of the previous pass as its input. Every pass prints the TF-IDF scores,
    the per-sentence scores, the summary, and the lengths of the input text
    and of the summary.

    Args:
        file: path of the UTF-8 text file to summarize.
        n: number of summarization passes.
        num_keywords: how many of the most frequent words get their TF-IDF
            score doubled in every pass (defaults to 7).

    Returns:
        The summary produced by the last completed pass, or None when the
        file could not be read or no pass was run.
    """
    texto = getFile(file)
    if texto is None:
        # getFile has already reported the problem; nothing to summarize.
        return None

    summary = None
    for _ in range(n):
        txtClean = clean(texto)
        if not txtClean:
            # No sentences left (empty text or empty summary): stop here.
            break
        txtDic = countWordSentence(txtClean)
        freqDict = create_freq_dict(txtClean)
        TF = computeTF(txtDic, freqDict)
        IDF = computeIDF(txtDic, freqDict)
        TFIDF_scores = computeTFIDF(TF, IDF)
        print(TFIDF_scores)
        TFIDF_scores = weigh_keywords(txtClean, TFIDF_scores, num_keywords)
        sentence_info = get_sent_score(TFIDF_scores, txtClean, txtDic)
        print(sentence_info)
        summary = get_summary(sentence_info)
        print(summary)
        print(len(texto))
        print(len(summary))
        # The next pass summarizes the summary just produced.
        texto = summary
    return summary
# Run a single summarization pass over 'texto.txt'.
resumir('texto.txt', 1)

NameError: name 'countWordSentence' is not defined

In [150]:
import pandas as pd

def resumir(file, n, num_keywords=7):
    """Summarize the text stored in *file* by repeated TF-IDF extraction.

    The text is summarized *n* times in a row, each pass taking the summary
    of the previous pass as its input. Every pass prints the TF-IDF scores,
    the per-sentence scores, the summary, and the lengths of the input text
    and of the summary.

    Args:
        file: path of the UTF-8 text file to summarize.
        n: number of summarization passes.
        num_keywords: how many of the most frequent words get their TF-IDF
            score doubled in every pass (defaults to 7).

    Returns:
        The summary produced by the last completed pass, or None when the
        file could not be read or no pass was run.
    """
    texto = getFile(file)
    if texto is None:
        # getFile has already reported the problem; nothing to summarize.
        return None

    summary = None
    for _ in range(n):
        txtClean = clean(texto)
        if not txtClean:
            # No sentences left (empty text or empty summary): stop here.
            break
        txtDic = countWordSentence(txtClean)
        freqDict = create_freq_dict(txtClean)
        TF = computeTF(txtDic, freqDict)
        IDF = computeIDF(txtDic, freqDict)
        TFIDF_scores = computeTFIDF(TF, IDF)
        print(TFIDF_scores)
        TFIDF_scores = weigh_keywords(txtClean, TFIDF_scores, num_keywords)
        sentence_info = get_sent_score(TFIDF_scores, txtClean, txtDic)
        print(sentence_info)
        summary = get_summary(sentence_info)
        print(summary)
        print(len(texto))
        print(len(summary))
        # The next pass summarizes the summary just produced.
        texto = summary
    return summary
# Run a single summarization pass over 'texto.txt'.
resumir('texto.txt', 1)

# Dictionaries: four hard-coded tables that share the same stemmed-word keys.
# NOTE(review): judging by the values, these look like per-word raw counts
# (diccionario1), term frequencies (diccionario2, multiples of 0.03125),
# IDF values (diccionario3) and TF-IDF scores (diccionario4) copied from a
# run of the summarizer - confirm against the code that consumes them.
diccionario1 = {'futuro': 1, 'ia': 2, 'hacia': 1, 'inteligencia': 3, 'artificial': 1, 'realment': 1, 'inteligent': 1, 'ramn': 1, 'lpez': 1, 'mntara': 1, 'instituto': 1, 'investigacin': 1, 'artifici': 2, 'iiia': 1, 'bellaterra': 1, 'espaa': 1, 'est': 1, 'captulo': 1, 'contien': 1, 'alguna': 1, 'reflexion': 1, 'sobr': 1}
diccionario2 = {'futuro': 0.03125, 'ia': 0.0625, 'hacia': 0.03125, 'inteligencia': 0.09375, 'artificial': 0.03125, 'realment': 0.03125, 'inteligent': 0.03125, 'ramn': 0.03125, 'lpez': 0.03125, 'mntara': 0.03125, 'instituto': 0.03125, 'investigacin': 0.03125, 'artifici': 0.0625, 'iiia': 0.03125, 'bellaterra': 0.03125, 'espaa': 0.03125, 'est': 0.03125, 'captulo': 0.03125, 'contien': 0.03125, 'alguna': 0.03125, 'reflexion': 0.03125, 'sobr': 0.03125}
diccionario3 = {'futuro': 3.912023005428146, 'ia': 3.8926049195710446, 'hacia': 3.901972669574645, 'inteligencia': 3.8940853047414787, 'artificial': 3.901972669574645, 'realment': 3.912023005428146, 'inteligent': 3.893859034800475, 'ramn': 3.912023005428146, 'lpez': 3.912023005428146, 'mntara': 3.912023005428146, 'instituto': 3.912023005428146, 'investigacin': 3.912023005428146, 'artifici': 3.901972669574645, 'iiia': 3.912023005428146, 'bellaterra': 3.912023005428146, 'espaa': 3.912023005428146, 'est': 3.8943680701894254, 'captulo': 3.912023005428146, 'contien': 3.912023005428146, 'alguna': 3.912023005428146, 'reflexion': 3.912023005428146, 'sobr': 3.8958936234982624}
diccionario4 = {'futuro': 0.24450143783925912, 'ia': 0.2432878074731903, 'hacia': 0.12193664592420765, 'inteligencia': 0.36507049731951363, 'artificial': 0.12193664592420765, 'realment': 0.24450143783925912, 'inteligent': 0.12168309483751484, 'ramn': 0.24450143783925912, 'lpez': 0.24450143783925912, 'mntara': 0.24450143783925912, 'instituto': 0.24450143783925912, 'investigacin': 0.24450143783925912, 'artifici': 0.2438732918484153, 'iiia': 0.12225071891962956, 'bellaterra': 0.12225071891962956, 'espaa': 0.12225071891962956, 'est': 0.12169900219341954, 'captulo': 0.12225071891962956, 'contien': 0.12225071891962956, 'alguna': 0.12225071891962956, 'reflexion': 0.12225071891962956, 'sobr': 0.1217466757343207}




         Palabra  Frecuencia1  Frecuencia2  Frecuencia3  Frecuencia4
0         futuro            1      0.03125     3.912023     0.244501
1             ia            2      0.06250     3.892605     0.243288
2          hacia            1      0.03125     3.901973     0.121937
3   inteligencia            3      0.09375     3.894085     0.365070
4     artificial            1      0.03125     3.901973     0.121937
5       realment            1      0.03125     3.912023     0.244501
6     inteligent            1      0.03125     3.893859     0.121683
7           ramn            1      0.03125     3.912023     0.244501
8           lpez            1      0.03125     3.912023     0.244501
9         mntara            1      0.03125     3.912023     0.244501
10     instituto            1      0.03125     3.912023     0.244501
11  investigacin            1      0.03125     3.912023     0.244501
12      artifici            2      0.06250     3.901973     0.243873
13          iiia            1     