In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
import operator
import statistics
import math 
import re

stop_words = set(stopwords.words('spanish') + list(punctuation))
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def getFile(fname):
    try:
        with open(fname, 'r', encoding='utf-8') as f:
            text = f.read()
        return text
    except Exception as e:
        print(f"Error reading the file: {e}")
        return None

        
def clean(txt):
    textoTokenizado = sent_tokenize(txt)
    newTokens = []
    for token in textoTokenizado:
        pattern = r"[^a-zA-Z0-9 \n]"

        stripped = re.sub(pattern, "", token)

        # Change any whitespace to one space
        stripped = ' '.join(stripped.split())

        # Remove start and end white spaces
        stripped = stripped.strip()
        
        newTokens.append(stripped)
    return newTokens

#unir conteo palabras,, freq palabras y tf oracion 

def countWordSentence(sentences):
    # Se crea un diccionario guardando el numero de palabras por oracion
    doc_info = []
    i = 0
    for sent in sentences:
        i += 1 
        words = sent.split()
        count = len(words)
        temp = {'sentence_id' : i, 'num_words' : count}
        doc_info.append(temp)
    return doc_info

def create_freq_dict(sents):
    i = 0
    freqDict_list = []
    for sent in sents:
        i += 1
        freq_dict = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word not in stop_words:
                if word in freq_dict:
                    freq_dict[word] += 1
                else:
                    freq_dict[word] = 1
                temp = {'sentence_id' : i, 'wordsfreq': freq_dict}
        freqDict_list.append(temp)
    return freqDict_list


def computeTF(doc_info, freqDict_list):
    TF_scores = []
    
    for tempDict in freqDict_list:
        id = tempDict['sentence_id']
        for k in tempDict['wordsfreq']:
            temp = {'sentence_id' : id,
                    'TF_score' : tempDict['wordsfreq'][k]/doc_info[id-1]['num_words'],
                     'word': k}
            TF_scores.append(temp)
    return TF_scores



def computeIDF(doc_info, freqDict_list):
    """
    idf = ln(total number of docs/number of docs with term in it)
    """
    
    IDF_scores = []
    counter = 0
    for dict in freqDict_list:
        counter += 1
        for k in dict['wordsfreq'].keys():
            count = sum([k in tempDict['wordsfreq'] for tempDict in freqDict_list])
            temp = {'sentence_id' : counter, 'IDF_score' : math.log(len(doc_info)/count), 'word' : k}
    
            IDF_scores.append(temp)
                
    return IDF_scores

def computeTFIDF(TF_scores, IDF_scores):
    """
    TFIDF is computed by multiplying the coressponding
    TF and IDF values of each term. 
    """
    TFIDF_scores = []
    for j in IDF_scores:
        for i in TF_scores:
            if j['word'] == i['word'] and j['sentence_id'] == i['sentence_id']:
                temp = {'sentence_id' : i['sentence_id'],
                        'TFIDF_score' : j['IDF_score']*i['TF_score'],
                       'word' : i['word']}
        TFIDF_scores.append(temp)
    return TFIDF_scores

#se une hasta aqui

def global_frequency(txtClean):
    """
    This function returns a dictionary with the frequency 
    count of every word in the text
    """
    freq_table = {}
    text = ' '.join(txtClean) #join the cleaned sentences to get the text 
    words = word_tokenize(text)
    for word in words:
        word = word.lower()
        word = ps.stem(word)
        if word not in stop_words:
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
                
    return freq_table


def get_keywords(txtClean, n):
    """
    This function gets the top 5 most
    frequently occuring words in the whole text
    and stores them as keywords
    """
    freq_table = global_frequency(txtClean)
    #sort in descending order
    freq_table_sorted = sorted(freq_table.items(), key = operator.itemgetter(1), reverse = False) 
    #print(freq_table_sorted)
    keywords = []
    for i in range(0, n):  #taking first 5 most frequent words
        keywords.append(freq_table_sorted[i][0])
    return keywords

def weigh_keywords(txtClean, TFIDF_scores, n):
    """
    This function doubles the TFIDF score
    of the words that are keywords
    """
    keywords = get_keywords(txtClean, n)
    for temp_dict in TFIDF_scores:
        if temp_dict['word'] in keywords:
            temp_dict['TFIDF_score'] *= 2
    return TFIDF_scores

def get_sent_score(TFIDF_scores, text_sents, doc_info):
    """
    This function prints out the summary and returns the 
    score of each sentence in a list.
    
    The score of a sentence is calculated by adding the TFIDF
    scores of the words that make up the sentence.
    """
    sentence_info = []
    for doc in doc_info:
        """
        This loops through each document(sentence)
        and calculates their 'sent_score'
        """
        sent_score = 0
        for i in range(0, len(TFIDF_scores)):
            temp_dict = TFIDF_scores[i]
            if doc['sentence_id'] == temp_dict['sentence_id']:
                sent_score += temp_dict['TFIDF_score']
        temp = {'sentence_id' : doc['sentence_id'], 'sent_score' : sent_score,
                'sentence' : text_sents[doc['sentence_id']-1]}
        sentence_info.append(temp)
    return sentence_info

def get_summary(sentence_info):
    sum = 0
    summary = []
    array = []
    for temp_dict in sentence_info:
        """
        This loop gets the sum of scores
        of all the sentences.
        """
        sum += temp_dict['sent_score']
    avg = sum/len(sentence_info) #computing the average tf-idf score
    for temp_dict in sentence_info:
        """
        This loop gets the sentence scores 
        and stores them in an array.
        """
        array.append(temp_dict['sent_score'])
    stdev = statistics.stdev(array) #computing standard deviation on the array   
    for sent in sentence_info:
        """
        This loop is for getting the sumamry by 
        extracting sentences by an if clause
        """
        if(sent['sent_score']) >= avg: # + 1.5*stdev:
            summary.append(sent['sentence'])
    summary = '\n'.join(summary)
    return summary

In [3]:
def resumir(file, n):
    texto = getFile(file)
    for i in range(0, n):
        txtClean = clean(texto)
        txtDic = countWordSentence(txtClean)
        freqDict = create_freq_dict(txtClean)
        TF = computeTF(txtDic,freqDict)
        IDF = computeIDF(txtDic,freqDict)
        TFIDF_scores = computeTFIDF(TF, IDF)
        print(TFIDF_scores)
        TFIDF_scores = weigh_keywords(txtClean, TFIDF_scores, 7)
        sentence_info = get_sent_score(TFIDF_scores, txtClean, txtDic)
        print(sentence_info)
        summary = get_summary(sentence_info)
        print(summary)
        print(len(texto))
        print(len(summary))
        texto = summary
resumir('ia.txt', 2)

[{'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'actual'}, {'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'sociedad'}, {'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'cambiant'}, {'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'sector'}, {'sentence_id': 1, 'TFIDF_score': 0.040689344024744806, 'word': 'distribución'}, {'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'enfrenta'}, {'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'numeroso'}, {'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'desafío'}, {'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'presion'}, {'sentence_id': 1, 'TFIDF_score': 0.040689344024744806, 'word': 'competitiva'}, {'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'asociada'}, {'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'lucha'}, {'sentence_id': 1, 'TFIDF_score': 0.08137868804948961, 'word': 'manten'}, {'sentence_id

In [5]:
ps = PorterStemmer()

def frequency(sentences):
    TFIDF_scores = []
    num_words_per_sentence = []
    doc_info = []

    for i, sent in enumerate(sentences, start=1):
        words = sent.split()
        count = len(words)
        temp = {'sentence_id': i, 'num_words': count, 'wordsfreq': {}}

        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word not in stop_words:
                if word in temp['wordsfreq']:
                    temp['wordsfreq'][word] += 1
                else:
                    temp['wordsfreq'][word] = 1

        for word, freq in temp['wordsfreq'].items():
            temp['wordsfreq'][word] = freq / count

        doc_info.append(temp)
        num_words_per_sentence.append({'sentence_id': i, 'num_words': count})

    IDF_scores = []
    for word in set(word for doc in doc_info for word in doc['wordsfreq']):
        count = sum(1 for doc in doc_info if word in doc['wordsfreq'])
        idf_score = math.log(len(doc_info) / count)
        IDF_scores.append({'word': word, 'IDF_score': idf_score})

    # Calcular la puntuación TF-IDF para cada palabra en cada oración y almacenar en TFIDF_scores
    for doc in doc_info:
        for word, tf_score in doc['wordsfreq'].items():
            idf_score = next(item['IDF_score'] for item in IDF_scores if item['word'] == word)
            tfidf_score = tf_score * idf_score
            TFIDF_scores.append({'sentence_id': doc['sentence_id'], 'word': word, 'TFIDF_score': tfidf_score})

    return TFIDF_scores, num_words_per_sentence


def resumir(file, n):
    texto = getFile(file)
    for i in range(0, n):
        txtClean = clean(texto)
        TFIDF_scores,txtDic = frequency(txtClean)
        TFIDF_scores = weigh_keywords(txtClean, TFIDF_scores, 7)
        sentence_info = get_sent_score(TFIDF_scores, txtClean, txtDic)
        #print(sentence_info)
        summary = get_summary(sentence_info)
        print(summary)
        #print(len(texto))
        #print(len(summary))
        texto = summary
resumir('texto.txt', 1)

El futuro de la IA hacia inteligencias artificiales realmente inteligentes Ramn Lpez de Mntaras Instituto de Investigacin en Inteligencia Artificial IIIA Bellaterra Espaa Este captulo contiene algunas reflexiones sobre inteligencia artificial IA
Se describen brevemente los principales modelos insistiendo en la importancia de la corporalidad como aspecto clave para conseguir una IA de naturaleza general
A continuacin se aborda la necesidad de proporcionar a las mquinas conocimientos de sentido comn que hagan posible avanzar hacia el ambicioso objetivo de construir IA de tipo general
Tambin se comentan las ltimas tendencias en IA basadas en el anlisis de grandes cantidades de datos que han hecho posibles progresos espectaculares en pocas muy recientes con una alusin a las dificultades presentes hoy en los enfoques de la IA
Por ejemplo en el siglo XVII Descartes se pregunt si un complejo sistema mecnico compuesto de engranajes poleas y tubos podra en principio emular el pensamiento
Dos si

In [1]:
import re
text = "This is a string with special characters: !@#$%^&*()_+`-=[]{};:'\\|,.<>/?"
# Define a regular expression pattern that matches special characters
pattern = r"[^a-zA-Z0-9 ]"
result = re.sub(pattern, "", text)
print(result)


This is a string with special characters 
