In [8]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
import operator
import statistics
import math 
import re

stop_words = set(stopwords.words('spanish') + list(punctuation))
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def getFile(fname):
    try:
        with open(fname, 'r', encoding='utf-8') as f:
            text = f.read()
        return text
    except Exception as e:
        print(f"Error reading the file: {e}")
        return None

        
def clean2(textoTokenizado):
    newTokens = []
    for token in textoTokenizado:
        pattern = r"[^a-zA-Z0-9 \n.,()áéóíúÁÉÍÓÚ]"

        stripped = re.sub(pattern, "", token)

        # Change any whitespace to one space
        stripped = ' '.join(stripped.split())

        # Remove start and end white spaces
        stripped = stripped.strip()
        
        newTokens.append(stripped)
    return newTokens

def clean(textoTokenizado):
    newTokens = []
    for token in textoTokenizado:
        pattern = r"[^a-zA-Z0-9 \n]"

        stripped = re.sub(pattern, "", token)

        # Change any whitespace to one space
        stripped = ' '.join(stripped.split())

        # Remove start and end white spaces
        stripped = stripped.strip()
        
        newTokens.append(stripped)
    return newTokens

def global_frequency(txtClean):

    freq_table = {}
    text = ' '.join(txtClean) #join the cleaned sentences to get the text 
    words = word_tokenize(text)
    for word in words:
        word = word.lower()
        word = ps.stem(word)
        if word not in stop_words:
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
                
    return freq_table


def get_keywords(txtClean, n):

    freq_table = global_frequency(txtClean)
    #sort in descending order
    freq_table_sorted = sorted(freq_table.items(), key = operator.itemgetter(1), reverse = False) 
    #print(freq_table_sorted)
    keywords = []
    for i in range(0, n):  #taking first 5 most frequent words
        keywords.append(freq_table_sorted[i][0])
    return keywords

def weigh_keywords(txtClean, TFIDF_scores, n):

    keywords = get_keywords(txtClean, n)
    for temp_dict in TFIDF_scores:
        if temp_dict['word'] in keywords:
            temp_dict['TFIDF_score'] *= 2
    return TFIDF_scores

def get_sent_score(TFIDF_scores, text_sents, doc_info):

    sentence_info = []
    for doc in doc_info:

        sent_score = 0
        for i in range(0, len(TFIDF_scores)):
            temp_dict = TFIDF_scores[i]
            if doc['sentence_id'] == temp_dict['sentence_id']:
                sent_score += temp_dict['TFIDF_score']
        temp = {'sentence_id' : doc['sentence_id'], 'sent_score' : sent_score,
                'sentence' : text_sents[doc['sentence_id']-1]}
        sentence_info.append(temp)
    return sentence_info

def get_summary(sentence_info):
    sum = 0
    summary = []
    array = []
    for temp_dict in sentence_info:

        sum += temp_dict['sent_score']
    avg = sum/len(sentence_info) #computing the average tf-idf score
    for temp_dict in sentence_info:

        array.append(temp_dict['sent_score'])
    stdev = statistics.stdev(array) #computing standard deviation on the array   
    for sent in sentence_info:

        if(sent['sent_score']) >= avg: # + 1.5*stdev:
            summary.append(sent['sentence'])
    summary = '\n'.join(summary)
    return summary

In [3]:
def frequency(sentences):
    TFIDF_scores = []
    num_words_per_sentence = []
    doc_info = []
    datos = []

    for i, sent in enumerate(sentences, start=1):
        words = sent.split()
        count = len(words)
        temp = {'sentence_id': i, 'num_words': count, 'wordsfreq': {}}

        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word not in stop_words:
                if word in temp['wordsfreq']:
                    temp['wordsfreq'][word] += 1
                else:
                    temp['wordsfreq'][word] = 1

        for word, freq in temp['wordsfreq'].items():
            temp['wordsfreq'][word] = freq / count
        temp2 = {'sentece_id':i, 'word':word}
        doc_info.append(temp)
        num_words_per_sentence.append({'sentence_id': i, 'num_words': count})

    IDF_scores = []
    for word in set(word for doc in doc_info for word in doc['wordsfreq']):
        count = sum(1 for doc in doc_info if word in doc['wordsfreq'])
        idf_score = math.log(len(doc_info) / count)
        IDF_scores.append({'word': word, 'IDF_score': idf_score})

    # Calcular la puntuación TF-IDF para cada palabra en cada oración y almacenar en TFIDF_scores
    for doc in doc_info:
        for word, tf_score in doc['wordsfreq'].items():
            idf_score = next(item['IDF_score'] for item in IDF_scores if item['word'] == word)
            tfidf_score = tf_score * idf_score
            TFIDF_scores.append({'sentence_id': doc['sentence_id'], 'word': word, 'TFIDF_score': tfidf_score})
        

    return TFIDF_scores, num_words_per_sentence


In [9]:
#ps = PorterStemmer()


def resumir(file, n):
    texto = getFile(file)
    for i in range(0, n):
        textoTokenizado = sent_tokenize(texto)
        clean1 = clean2(textoTokenizado)
        txtClean = clean(clean1)
        TFIDF_scores,txtDic = frequency(txtClean)
        TFIDF_scores = weigh_keywords(txtClean, TFIDF_scores, 7)
        sentence_info = get_sent_score(TFIDF_scores, clean1, txtDic)
        #print(sentence_info)
        summary = get_summary(sentence_info)
        print(summary)
        #print(len(texto))
        #print(len(summary))
        texto = summary
resumir('texto.txt', 1)

El futuro de la IA hacia inteligencias artificiales realmente inteligentes Ramón López de Mántaras Instituto de Investigación en Inteligencia Artificial (IIIA), Bellaterra, Espaa Este capítulo contiene algunas reflexiones sobre inteligencia artificial (IA).
Se describen brevemente los principales modelos, insistiendo en la importancia de la corporalidad como aspecto clave para conseguir una IA de naturaleza general.
A continuación se aborda la necesidad de proporcionar a las máquinas conocimientos de sentido común que hagan posible avanzar hacia el ambicioso objetivo de construir IA de tipo general.
También se comentan las últimas tendencias en IA basadas en el análisis de grandes cantidades de datos que han hecho posibles progresos espectaculares en épocas muy recientes, con una alusión a las dificultades presentes hoy en los enfoques de la IA.
Por ejemplo, en el siglo XVII, Descartes se preguntó si un complejo sistema mecánico compuesto de engranajes, poleas y tubos podría, en princi

In [5]:
import re
text = "This is a string with special characters: !@#$%^&*()_+`-=[]{};:'\\|,.<>/?"
# Define a regular expression pattern that matches special characters
pattern = r"[^a-zA-Z0-9 ]"
result = re.sub(pattern, "", text)
print(result)


This is a string with special characters 
