In [2]:

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import math
import operator
import statistics
from string import punctuation
# Tokens ignored when building frequency tables: NLTK's Spanish stop-word list
# plus every individual character of string.punctuation.
stop_words = set(stopwords.words('spanish') + list(punctuation))
from nltk.stem.porter import PorterStemmer
# NOTE(review): `ps` is not referenced by any cell visible here - confirm
# whether the stemmer is still needed.
ps = PorterStemmer()

In [20]:
def getFile(fname):
    """Return the whole content of the UTF-8 text file at ``fname``.

    Best-effort reader: any exception raised while opening or reading the
    file is caught, reported on stdout, and signalled by returning ``None``.

    Args:
        fname: Path of the file to read.

    Returns:
        The file content as a string, or ``None`` if reading failed.
    """
    try:
        with open(fname, mode='r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as err:
        print(f"Error reading the file: {err}")
    return None

In [4]:
def read_five_txt_files(file_paths):
    """Read every text file in ``file_paths`` and return their contents.

    Despite the name, the function handles any number of paths, not only five.

    Args:
        file_paths: An iterable of file paths.

    Returns:
        A list of strings, one per path and in the same order, each holding
        the full content of the corresponding file.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    for file_path in file_paths:
        # Decode explicitly as UTF-8 (as getFile does) instead of relying on
        # the platform default, which mangles accented Spanish text on
        # systems whose locale encoding is not UTF-8.
        with open(file_path, "r", encoding="utf-8") as f:
            texts.append(f.read())
    return texts


In [5]:
def remove_string_special_characters(s, convert_to_lower=True):
    """Strip punctuation and normalise whitespace in ``s``.

    Args:
        s: The text to clean.
        convert_to_lower: When true (the default) the result is lower-cased.

    Returns:
        ``s`` with every character that is neither a word character nor
        whitespace deleted, underscores deleted, runs of whitespace collapsed
        to a single space, and leading/trailing whitespace removed.
    """
    # Delete (not replace with a space) special characters and underscores.
    # Raw string literals keep \w and \s as regex escapes rather than invalid
    # Python string escapes, which trigger a warning on current interpreters.
    stripped = re.sub(r'[^\w\s]|_', '', s)

    # Collapse any run of whitespace into one space, then trim both ends.
    stripped = re.sub(r'\s+', ' ', stripped).strip()

    if convert_to_lower:
        stripped = stripped.lower()

    return stripped


In [6]:
def count_words(text):
    """Return how many tokens ``word_tokenize`` produces for ``text``."""
    return sum(1 for _ in word_tokenize(text))

In [7]:
def get_doc(text_sents_clean):
    """Describe each sentence by its 1-based position and its token count.

    Args:
        text_sents_clean: Iterable of sentence strings.

    Returns:
        A list of ``{'doc_id': position, 'doc_length': token_count}`` dicts,
        one per sentence, where the count comes from ``count_words``.
    """
    return [
        {'doc_id': position, 'doc_length': count_words(sentence)}
        for position, sentence in enumerate(text_sents_clean, start=1)
    ]

In [8]:
def create_freq_dict(sents):
    """Build a per-sentence frequency table of the non-stop-word tokens.

    Args:
        sents: Iterable of sentence strings. Tokens are compared against
            ``stop_words`` as-is (no lower-casing happens here).

    Returns:
        A list with exactly one ``{'doc_id': position, 'freq_dict': counts}``
        entry per sentence, where ``position`` is 1-based and ``counts`` maps
        each token to its number of occurrences. A sentence made only of stop
        words (or an empty one) yields an empty ``freq_dict``.
    """
    freqDict_list = []
    for doc_id, sent in enumerate(sents, start=1):
        freq_dict = {}
        for word in word_tokenize(sent):
            if word not in stop_words:
                freq_dict[word] = freq_dict.get(word, 0) + 1
        # Build the record once per sentence, outside the token loop: creating
        # it only when a non-stop word was seen left it undefined (NameError)
        # or stale (previous sentence's record appended again) otherwise.
        freqDict_list.append({'doc_id': doc_id, 'freq_dict': freq_dict})
    return freqDict_list

In [9]:
def global_frequency(text_sents_clean):
    """Count every non-stop-word token across the whole text.

    The sentences are joined with single spaces, tokenised, and each token is
    lower-cased before being checked against ``stop_words`` and counted.

    Returns:
        A dict mapping each lower-cased token to its number of occurrences.
    """
    whole_text = ' '.join(text_sents_clean)
    counts = {}
    for token in word_tokenize(whole_text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        counts[lowered] = counts.get(lowered, 0) + 1
    return counts

In [10]:
def get_keywords(text_sents_clean, top_n=5):
    """Return the most frequent non-stop-word terms of the text.

    Args:
        text_sents_clean: Iterable of sentence strings, passed on to
            ``global_frequency``.
        top_n: Maximum number of keywords to return (default 5).

    Returns:
        Up to ``top_n`` terms ordered by decreasing frequency. Fewer are
        returned when the text has fewer distinct terms; indexing a fixed
        five positions raised IndexError on such short texts.
    """
    freq_table = global_frequency(text_sents_clean)
    # Sort by count, highest first; the sort is stable, so ties keep the
    # order in which the terms first appeared.
    freq_table_sorted = sorted(freq_table.items(), key=operator.itemgetter(1), reverse=True)
    return [word for word, _ in freq_table_sorted[:top_n]]

In [11]:
def computeTF(doc_info, freqDict_list):
    """Compute the term frequency of every term in every document.

    tf = (occurrences of the term in the doc) / (doc_length of the doc)

    Args:
        doc_info: List of dicts with a ``'doc_length'`` entry; the record for
            document ``n`` is read from index ``n - 1``.
        freqDict_list: List of ``{'doc_id', 'freq_dict'}`` dicts.

    Returns:
        A flat list of ``{'doc_id', 'TF_score', 'key'}`` dicts, one per
        (document, term) pair, in input order.
    """
    TF_scores = []

    for entry in freqDict_list:
        doc_id = entry['doc_id']
        for term, occurrences in entry['freq_dict'].items():
            TF_scores.append({
                'doc_id': doc_id,
                'TF_score': occurrences / doc_info[doc_id - 1]['doc_length'],
                'key': term,
            })
    return TF_scores

In [12]:
def computeIDF(doc_info, freqDict_list):
    """Compute the inverse document frequency of every term in every document.

    idf = ln(total number of docs / number of docs with the term in it)

    Args:
        doc_info: List with one entry per document; only its length is used.
        freqDict_list: List of dicts with a ``'freq_dict'`` entry mapping
            terms to counts.

    Returns:
        A flat list of ``{'doc_id', 'IDF_score', 'key'}`` dicts, one per
        (document, term) pair, where ``doc_id`` is the 1-based position of
        the document in ``freqDict_list``.
    """
    # Count once, up front, how many documents contain each term, instead of
    # rescanning every document for every term.
    docs_with_term = {}
    for entry in freqDict_list:
        for term in entry['freq_dict']:
            docs_with_term[term] = docs_with_term.get(term, 0) + 1

    total_docs = len(doc_info)
    IDF_scores = []
    for position, entry in enumerate(freqDict_list, start=1):
        for term in entry['freq_dict']:
            IDF_scores.append({
                'doc_id': position,
                'IDF_score': math.log(total_docs / docs_with_term[term]),
                'key': term,
            })

    return IDF_scores

In [13]:
def computeTFIDF(TF_scores, IDF_scores):
    """Multiply the matching TF and IDF values of each (document, term) pair.

    Args:
        TF_scores: List of ``{'doc_id', 'TF_score', 'key'}`` dicts.
        IDF_scores: List of ``{'doc_id', 'IDF_score', 'key'}`` dicts.

    Returns:
        A list of ``{'doc_id', 'TFIDF_score', 'key'}`` dicts, one for each
        entry of ``IDF_scores`` that has a TF entry with the same ``doc_id``
        and ``key``, in ``IDF_scores`` order. IDF entries without a TF match
        are skipped; previously they re-appended the last matched record, or
        raised NameError when no record had matched yet.
    """
    # Index the TF entries by (doc_id, term) so each IDF entry is resolved
    # with one lookup instead of a scan. If a pair occurs more than once the
    # last TF entry wins, as it did with the nested scan.
    tf_by_doc_and_term = {(tf['doc_id'], tf['key']): tf for tf in TF_scores}

    TFIDF_scores = []
    for idf in IDF_scores:
        tf = tf_by_doc_and_term.get((idf['doc_id'], idf['key']))
        if tf is None:
            continue
        TFIDF_scores.append({
            'doc_id': tf['doc_id'],
            'TFIDF_score': idf['IDF_score'] * tf['TF_score'],
            'key': tf['key'],
        })
    return TFIDF_scores

In [14]:
def weigh_keywords(TFIDF_scores,text_sents_clean):
    """Double the TF-IDF score of every entry whose term is a keyword.

    The keywords are those returned by ``get_keywords(text_sents_clean)``.
    The dicts in ``TFIDF_scores`` are modified in place and the same list is
    returned.
    """
    top_terms = get_keywords(text_sents_clean)
    for entry in TFIDF_scores:
        if entry['key'] not in top_terms:
            continue
        entry['TFIDF_score'] *= 2
    return TFIDF_scores

In [15]:
def tokenizador(texto):
    """Split ``texto`` into sentences.

    A sentence ends at a period followed by a space, or at a line break. The
    separator itself is discarded and empty fragments are dropped.

    Args:
        texto: The text to split.

    Returns:
        A list with the non-empty sentence strings, in order of appearance.
    """
    # Comparing the text one character at a time could never match the
    # two-character separator '. ', so only line breaks used to split.
    # Normalising '. ' to '\n' first lets a single split honour both.
    fragmentos = texto.replace('. ', '\n').split('\n')
    return [fragmento for fragmento in fragmentos if fragmento]

In [16]:
def get_sent_score(TFIDF_scores, text_sents, doc_info):
    """Add up the TF-IDF scores of each sentence.

    Args:
        TFIDF_scores: List of dicts with ``'doc_id'`` and ``'TFIDF_score'``.
        text_sents: Sentence strings; the sentence of document ``n`` is read
            from index ``n - 1``.
        doc_info: List of dicts with a ``'doc_id'`` entry, one per sentence.

    Returns:
        A list of ``{'doc_id', 'sent_score', 'sentence'}`` dicts in
        ``doc_info`` order; ``sent_score`` is 0 for a sentence with no scores.
    """
    sentence_info = []
    for doc in doc_info:
        doc_id = doc['doc_id']

        # Plain left-to-right accumulation of the scores of this document.
        total = 0
        for entry in TFIDF_scores:
            if doc_id == entry['doc_id']:
                total += entry['TFIDF_score']

        sentence_info.append({
            'doc_id': doc_id,
            'sent_score': total,
            'sentence': text_sents[doc_id - 1],
        })

    return sentence_info

In [17]:
def eliminar_puntuacion_y_stopwords(texto):
    """Remove punctuation characters and Spanish stop words from ``texto``.

    Args:
        texto: The text to process.

    Returns:
        The remaining words joined by single spaces. Words keep their
        original casing; the stop-word check itself is case-insensitive.
    """
    # Remove punctuation. `punctuation` is the name imported at the top of
    # the notebook; the module name `string` is never imported, so the old
    # `string.punctuation` reference raised NameError.
    texto_sin_puntuacion = "".join(caracter for caracter in texto if caracter not in punctuation)

    # Tokenize on whitespace.
    tokens = texto_sin_puntuacion.split()

    # Drop stop words.
    stopwords_spanish = set(stopwords.words('spanish'))
    tokens_sin_stopwords = [palabra for palabra in tokens if palabra.lower() not in stopwords_spanish]

    # Join the remaining words back into a single string.
    return " ".join(tokens_sin_stopwords)

In [18]:
def get_summary(sentence_info):
    """Build the summary from the sentences scoring at least the average.

    Args:
        sentence_info: List of dicts with ``'sent_score'`` and ``'sentence'``.

    Returns:
        The selected sentences, in their original order, joined by newlines.
        An empty input yields an empty string.
    """
    # Nothing to average: return an empty summary instead of dividing by zero.
    if not sentence_info:
        return ''

    # Average score over all sentences. The accumulator is named `total` so
    # the builtin `sum` is not shadowed.
    total = 0
    for entry in sentence_info:
        total += entry['sent_score']
    avg = total / len(sentence_info)

    # The selection threshold is the plain average. The standard deviation
    # was computed but never used, and statistics.stdev raises
    # StatisticsError for a single sentence, so it is no longer calculated.
    selected = [entry['sentence'] for entry in sentence_info if entry['sent_score'] >= avg]
    return '\n'.join(selected)

In [23]:
# Load the document to summarise.
with open('prueba2_ia.txt', 'r', encoding='utf-8') as file:
    texto = file.read()

# Split the raw text into sentences.
oraciones = tokenizador(texto)

# Clean each sentence separately so cleaned and raw sentences stay aligned
# by position.
oraciones_lim = [remove_string_special_characters(s) for s in oraciones]
# Token count of every cleaned sentence (get_doc counts all tokens; stop
# words are not removed at this step).
doc_info = get_doc(oraciones_lim)

freqDict_list = create_freq_dict(oraciones_lim)
TF_scores = computeTF(doc_info, freqDict_list)
IDF_scores = computeIDF(doc_info, freqDict_list)

TFIDF_scores = computeTFIDF(TF_scores, IDF_scores)
TFIDF_scores = weigh_keywords(TFIDF_scores, oraciones_lim)
# Score against the raw sentences so the summary shows the original text
# rather than the cleaned version.
sentence_info = get_sent_score(TFIDF_scores, oraciones, doc_info)

summary = get_summary(sentence_info)
print(summary)

Texto I
Texto n
Texto t
Texto e
Texto l
Texto i
Texto g
Texto e
Texto n
Texto c
Texto i
Texto a
Texto  
Texto a
Texto r
Texto t
Texto i
Texto f
Texto i
Texto c
Texto i
Texto a
Texto l
Texto  
Texto g
Texto e
Texto n
Texto e
Texto r
Texto a
Texto l
Texto  
Texto (
Texto A
Texto G
Texto I
Texto )
Texto 

Texto E
Texto s
Texto t
Texto a
Texto  
Texto c
Texto a
Texto t
Texto e
Texto g
Texto o
Texto r
Texto í
Texto a
Texto  
Texto -
Texto A
Texto r
Texto t
Texto i
Texto f
Texto i
Texto c
Texto i
Texto a
Texto l
Texto  
Texto G
Texto e
Texto n
Texto e
Texto r
Texto a
Texto l
Texto  
Texto I
Texto n
Texto t
Texto e
Texto l
Texto l
Texto i
Texto g
Texto e
Texto n
Texto c
Texto e
Texto -
Texto  
Texto s
Texto e
Texto  
Texto a
Texto l
Texto c
Texto a
Texto n
Texto z
Texto a
Texto  
Texto c
Texto u
Texto a
Texto n
Texto d
Texto o
Texto  
Texto u
Texto n
Texto a
Texto  
Texto m
Texto á
Texto q
Texto u
Texto i
Texto n
Texto a
Texto  
Texto a
Texto d
Texto q
Texto u
Texto i
Texto e
Texto r
Texto e
