In [79]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import math
import operator
import statistics
from string import punctuation
stop_words = set(stopwords.words('spanish') + list(punctuation))

In [80]:
def get_text_from_file(fname):
    """
    Get file from text doc
    """
    f=open(fname,'r')
    text=f.readlines()
    text=''.join(text) #converting the list to type str
    return text

In [81]:
def read_five_txt_files(file_paths):
  """Reads the contents of five text files and returns a list of strings.
  Args:
    file_paths: A list of five file paths.
  Returns:
    A list of strings, where each string is the contents of one of the text files.
  """
  texts = []
  for file_path in file_paths:
    with open(file_path, "r") as f:
      texts.append(f.read())
  return texts


In [82]:
def remove_string_special_characters(s, convert_to_lower=True):
    """
    This function removes special characters from within a string.
    parameters: 
        s(str): single input string.
    return: 
        stripped(str): A string with special characters removed.
    """

    # Replace special character with ' '
    stripped = re.sub('[^\w\s]', '', s)
    stripped = re.sub('_', '', stripped)

    # Change any whitespace to one space
    stripped = re.sub('\s+', ' ', stripped)
    

    # Remove start and end white spaces
    stripped = stripped.strip()
    if convert_to_lower:
        stripped = stripped.lower()
    
    return stripped

In [83]:
def count_words(text):
    """This function returns the 
    total number of words in the input text.
    """
    count = 0
    words = word_tokenize(text)
    for word in words:
        count += 1
    return count

In [84]:
def get_doc(text_sents_clean):
    """
    this function splits the text into sentences and
    considering each sentence as a document, calculates the 
    total word count of each.
    """
    doc_info = []
    i = 0
    for sent in text_sents_clean:
        i += 1 
        count = count_words(sent)
        temp = {'doc_id' : i, 'doc_length' : count}
        doc_info.append(temp)
    return doc_info

In [85]:
def create_freq_dict(sents):
    """
    This function creates a frequency dictionary
    of each document that contains words other than
    stop words.
    """
    i = 0
    freqDict_list = []
    for sent in sents:
        i += 1
        freq_dict = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            if word not in stop_words:
                if word in freq_dict:
                    freq_dict[word] += 1
                else:
                    freq_dict[word] = 1
                temp = {'doc_id' : i, 'freq_dict': freq_dict}
        freqDict_list.append(temp)
    return freqDict_list

In [86]:
def global_frequency(text_sents_clean):
    """
    This function returns a dictionary with the frequency 
    count of every word in the text
    """
    freq_table = {}
    text = ' '.join(text_sents_clean) #join the cleaned sentences to get the text 
    words = word_tokenize(text)
    for word in words:
        word = word.lower()
        if word not in stop_words:
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
    return freq_table

In [87]:
def get_keywords(text_sents_clean):
    """
    This function gets the top 5 most
    frequently occuring words in the whole text
    and stores them as keywords
    """
    freq_table = global_frequency(text_sents_clean)
    #sort in descending order
    freq_table_sorted = sorted(freq_table.items(), key = operator.itemgetter(1), reverse = True) 
    keywords = []
    for i in range(0, 5):  #taking first 5 most frequent words
        keywords.append(freq_table_sorted[i][0])
    return keywords

In [88]:
def computeTF(doc_info, freqDict_list):
    """
    tf = (frequency of the term in the doc/total number of terms in the doc)
    """
    TF_scores = []
    
    for tempDict in freqDict_list:
        id = tempDict['doc_id']
        for k in tempDict['freq_dict']:
            temp = {'doc_id' : id,
                    'TF_score' : tempDict['freq_dict'][k]/doc_info[id-1]['doc_length'],
                   'key' : k}
            TF_scores.append(temp)
    return TF_scores

In [89]:
def computeIDF(doc_info, freqDict_list):
    """
    idf = ln(total number of docs/number of docs with term in it)
    """
    
    IDF_scores = []
    counter = 0
    for dict in freqDict_list:
        counter += 1
        for k in dict['freq_dict'].keys():
            count = sum([k in tempDict['freq_dict'] for tempDict in freqDict_list])
            temp = {'doc_id' : counter, 'IDF_score' : math.log(len(doc_info)/count), 'key' : k}
    
            IDF_scores.append(temp)
                
    return IDF_scores

In [90]:
def computeTFIDF(TF_scores, IDF_scores):
    """
    TFIDF is computed by multiplying the coressponding
    TF and IDF values of each term. 
    """
    TFIDF_scores = []
    for j in IDF_scores:
        for i in TF_scores:
            if j['key'] == i['key'] and j['doc_id'] == i['doc_id']:
                temp = {'doc_id' : i['doc_id'],
                        'TFIDF_score' : j['IDF_score']*i['TF_score'],
                       'key' : i['key']}
        TFIDF_scores.append(temp)
    return TFIDF_scores

In [91]:
def weigh_keywords(TFIDF_scores):
    """
    This function doubles the TFIDF score
    of the words that are keywords
    """
    keywords = get_keywords(text_sents_clean)
    for temp_dict in TFIDF_scores:
        if temp_dict['key'] in keywords:
            temp_dict['TFIDF_score'] *= 2
    return TFIDF_scores

In [104]:
def tokenizador(texto):
    oraciones= ['. ','\n']
    tokens = []

    token_actual = ''
    for caracter in texto:
        #if caracter == ' ' or caracter in caracteres:
        if caracter in oraciones:
            if token_actual:
                tokens.append(token_actual)
                token_actual = ''
        else:
            token_actual += caracter

    if token_actual:
        tokens.append(token_actual)

    return tokens

In [93]:
def get_sent_score(TFIDF_scores, text_sents, doc_info):
    """
    This function prints out the summary and returns the 
    score of each sentence in a list.
    
    The score of a sentence is calculated by adding the TFIDF
    scores of the words that make up the sentence.
    """
    sentence_info = []
    for doc in doc_info:
        """
        This loops through each document(sentence)
        and calculates their 'sent_score'
        """
        sent_score = 0
        for i in range(0, len(TFIDF_scores)):
            temp_dict = TFIDF_scores[i]
            if doc['doc_id'] == temp_dict['doc_id']:
                sent_score += temp_dict['TFIDF_score']
        temp = {'doc_id' : doc['doc_id'], 'sent_score' : sent_score,
                'sentence' : text_sents[doc['doc_id']-1]}
        sentence_info.append(temp)

    return sentence_info

In [94]:
def eliminar_puntuacion_y_stopwords(texto):
    # Eliminar signos de puntuación
    texto_sin_puntuacion = "".join([caracter for caracter in texto if caracter not in string.punctuation])
    # Tokenizar el texto
    tokens = texto_sin_puntuacion.split()

    
    # Eliminar stopwords
    stopwords_spanish = set(stopwords.words('spanish'))
    tokens_sin_stopwords = [palabra for palabra in tokens if palabra.lower() not in stopwords_spanish]

    # Unir las palabras nuevamente en una cadena de texto
    texto_procesado = " ".join(tokens_sin_stopwords)

    return texto_procesado

In [95]:
def get_summary(sentence_info):
    sum = 0
    summary = []
    array = []
    for temp_dict in sentence_info:
        """
        This loop gets the sum of scores
        of all the sentences.
        """
        sum += temp_dict['sent_score']
    avg = sum/len(sentence_info) #computing the average tf-idf score
    for temp_dict in sentence_info:
        """
        This loop gets the sentence scores 
        and stores them in an array.
        """
        array.append(temp_dict['sent_score'])
    stdev = statistics.stdev(array) #computing standard deviation on the array   
    for sent in sentence_info:
        """
        This loop is for getting the sumamry by 
        extracting sentences by an if clause
        """
        if(sent['sent_score']) >= avg: # + 1.5*stdev:
            summary.append(sent['sentence'])
    summary = '\n'.join(summary)
    return summary

In [111]:
with open('prueba2_ia.txt', 'r', encoding='utf-8') as file:
    texto = file.read()

oraciones = tokenizador(texto)
#oraciones = sent_tokenize(text)

oraciones_lim = [remove_string_special_characters(s) for s in oraciones]#Se va ingresando por oración 
#for i in range(len(oraciones_lim)):
#    print("Sentence {}: {}".format(i+1, oraciones_lim[i]))

doc_info = get_doc(oraciones_lim) # Se encarga  de odtener la longitud de cada uno de las oraciones eliminando las Stopwords
#for i in range(len(oraciones_lim)):
#    print("Sentence {}: {}".format(i+1, doc_info[i]))

freqDict_list = create_freq_dict(oraciones_lim)
#for i in range(len(oraciones_lim)):
#    print("Sentence {}: {}".format(i+1, freqDict_list[i]))
TF_scores = computeTF(doc_info, freqDict_list)
#for i in range(len(TF_scores)):
#    print("Sentence {}: {}".format(i+1, TF_scores[i]))
IDF_scores = computeIDF(doc_info, freqDict_list)
for i in range(len(IDF_scores)):
    print("Sentence {}: {}".format(i+1, IDF_scores[i]))

TFIDF_scores = computeTFIDF(TF_scores, IDF_scores)
TFIDF_scores = weigh_keywords(TFIDF_scores)
sentence_info = get_sent_score(TFIDF_scores, oraciones, doc_info)

doc_info = get_doc(oraciones_lim)
summary = get_summary(sentence_info)
print(summary)

Sentence 1: {'doc_id': 1, 'IDF_score': 1.6094379124341003, 'key': 'inteligencia'}
Sentence 2: {'doc_id': 1, 'IDF_score': 1.6094379124341003, 'key': 'artificial'}
Sentence 3: {'doc_id': 1, 'IDF_score': 2.0149030205422647, 'key': 'general'}
Sentence 4: {'doc_id': 1, 'IDF_score': 2.70805020110221, 'key': 'agi'}
Sentence 5: {'doc_id': 2, 'IDF_score': 2.70805020110221, 'key': 'categoría'}
Sentence 6: {'doc_id': 2, 'IDF_score': 1.6094379124341003, 'key': 'artificial'}
Sentence 7: {'doc_id': 2, 'IDF_score': 2.0149030205422647, 'key': 'general'}
Sentence 8: {'doc_id': 2, 'IDF_score': 2.70805020110221, 'key': 'intelligence'}
Sentence 9: {'doc_id': 2, 'IDF_score': 2.70805020110221, 'key': 'alcanza'}
Sentence 10: {'doc_id': 2, 'IDF_score': 2.70805020110221, 'key': 'máquina'}
Sentence 11: {'doc_id': 2, 'IDF_score': 2.70805020110221, 'key': 'adquiere'}
Sentence 12: {'doc_id': 2, 'IDF_score': 2.70805020110221, 'key': 'capacidades'}
Sentence 13: {'doc_id': 2, 'IDF_score': 2.70805020110221, 'key': 'co