### Library Imports

In [1]:
from nltk import sent_tokenize, PorterStemmer, word_tokenize
from nltk.corpus import stopwords

### Import and Tokenize the Text File

In [60]:
text = open("D:/ISDC Work/Python Codes/Text_Mining_SJCC/text_data/new_t1.txt").read()

In [61]:
sentences = word_tokenize(text)
print(sentences)

['This', 'story', 'takes', 'place', 'in', 'its', 'own', 'universe', '.', 'It', 'has', 'no', 'connection', 'to', 'any', 'of', 'the', 'DC', 'films', 'that', 'have', 'come', 'before', 'it', '.', 'We', 'see', 'it', 'as', 'a', 'classic', 'Warner', 'Bros.', 'movie', '.', 'Gritty', ',', 'intimate', 'and', 'oddly', 'funny', ',', 'the', 'characters', 'live', 'in', 'the', 'real', 'world', 'and', 'the', 'stakes', 'are', 'personal', '.', 'Although', 'it', 'is', 'never', 'mentioned', 'in', 'the', 'film', ',', 'this', 'story', 'takes', 'place', 'in', 'the', 'past', '.', 'Let', "'s", 'call', 'it', '1981', '.', 'It', "'s", 'a', 'troubled', 'time', '.', 'The', 'crime', 'rate', 'in', 'Gotham', 'is', 'at', 'record', 'highs', '.', 'A', 'garbage', 'strike', 'has', 'crippled', 'the', 'city', 'for', 'the', 'past', 'six', 'weeks', '.', 'And', 'the', 'divide', 'between', 'the', '``', 'haves', "''", 'and', 'the', '``', 'have-', 'nots', "''", 'is', 'palpable', '.', 'Dreams', 'are', 'beyond', 'reach', ',', 'slipp

In [62]:
total_documents = len(sentences)
total_documents

1227

### Creating Frequency Matrix for the tokens

In [63]:
def create_frequency_matrix(sentences):
    frequency_matrix = {}
    sw = set(stopwords.words('english'))
    ps = PorterStemmer()
    
    for sent in sentences: 
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in sw:
                continue
            if word in freq_table:
                freq_table[word] = freq_table[word] + 1
            else:
                freq_table[word] = 1
        frequency_matrix[sent[:15]] = freq_table
    return frequency_matrix

### Manual Term-Frequency Computation

In [64]:
def create_tf_matrix(freq_matrix):
    tf_matrix = {}
    for sent, f_table in freq_matrix.items():
        tf_table = {}
        count_words_in_sent = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sent
        tf_matrix[sent] = tf_table
    return tf_matrix

### Creating a table for document per words

In [65]:
def create_document_per_words(freq_matrix):
    word_per_doc_table = {}
    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] = word_per_doc_table[word] + 1
            else:
                word_per_doc_table[word] = 1
    return word_per_doc_table

### Manual IDF Computation

In [66]:
import math
def create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}
    for sent, f_table in freq_matrix.items():
        idf_table = {}
        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))
        idf_matrix[sent] = idf_table
    return idf_matrix

### TF-IDF Computation

In [67]:
def create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}
    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
        tf_idf_table = {}
        for (word1, value1), (word2, value2) in zip(f_table1.items(), f_table2.items()):
            tf_idf_table[word1] = float(value1 * value2)
        tf_idf_matrix[sent1] = tf_idf_table
    return tf_idf_matrix

### Weighing the words in a sentence - Scoring

In [79]:
def score_sentences(tf_idf_matrix) -> dict: 
    sentence_val = {}
    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0
        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence = total_score_per_sentence + score
        if count_words_in_sentence > 0:
            sentence_val[sent] = total_score_per_sentence / count_words_in_sentence
    return sentence_val

### Average sentence score - Threshold

In [93]:
def find_average_score(sentence_val) -> int:
    sum_values = 0
    for entry in sentence_val:
        sum_values = sum_values + sentence_val[entry]
    average = sum_values / len(sentence_val)
    return average

 ### Generating Summary

In [83]:
def generate_summary(sentences, sentence_val, threshold):
    sentence_count = 0
    summary = ""
    for sentence in sentences:
        if sentence[:15] in sentence_val and sentence_val[sentence[:15]] >= (threshold):
            summary = summary + " " + sentence
            sentence_count = sentence_count + 1
    return summary

### Call everything and get the summarization done

In [84]:
freq_matrix = create_frequency_matrix(sentences)

In [85]:
tf_matrix = create_tf_matrix(freq_matrix)

In [86]:
count_doc_per_words = create_document_per_words(freq_matrix)

In [87]:
idf_matrix = create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

In [88]:
tf_idf_matrix = create_tf_idf_matrix(tf_matrix, idf_matrix)

In [100]:
sentence_scores = score_sentences(tf_idf_matrix)

In [101]:
threshold = find_average_score(sentence_scores)

In [99]:
summary = generate_summary(sentences, sentence_scores, threshold)
summary

" story takes place universe has connection any DC come before see classic Warner movie Gritty , intimate oddly , characters live real world stakes personal Although never mentioned , story takes place past call 1981 crime rate Gotham record garbage strike has crippled city past six weeks divide `` '' `` have- '' palpable Dreams beyond , slipping delusions : HEAR sound man totally cracking : 1 INT DEPT HEALTH , - MORNING 1 CLOSE ( 30 ) , tears eyes hard control greasy , hair matted wearing , red hooded zip-up sweatshirt , threadbare gray scarf , thin years , loosely around neck NOTICE SCARS cut corners mouth forming across overworked ( 50 ) , African American cramped run-down cramped run-down building Stacks folders piled front desk , waiting fit end , before subsides takes deep breath , see -- , crazier ? Despite , real pain eyes broken like has slept certainly tense People upset , 're struggling work garbage strike like going forever tough ( ) 'bout job ? Still enjoying ? , mean , , 