In [30]:
import sys
import nltk.data
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import math
import numpy as np
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree
import re
import operator

In [49]:
# Directory whose entries are listed and read as the input documents.
input_path: str = "sample_preprocessed"
# Directory that receives one summary file per input file (same file name).
output_path: str = "sample_summary"
# Text file that is read line by line into the legal-term dictionary.
dict_path: str = "dictionary.txt"

In [32]:
# token -> number of documents containing it (filled by cal_df)
df_vec: dict = dict()
# file name -> {token: tf-idf weight} (filled by cal_tf_Idf)
doc_w_vec: dict = dict()
# number of entries in the input directory (set by cal_total_doc)
total_docs: int = 0

In [40]:
def cal_df():
    """Rebuild the document-frequency table ``df_vec`` from ``input_path``.

    Every entry of the input directory is read as UTF-8 text, split into
    sentences, lower-cased, tokenised, stripped of English stop words and
    lemmatised.  Each distinct resulting token (the "." token excluded)
    increments its counter in the module-level ``df_vec`` once per document.

    The table is cleared first, so re-running the cell yields the same
    counts instead of adding every document a second time.

    Files that cannot be processed are reported on stdout and skipped.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    wordnet_lemmatizer = WordNetLemmatizer()
    stop = set(stopwords.words('english'))

    files = os.listdir(input_path)

    # Start from a clean table: df_vec is module state and would otherwise
    # accumulate across repeated calls.
    df_vec.clear()

    for f in files:
        try:
            with open(os.path.join(input_path, f), 'r', encoding='utf-8') as fp:
                data = fp.read()

            # Distinct normalised tokens of this document.
            unq_words = set()
            for sentence in tokenizer.tokenize(data):
                lemmas = [
                    wordnet_lemmatizer.lemmatize(w)
                    for w in word_tokenize(sentence.lower())
                    if w not in stop
                ]
                # The joined lemmas are tokenised again on purpose: this keeps
                # the exact token set the table has always been built from.
                unq_words.update(
                    w for w in word_tokenize(" ".join(lemmas)) if w != "."
                )

            # Update DF vector: one count per document, not per occurrence.
            for k in unq_words:
                df_vec[k] = df_vec.get(k, 0) + 1

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Error processing file {f}: {e}")


In [41]:
def cal_total_doc():
    """Store the number of entries found in ``input_path`` in ``total_docs``."""
    global total_docs
    total_docs = len(os.listdir(input_path))

In [42]:
def get_continuous_chunks(text):
    """Return the distinct named entities found in ``text``, in order of appearance.

    The text is tokenised, POS-tagged and chunked with ``ne_chunk``.
    Consecutive entity subtrees are merged into a single space-joined name.

    Args:
        text: The raw sentence (or text) to analyse.

    Returns:
        A list of entity strings without duplicates.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    # The trailing None acts as a sentinel so that an entity sitting at the
    # very end of the text is flushed as well.
    for node in list(chunked) + [None]:
        if isinstance(node, Tree):
            current_chunk.append(" ".join(token for token, pos in node.leaves()))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            # Always reset, even for an already-seen entity; otherwise its
            # words would be glued onto the next entity.
            current_chunk = []

    return continuous_chunk

In [43]:
# Legal-dictionary entries; filled by read_legal_dict().
legal_words: list = list()

In [44]:
def read_legal_dict():
    """Load the legal dictionary at ``dict_path`` into ``legal_words``.

    One entry is expected per line.  Surrounding whitespace, including the
    line terminator, is removed and blank lines are dropped.  A term that
    still carried its trailing newline could never be found inside a
    sentence by a substring test.

    The list is replaced in place, so calling the function again does not
    append the dictionary a second time.  The file is read as UTF-8, like
    the other file reads in this notebook.
    """
    with open(dict_path, "r", encoding="utf-8") as l_f:
        terms = [line.strip() for line in l_f]
    # Slice assignment keeps the identity of the shared module-level list.
    legal_words[:] = [term for term in terms if term]

In [57]:
def _legal_term_ratio(tokens, sentence, legal_terms):
    """Return (#distinct words of matched legal terms) / (#tokens in the sentence).

    A legal term matches when it starts with one of the sentence tokens
    (brackets removed from the token) and occurs as a substring of
    ``sentence``.
    """
    matched_words = set()
    for tok in tokens:
        prefix = re.sub(r'[\[\]\(\)\{\}]', '', tok)
        if not prefix:
            # A bracket-only token would otherwise match every dictionary term.
            continue
        for term in legal_terms:
            # Literal prefix test: tokens such as "*" or "+" are not valid
            # regular expressions and must not be compiled as patterns.
            if term.startswith(prefix) and term in sentence:
                matched_words.update(term.split(" "))
    return float(len(matched_words)) / float(len(tokens) or 1)


def cal_tf_Idf(top_n=40):
    """Score every sentence of every input document and write extractive summaries.

    For each file in ``input_path``:

    1. Sentences are lower-cased and tokenised; term frequencies are counted
       over all tokens except ".".
    2. Each token gets ``tf * log10(N / df)``, where ``df`` comes from
       ``df_vec`` (1 if the token is unknown).  The per-document weights are
       stored in ``doc_w_vec[file_name]``.
    3. A sentence's base score is the mean weight of its tokens.
    4. A bonus ``sd * (0.2*d + 0.3*e + 1.5*g)`` is added, where ``sd`` is the
       standard deviation of the base scores, ``d`` is 1 if the sentence
       contains a digit, ``e`` is named entities per token and ``g`` is the
       legal-term ratio.
    5. The ``top_n`` best sentences are written, in their original order, to
       a file of the same name inside ``output_path``.

    Args:
        top_n: Maximum number of sentences kept per summary (default 40).

    Files that cannot be processed are reported on stdout and skipped.

    NOTE(review): ``df_vec`` is looked up with raw lower-cased tokens, while
    ``cal_df`` stores lemmatised, stop-word-filtered tokens.  Stop words and
    inflected forms therefore fall back to a document frequency of 1.
    Confirm whether that is intended before changing the scoring.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    files = os.listdir(input_path)

    # If cal_total_doc() has not been run, total_docs is still 0 and log10
    # would fail for every file; fall back to the same directory count.
    n_docs = total_docs or len(files)

    # Dictionary entries without surrounding whitespace / line terminators,
    # so the substring test against a sentence can succeed.
    legal_terms = [t for t in (raw.strip() for raw in legal_words) if t]

    # Writing a summary must not fail just because the folder is missing.
    os.makedirs(output_path, exist_ok=True)

    for f in files:
        try:
            with open(os.path.join(input_path, f), 'r', encoding='utf-8') as fp:
                data = fp.read()

            sntncs = tokenizer.tokenize(data)
            norm_sents = [" ".join(word_tokenize(s.lower())) for s in sntncs]
            # Tokenise each normalised sentence once and reuse the result.
            sent_tokens = [word_tokenize(s) for s in norm_sents]

            # Compute TF
            counts = {}
            length = 0
            for tokens in sent_tokens:
                for w in tokens:
                    if w != ".":
                        length += 1
                        counts[w] = counts.get(w, 0) + 1

            tf_idf_doc = {}
            for w, count in counts.items():
                tf = float(count) / float(length)
                tf_idf_doc[w] = tf * math.log10(float(n_docs) / float(df_vec.get(w, 1)))

            doc_w_vec[f] = tf_idf_doc

            # Base score per sentence, kept by position so that repeated
            # sentences do not share (and double-update) one entry.
            scores = []
            for tokens in sent_tokens:
                total = sum(tf_idf_doc.get(w, 0) for w in tokens)
                scores.append(float(total) / float(len(tokens)) if tokens else 0.0)

            # STD and final score
            sd = float(np.std(scores)) if scores else 0.0
            for i, tokens in enumerate(sent_tokens):
                s = norm_sents[i]
                entity_ratio = float(len(get_continuous_chunks(sntncs[i]))) / float(len(tokens) or 1)
                has_digit = 1 if any(ch.isdigit() for ch in s) else 0
                legal_ratio = _legal_term_ratio(tokens, s, legal_terms)
                scores[i] += sd * (0.2 * has_digit + 0.3 * entity_ratio + 1.5 * legal_ratio)

            # Pick the top_n sentences (stable for ties), then restore
            # document order.
            ranked = sorted(range(len(sntncs)), key=scores.__getitem__, reverse=True)
            chosen = sorted(ranked[:max(top_n, 0)])
            summary = " ".join(sntncs[i] for i in chosen)

            # Write Summary
            with open(os.path.join(output_path, f), "w", encoding='utf-8') as w_f:
                w_f.write(summary)

        except Exception as e:
            # Best effort: one bad document must not abort the whole run.
            print(f"Error processing file {f}: {e}")


In [58]:
if __name__ == '__main__':
    sys.stdout.flush()
    # Order matters for the last step: cal_tf_Idf reads the dictionary,
    # the DF table and the document count produced by the first three.
    for step in (read_legal_dict, cal_df, cal_total_doc, cal_tf_Idf):
        step()