NLTK

- Stands for Natural Language Toolkit
- Programs for symbolic and statistical 'Natural Language Processing (NLP)'
- Lexical Analysis: Word and text tokenizer
- n-gram and collocations
- Part-of-speech tagger
- Tree model and Text chunker for capturing 
- Named entity recognition

Measure similarity between two sentences using cosine similarity

- Measure of similarity between two non-zero vectors of an inner product space, defined as the cosine of the angle between them
- Similarity = (A.B) / (||A||.||B||) where A and B are vectors
- `nltk.cluster.util` provides a `cosine_distance()` function; the similarity is `1 - cosine_distance(A, B)`

In [None]:
import re

import networkx as nx
import nltk
import numpy as np
from langdetect import detect
from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords

nltk.download('stopwords')

In [22]:
# Load Bangla stopwords from a file
def load_bangla_stopwords(file_path):
    """Return the lines of a UTF-8 stop-word file with surrounding whitespace removed.

    Args:
        file_path: Path of a text file holding one stop word per line.

    Returns:
        A list with one stripped string per line of the file, in file order.
        Blank lines are kept as empty strings.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        # Iterating the handle yields the same lines readlines() would.
        return list(map(str.strip, handle))

In [23]:
# Patterns matching every character that is NOT part of a word.
# The Bangla pattern keeps the whole Bengali Unicode block (U+0980-U+09FF):
# a consonant-only range would strip vowel signs, hasanta and nukta and
# mangle the words. Bengali digits live in that block and are therefore kept.
_NON_ENGLISH_WORD_CHARS = re.compile(r'[^a-zA-Z]')
_NON_BANGLA_WORD_CHARS = re.compile(r'[^\u0980-\u09FF]')


# Function to read article for English or Bangla text
def read_article(file_name, language):
    """Read a text file and split it into tokenized sentences.

    English text (``language == 'en'``) is split on ``'.'``; any other
    language value is treated as Bangla and split on the danda (``'।'``).
    Within each sentence every non-word character is replaced by a space
    and the result is split on whitespace.

    Args:
        file_name: Path of the UTF-8 encoded article.
        language: Language code; ``'en'`` selects the English rules,
            anything else selects the Bangla rules.

    Returns:
        A list of sentences, each a non-empty list of word strings, in
        document order. Sentences with no words left after cleaning (for
        example the empty piece after the final delimiter) are omitted.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        file_data = file.read()

    if language == 'en':  # English text
        delimiter, non_word_chars = '.', _NON_ENGLISH_WORD_CHARS
    else:  # Bangla text
        delimiter, non_word_chars = '।', _NON_BANGLA_WORD_CHARS

    sentences = []
    for raw_sentence in file_data.split(delimiter):
        # re.sub is required here: str.replace would look for the pattern
        # text literally and never match anything.
        words = non_word_chars.sub(' ', raw_sentence).split()
        # Skip empty sentences instead of blindly popping the last one, so a
        # final sentence without a trailing delimiter is not lost.
        if words:
            sentences.append(words)
    return sentences

In [24]:
# Calculate sentence similarity for English or Bangla
def sentence_similarity(sent1, sent2, stop_words=None):
    """Return the cosine similarity of two tokenized sentences.

    Both sentences are lower-cased and turned into word-count vectors over
    their combined vocabulary; words found in ``stop_words`` are not counted.

    Args:
        sent1: First sentence as a list of word strings.
        sent2: Second sentence as a list of word strings.
        stop_words: Optional iterable of words to ignore. ``None`` means no
            words are ignored.

    Returns:
        ``1 - cosine_distance(vector1, vector2)``, or ``0.0`` when either
        sentence contributes no countable words.
    """
    # A set gives constant-time membership tests inside the loops below.
    ignored = set(stop_words) if stop_words is not None else set()

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    # Fix each distinct word's vector position once instead of searching a
    # list with .index() for every token.
    position = {word: i for i, word in enumerate(set(sent1 + sent2))}

    vector1 = [0] * len(position)
    vector2 = [0] * len(position)

    for w in sent1:
        if w not in ignored:
            vector1[position[w]] += 1

    for w in sent2:
        if w not in ignored:
            vector2[position[w]] += 1

    # An all-zero vector has zero norm, so the cosine is undefined; report
    # "no similarity" rather than passing it to cosine_distance.
    if not any(vector1) or not any(vector2):
        return 0.0

    return 1 - cosine_distance(vector1, vector2)

In [25]:
# Generate similarity matrix for sentences
def gen_sim_matrix(sentences, stop_words):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: List of tokenized sentences (lists of words).
        stop_words: Words passed through to ``sentence_similarity`` to ignore.

    Returns:
        A square NumPy array with one row and column per sentence, where
        entry ``[i][j]`` is the similarity of sentence ``i`` to sentence
        ``j``. The diagonal is left at zero.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(left, right, stop_words)

    return matrix

In [30]:
# Main function to generate summary, supporting both Bangla and English
def generate_summary(file_name, top_n=5):
    """Print an extractive summary of a text file.

    The language is detected from the file contents. Sentences are ranked by
    running PageRank over a graph built from their pairwise similarities, and
    the highest-scoring ones are printed.

    Args:
        file_name: Path of the UTF-8 encoded article to summarize.
        top_n: Maximum number of sentences in the summary. If the article has
            fewer sentences, all of them are used.

    Returns:
        None. The detected language and the summary are printed.
    """
    # Detect the language of the file
    with open(file_name, 'r', encoding='utf-8') as file:
        language = detect(file.read())

    print(f"Detected language: {language}")

    # Any language other than English is handled with the Bangla resources.
    if language == 'en':
        stop_words = stopwords.words('english')
    else:
        stop_words = load_bangla_stopwords('./sample/bangla_stopwords.txt')

    # Read the article
    sentences = read_article(file_name, language)

    # Generate the sentence similarity matrix
    sentence_similarity_matrix = gen_sim_matrix(sentences, stop_words)

    # Build sentence similarity graph
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)

    # Rank sentences using PageRank
    scores = nx.pagerank(sentence_similarity_graph)

    # Sort sentences by score in descending order
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Extract top N sentences. Slicing clamps to the number of sentences
    # available, so a short article no longer raises IndexError; max() keeps
    # a negative top_n meaning "no sentences".
    summarize_text = [
        ' '.join(sentence)
        for _, sentence in ranked_sentences[:max(top_n, 0)]
    ]

    # Print summary
    if language == 'en':
        print(f'Summary:\n{" ".join(summarize_text)}')
    else:
        print(f'সারাংশ:\n{" ".join(summarize_text)}')



In [None]:
# Example usage (English): print the 3 top-ranked sentences of the sample English article
generate_summary('./sample/english_text.txt', 3)

In [None]:
# Example usage (Bangla): print the 3 top-ranked sentences of the sample Bangla article

generate_summary('./sample/bangla_text.txt', 3)