# Extractive Summarization 1

In [None]:
!pip install bert_score
!pip install nltk
!pip install rouge-score
!pip install gradio
!pip install networkx
!pip install scikit-learn



In [None]:
import pandas as pd
import numpy as np
import re
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from bert_score import score as bert_score
import nltk
import torch
from nltk.tokenize import sent_tokenize
import networkx as nx
import gradio as gr

In [None]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
file_path = 'my_data.csv'
news_data = pd.read_csv(file_path)

# Clean and Prepare Data
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', str(text))
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

news_data = news_data.dropna()
news_data['Cleaned_Text'] = news_data['Text'].apply(clean_text)
news_data['Cleaned_Summary'] = news_data['Summary'].apply(clean_text)

# Train FastText Embeddings
def train_embeddings(texts, vector_size=100):
    tokenized_texts = [re.findall(r'\b\w+\b', text.lower()) for text in texts]
    model = FastText(vector_size=vector_size, window=3, min_count=1)
    model.build_vocab(corpus_iterable=tokenized_texts)
    model.train(corpus_iterable=tokenized_texts, total_examples=len(tokenized_texts), epochs=10)
    return model

fasttext_model = train_embeddings(news_data['Cleaned_Text'])

# Extract Keywords Using TF-IDF
def extract_keywords(text, num_keywords=5):
    vectorizer = TfidfVectorizer(stop_words=None)
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_array = np.array(vectorizer.get_feature_names_out())
    tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
    return feature_array[tfidf_sorting][:num_keywords]

# Advanced Summarization
def advanced_summarize(text, num_sentences=3, model=None):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    tokenized_sentences = [re.findall(r'\b\w+\b', sentence.lower()) for sentence in sentences]

    # Compute Sentence Embeddings
    sentence_embeddings = []
    for tokens in tokenized_sentences:
        word_vectors = [model.wv[word] for word in tokens if word in model.wv]
        sentence_embeddings.append(np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model.vector_size))

    # Document Embedding
    doc_embedding = np.mean(sentence_embeddings, axis=0)

    # Semantic Similarity
    similarities = cosine_similarity([doc_embedding], sentence_embeddings)[0]

    # Sentence Position Scores
    position_scores = np.linspace(1, 0, len(sentences))

    # Keyword Scores
    keywords = extract_keywords(text)
    keyword_scores = [
        sum(1 for word in re.findall(r'\b\w+\b', sentence.lower()) if word in keywords)
        for sentence in sentences
    ]

    # Combined Scoring
    combined_scores = (
        0.4 * similarities + 0.3 * position_scores +
        0.1 * np.array(keyword_scores)
    )

    # Rank Sentences
    ranked_sentences = [sentences[i] for i in np.argsort(combined_scores)[::-1]]
    summary = " ".join(ranked_sentences[:num_sentences])
    return summary

# Apply Summarization
news_data['Advanced_Summary'] = news_data['Cleaned_Text'].apply(
    lambda x: advanced_summarize(x, num_sentences=3, model=fasttext_model)
)

In [None]:
# Evaluation Metrics
def calculate_rouge_scores(reference, generated):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return {
        'ROUGE-1': scores['rouge1'].fmeasure,
        'ROUGE-2': scores['rouge2'].fmeasure,
        'ROUGE-L': scores['rougeL'].fmeasure
    }

def calculate_bleu(reference, generated):
    reference_tokens = reference.split()
    generated_tokens = generated.split()
    return sentence_bleu([reference_tokens], generated_tokens)

def calculate_meteor(reference, generated):
    reference_tokens = nltk.word_tokenize(reference)
    generated_tokens = nltk.word_tokenize(generated)
    return meteor_score([reference_tokens], generated_tokens)

def calculate_bertscore(references, candidates):
    precision, recall, f1 = bert_score(candidates, references, lang="ky", rescale_with_baseline=True)
    return np.mean(f1.cpu().detach().numpy())

# Evaluate Summaries
def evaluate_summaries(data):
    rouge_results = []
    bleu_scores = []
    meteor_scores = []
    references = []
    candidates = []

    for _, row in data.iterrows():
        reference = row['Cleaned_Summary']
        generated = row['Advanced_Summary']

        rouge = calculate_rouge_scores(reference, generated)
        rouge_results.append(rouge)

        bleu_scores.append(calculate_bleu(reference, generated))
        meteor_scores.append(calculate_meteor(reference, generated))
        references.append(reference)
        candidates.append(generated)

    bertscore_avg = calculate_bertscore(references, candidates)

    # Aggregate ROUGE
    avg_rouge = {
        'ROUGE-1': np.mean([r['ROUGE-1'] for r in rouge_results]),
        'ROUGE-2': np.mean([r['ROUGE-2'] for r in rouge_results]),
        'ROUGE-L': np.mean([r['ROUGE-L'] for r in rouge_results])
    }

    return {
        'ROUGE': avg_rouge,
        'BLEU': np.mean(bleu_scores),
        'METEOR': np.mean(meteor_scores),
        'BERTScore': bertscore_avg
    }

# Run Evaluation
evaluation_results = evaluate_summaries(news_data)

# Print Results
print("Evaluation Results:")
for metric, score in evaluation_results.items():
    if isinstance(score, dict):
        for sub_metric, sub_score in score.items():
            print(f"{metric}-{sub_metric}: {sub_score:.4f}")
    else:
        print(f"{metric}: {score:.4f}")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that

Evaluation Results:
ROUGE-ROUGE-1: 0.4577
ROUGE-ROUGE-2: 0.2000
ROUGE-ROUGE-L: 0.4406
BLEU: 0.0891
METEOR: 0.4464
BERTScore: 0.7413




In [None]:
def summarize_unseen_text(input_text):
    try:
        summary = advanced_summarize(input_text, num_sentences=3, model=fasttext_model)
        return summary
    except Exception as e:
        return f"Error: {str(e)}"

In [None]:
text1 = """

Украинанын президенти Владимир Зеленский Американын шайланган президенти Дональд Трамптын "күтүүсүз, болжолдоого мүмкүн болбогон мүнөзү" Орусия менен согушка чекит коюуга жардам бериши ыктымал экенин билдирди.

Ал мындай пикирин 2-январда обого чыккан маегинде айтты.

Зеленскийдин пикиринде, Орусиянын президенти Владимир Путин Трамптан коркот.

"Трамп абдан күчтүү, мүнөзү күтүүсүз, болжолдоого мүмкүн эмес. Мен анын дал ушундай мүнөзү Орусияга карата колдонулушун абдан каалайт элем",- деди украин президенти.

Зеленский кошумчалагандай, Трамп инаугурациядан кийин биринчилерден болуп аны менен жолугушууга убада берген.

20-январда ант берип, кызматка кирише турган Дональд Трамп бийликке расмий келери менен 24 сааттын ичинде жаңжалды токтоторун билдирген.

Украина жана башка мамлекеттер алгач бул билдирүүгө сын көз карашта карашкан. Киев ошондой эле Трамп даярдаган макулдашууда Украинанын аймактарын Орусияга өткөрүп берүү камтылган болушу мүмкүн деп кооптонуп турат.
"""

print(summarize_unseen_text(text1))



Украинанын президенти Владимир Зеленский Американын шайланган президенти Дональд Трамптын "күтүүсүз, болжолдоого мүмкүн болбогон мүнөзү" Орусия менен согушка чекит коюуга жардам бериши ыктымал экенин билдирди. "Трамп абдан күчтүү, мүнөзү күтүүсүз, болжолдоого мүмкүн эмес. Мен анын дал ушундай мүнөзү Орусияга карата колдонулушун абдан каалайт элем",- деди украин президенти.


In [None]:
interface = gr.Interface(
    fn=summarize_unseen_text,
    inputs=gr.Textbox(
        lines=10,
        placeholder="Enter your Kyrgyz text here...",
        label="Input Text"
    ),
    outputs=gr.Textbox(
        lines=10,
        label="Generated Summary",
        interactive=False
    ),
    title="Kyrgyz News Summarizer",
    description="Input a Kyrgyz text to generate a concise summary."
)

interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b851519a6632473c32.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)





---

# Extractive Summarization 2

In [None]:
def kyrgyz_tokenize(text):
    return sent_tokenize(text)

# Extractive summarization using TextRank
def extractive_summary(text, num_sentences=3):
    # Tokenize sentences
    sentences = kyrgyz_tokenize(text)
    if len(sentences) <= num_sentences:
        return ' '.join(sentences)

    # TF-IDF Vectorizer for sentence embeddings
    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(sentences).toarray()

    # Compute similarity matrix
    sim_matrix = cosine_similarity(sentence_vectors)
    np.fill_diagonal(sim_matrix, 0)

    # Apply TextRank
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # Rank sentences
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    summary = ' '.join([s for _, s in ranked_sentences[:num_sentences]])
    return summary


In [None]:
# Test on a sample
sample_text = news_data['Text'].iloc[9]

print("Original Text:")
print(sample_text)

print("\nExtractive Summary:")
print(extractive_summary(sample_text, num_sentences=2))

Original Text:
Бишкек шаарында 2-декабрдан тартып коомдук транспорт, автобустун айдоочуларына жол кире үчүн накталай акча төлөө планы жокко чыгарылат. Бул тууралуу калаа мэриясынан кабарлашты. Маалыматка караганда, ушул күндөн тартып автобустардагы жүргүнчүлөрдү эсептөө системасы аркылуу төлөмдөрдү көзөмөлдөө күчөтүлөт.“Эгерде иш жүзүндөгү төлөм менен салыштырганда каражаттын жетишсиздиги аныкталса, айдоочуларга мыйзам чегинде чара көрүлөт”, - деп белгиленген маалыматта.Буга чейин Бишкек шаардык мэриясы коомдук транспортто накталай төлөгөндөргө жол кирени 40 сомго чейин көтөрүү сунушун берген. Бул шаар тургундарынын талкуусун жараткан.Президент Садыр Жапаров Бишкек мэриясынын сунушуна байланыштуу пикирин билдирип, накталай төлөмдөр көбүнесе айдоочунун чөнтөгүнө кетип жатканын Фейсбук баракчасына жазган. Ошону менен катар акчаны алууга алаксыган айдоочу жол эрежесин бузуп, кырсыктар катталып жатканын белгилеген. Президент шаар тургундарын жол кирени 17 сомдон төлөө үчүн карта сатып алуу

In [None]:
# Evaluation Metrics
def calculate_rouge_scores(reference, generated):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return {
        'ROUGE-1': scores['rouge1'].fmeasure,
        'ROUGE-2': scores['rouge2'].fmeasure,
        'ROUGE-L': scores['rougeL'].fmeasure
    }

def calculate_bleu(reference, generated):
    reference_tokens = reference.split()
    generated_tokens = generated.split()
    return sentence_bleu([reference_tokens], generated_tokens)

def calculate_meteor(reference, generated):
    reference_tokens = nltk.word_tokenize(reference)
    generated_tokens = nltk.word_tokenize(generated)
    return meteor_score([reference_tokens], generated_tokens)

def calculate_bertscore(references, candidates):
    precision, recall, f1 = bert_score(candidates, references, lang="ky ", rescale_with_baseline=True)
    return np.mean(f1.cpu().detach().numpy())

# Evaluate Summaries
def evaluate_summaries(news_data, num_sentences=3):
    rouge_results = []
    bleu_scores = []
    meteor_scores = []
    references = []
    candidates = []

    for _, row in news_data.iterrows():
        reference = row['Summary']
        text = row['Text']
        generated = extractive_summary(text, num_sentences)

        rouge = calculate_rouge_scores(reference, generated)
        rouge_results.append(rouge)

        bleu_scores.append(calculate_bleu(reference, generated))
        meteor_scores.append(calculate_meteor(reference, generated))
        references.append(reference)
        candidates.append(generated)

    bertscore_avg = calculate_bertscore(references, candidates)

    # Aggregate ROUGE
    avg_rouge = {
        'ROUGE-1': np.mean([r['ROUGE-1'] for r in rouge_results]),
        'ROUGE-2': np.mean([r['ROUGE-2'] for r in rouge_results]),
        'ROUGE-L': np.mean([r['ROUGE-L'] for r in rouge_results])
    }

    return {
        'ROUGE': avg_rouge,
        'BLEU': np.mean(bleu_scores),
        'METEOR': np.mean(meteor_scores),
        'BERTScore': bertscore_avg
    }

# Run Evaluation
evaluation_results = evaluate_summaries(news_data)

# Print Results
print("Evaluation Results:")
for metric, score in evaluation_results.items():
    if isinstance(score, dict):
        for sub_metric, sub_score in score.items():
            print(f"{metric}-{sub_metric}: {sub_score:.4f}")
    else:
        print(f"{metric}: {score:.4f}")


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

Evaluation Results:
ROUGE-ROUGE-1: 0.3661
ROUGE-ROUGE-2: 0.1152
ROUGE-ROUGE-L: 0.3118
BLEU: 0.0986
METEOR: 0.3609
BERTScore: 0.7422




In [None]:
def gradio_interface(text, num_sentences):
    summary = extractive_summary(text, num_sentences)
    return summary

interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Input Text (Kyrgyz)", lines=5, placeholder="Enter text for summarization"),
        gr.Number(label="Number of Sentences", value=3, interactive=True)
    ],
    outputs=gr.Textbox(label="Generated Summary"),
    title="Extractive Kyrgyz Text Summarization",
    description="Input a Kyrgyz text to generate a concise summary."
)

interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6c02d280a0203f19d2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# RESULTS

Extractive Summarization 1 Evaluation Results:
- ROUGE-ROUGE-1: 0.4577
- ROUGE-ROUGE-2: 0.2000
- ROUGE-ROUGE-L: 0.4406
- BLEU: 0.0891
- METEOR: 0.4464
- BERTScore: 0.7413


---

Extractive Summarization 2 Evaluation Results:
- ROUGE-ROUGE-1: 0.3661
- ROUGE-ROUGE-2: 0.1152
- ROUGE-ROUGE-L: 0.3118
- BLEU: 0.0986
- METEOR: 0.3609
- BERTScore: 0.7422

