In [11]:
#test

In [12]:
import re
import random
from collections import defaultdict

# Κείμενα
text1 = """Today is our dragon boat festival, in our Chinese culture, to celebrate it with all safe and great in
our lives. Hope you too, to enjoy it as my deepest wishes. Thank your message to show our words to the doctor, as his next contract checking, to all of us.
I got this message to see the approved message. In fact, I have received the message from the professor, to show me, this, a couple of days ago. I am very appreciated the full support of the professor, for our Springer proceedings publication"""

text2 = """During our final discuss, I told him about the new submission — the one we were waiting since
last autumn, but the updates was confusing as it not included the full feedback from reviewer or
maybe editor? Anyway, I believe the team, although bit delay and less communication at recent days, they really tried best for paper and cooperation. We should be grateful, I mean all of us, for the acceptance and efforts until the Springer link came finally last week, I think. Also, kindly remind me please, if the doctor still plan for the acknowledgments section edit before he sending again. Because I didn’t see that part final yet, or maybe I missed, I apologize if so. Overall, let us make sure all are safe and celebrate the outcome with strong coffee and future targets"""

# Συνένωση
all_text = text1 + " " + text2

# Διάσπαση σε λέξεις
words = re.findall(r"\w+", all_text)

# Δημιουργία Markov chain
markov_chain = defaultdict(list)
for i in range(len(words) - 1):
    markov_chain[words[i].lower()].append(words[i + 1])

# Συνάρτηση δημιουργίας νέας πρότασης
def generate_sentence(start_word, length=12):
    word = start_word.lower()
    sentence = [start_word.capitalize()]
    for _ in range(length - 1):
        if word not in markov_chain:
            break
        next_word = random.choice(markov_chain[word])
        sentence.append(next_word)
        word = next_word.lower()
    return " ".join(sentence) + "."

# Δημιουργία 2 προτάσεων
sentence1 = generate_sentence("today", 15)
sentence2 = generate_sentence("during", 15)

print("Ανακατασκευή Πρότασης 1:", sentence1)
print("Ανακατασκευή Πρότασης 2:", sentence2)


Ανακατασκευή Πρότασης 1: Today is our lives Hope you too to show our lives Hope you too to.
Ανακατασκευή Πρότασης 2: During our Springer proceedings publication During our Springer proceedings publication During our Springer proceedings publication.


In [17]:
#!/usr/bin/env python3
# file: reconstruct_pipelines.py
# Path at top as requested.

from typing import List, Tuple
import re
import markovify
import spacy
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
import numpy as np

# --- Input texts (the originals) ---
text1 = ("Today is our dragon boat festival, in our Chinese culture, to celebrate it with all safe and great in "
         "our lives. Hope you too, to enjoy it as my deepest wishes. Thank your message to show our words to the doctor, as his next contract checking, to all of us. "
         "I got this message to see the approved message. In fact, I have received the message from the professor, to show me, this, a couple of days ago. "
         "I am very appreciated the full support of the professor, for our Springer proceedings publication")

text2 = ("During our final discuss, I told him about the new submission — the one we were waiting since "
         "last autumn, but the updates was confusing as it not included the full feedback from reviewer or maybe editor? "
         "Anyway, I believe the team, although bit delay and less communication at recent days, they really tried best for paper and cooperation. "
         "We should be grateful, I mean all of us, for the acceptance and efforts until the Springer link came finally last week, I think. "
         "Also, kindly remind me please, if the doctor still plan for the acknowledgments section edit before he sending again. "
         "Because I didn’t see that part final yet, or maybe I missed, I apologize if so. Overall, let us make sure all are safe and celebrate the outcome with strong coffee and future targets")

CORPUS = text1 + "\n" + text2

# --- Utility functions ---

def normalize_text(s: str) -> str:
    s = s.replace('\n', ' ').strip()
    s = re.sub(r"\s+", ' ', s)
    return s

# --- Pipeline A: Markovify ---
# Pros: simple, fast. Cons: poor semantics, can hallucinate.

def pipeline_markov(corpus: str, sentences: int = 5, state_size: int = 2) -> str:
    model = markovify.Text(corpus, state_size=state_size)
    out = []
    for _ in range(sentences):
        s = model.make_sentence(tries=100)
        if s:
            out.append(s)
    return ' '.join(out)

# --- Pipeline B: spaCy rule-based rewriting ---
# Heuristic approach: merge short sentences, remove repetitions, fix simple grammar issues.

nlp = spacy.load('en_core_web_sm')

def clean_sentence(sent: str) -> str:
    # basic fixes: spacing, remove duplicate words sequences
    sent = sent.strip()
    sent = re.sub(r"\s+", ' ', sent)
    # remove repeated short phrases like 'the message the message'
    sent = re.sub(r"\b(\w+)(?:\s+\1\b)+", r"\1", sent, flags=re.I)
    return sent


def pipeline_spacy_rule(corpus: str) -> str:
    doc = nlp(corpus)
    sentences = list(doc.sents)
    combined = []
    buffer = []
    for sent in sentences:
        s_text = clean_sentence(sent.text)
        # if sentence short, buffer it to merge with next for coherence
        if len(s_text.split()) < 8:
            buffer.append(s_text)
        else:
            if buffer:
                buffer.append(s_text)
                combined.append(' '.join(buffer))
                buffer = []
            else:
                combined.append(s_text)
    if buffer:
        combined.append(' '.join(buffer))

    # Post-process: ensure punctuation and capitalization
    out_sents = []
    for s in combined:
        s = s.strip()
        if not s:
            continue
        if s[-1] not in '.!?':
            s = s + '.'
        s = s[0].upper() + s[1:]
        out_sents.append(s)
    return ' '.join(out_sents)

# --- Pipeline C: Transformer (Hugging Face seq2seq) ---
# Use a summarization/paraphrase model via pipeline('text2text-generation' or 'summarization')

# NOTE: model downloads occur on first run. Choose a medium-sized model for balance.

def pipeline_transformer_paraphrase(corpus: str, model_name: str = 'facebook/bart-large-cnn') -> str:
    summarizer = pipeline('summarization', model=model_name)
    # split into chunks if corpus long
    max_chunk = 800
    chunks = []
    text = normalize_text(corpus)
    while text:
        chunk = text[:max_chunk]
        # try to cut at sentence end
        last = chunk.rfind('.')
        if last != -1 and last > int(max_chunk * 0.6):
            chunk = chunk[:last+1]
        chunks.append(chunk)
        text = text[len(chunk):].lstrip()
    results = []
    for c in chunks:
        res = summarizer(c, max_length=180, min_length=40, do_sample=False)
        results.append(res[0]['summary_text'])
    return ' '.join(results)

# --- Evaluation metrics ---

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)


def evaluate_reconstruction(original: str, reconstructed: str) -> dict:
    # ROUGE: compare the two texts
    rouge_scores = scorer.score(original, reconstructed)
    # semantic similarity: compute embedding cosine similarity
    emb_orig = sbert_model.encode([original], convert_to_numpy=True)
    emb_recon = sbert_model.encode([reconstructed], convert_to_numpy=True)
    cos = float(np.dot(emb_orig, emb_recon.T) / (np.linalg.norm(emb_orig) * np.linalg.norm(emb_recon)))
    return {
        'rouge1_f': rouge_scores['rouge1'].fmeasure,
        'rouge2_f': rouge_scores['rouge2'].fmeasure,
        'rougeL_f': rouge_scores['rougeL'].fmeasure,
        'semantic_cosine': cos
    }

# --- Run all pipelines and evaluate ---

if __name__ == '__main__':
    corpus = normalize_text(CORPUS)

    print('Running Pipeline A: Markovify...')
    recon_a = pipeline_markov(corpus, sentences=8, state_size=2)

    print('Running Pipeline B: spaCy rule-based...')
    recon_b = pipeline_spacy_rule(corpus)

    print('Running Pipeline C: Transformer summarization/paraphrase...')
    recon_c = pipeline_transformer_paraphrase(corpus, model_name='facebook/bart-large-cnn')

    # Evaluate against concatenated original (we treat the reconstruction goal as preserving overall meaning)
    original_concat = normalize_text(text1) + ' ' + normalize_text(text2)

    print('\nEvaluating...')
    eval_a = evaluate_reconstruction(original_concat, recon_a)
    eval_b = evaluate_reconstruction(original_concat, recon_b)
    eval_c = evaluate_reconstruction(original_concat, recon_c)

    # Present concise comparison
    from pprint import pprint
    print('\n--- Reconstructions ---\n')
    print('--- Markovify (A) ---\n')
    print(recon_a[:1000])
    print('\n--- spaCy Rule-based (B) ---\n')
    print(recon_b[:1000])
    print('\n--- Transformer (C) ---\n')
    print(recon_c[:1000])

    print('\n--- Metrics (higher better) ---\n')
    print('Markovify A:')
    pprint(eval_a)
    print('\nspaCy Rule-based B:')
    pprint(eval_b)
    print('\nTransformer C:')
    pprint(eval_c)

    # Short recommendations based on metrics
    print('\nRecommendations:')
    # typically transformer will have best semantic similarity and ROUGE, spaCy will give most grammatical control,
    # Markovify is fast but lowest semantic accuracy.
    print('- Transformer (C) usually yields the best semantic fidelity and coherence for this task.')
    print('- spaCy (B) is valuable for rule-based, controllable rewrites without large models.')
    print('- Markov (A) is useful for creative variants but not for faithful reconstruction.')


Running Pipeline A: Markovify...
Running Pipeline B: spaCy rule-based...
Running Pipeline C: Transformer summarization/paraphrase...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  20%|##        | 325M/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Your max_length is set to 180, but your input_length is only 170. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=85)
Your max_length is set to 180, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)



Evaluating...

--- Reconstructions ---

--- Markovify (A) ---

Overall, let us make sure all are safe and great in our Chinese culture, to celebrate it with all safe and celebrate the outcome with strong coffee and future targets I am very appreciated the full feedback from reviewer or maybe I missed, I apologize if so. Also, kindly remind me please, if the doctor still plan for the acceptance and efforts until the Springer link came finally last week, I think. Also, kindly remind me please, if the doctor still plan for the acceptance and efforts until the Springer link came finally last week, I think. Also, kindly remind me please, if the doctor still plan for the acceptance and efforts until the Springer link came finally last week, I think. Thank your message to see the approved message. Also, kindly remind me please, if the doctor still plan for the acceptance and efforts until the Springer link came finally last week, I think. I got this message to show me, this, a couple of days

  cos = float(np.dot(emb_orig, emb_recon.T) / (np.linalg.norm(emb_orig) * np.linalg.norm(emb_recon)))


In [16]:
import re
import spacy
from transformers import pipeline
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Κείμενα
text1 = """Today is our dragon boat festival, in our Chinese culture, to celebrate it with all safe and great in our lives. Hope you too, to enjoy it as my deepest wishes. Thank your message to show our words to the doctor, as his next contract checking, to all of us. I got this message to see the approved message. In fact, I have received the message from the professor, to show me, this, a couple of days ago. I am very appreciated the full support of the professor, for our Springer proceedings publication."""

text2 = """During our final discuss, I told him about the new submission — the one we were waiting since last autumn, but the updates was confusing as it not included the full feedback from reviewer or maybe editor? Anyway, I believe the team, although bit delay and less communication at recent days, they really tried best for paper and cooperation. We should be grateful, I mean all of us, for the acceptance and efforts until the Springer link came finally last week, I think. Also, kindly remind me please, if the doctor still plan for the acknowledgments section edit before he sending again. Because I didn’t see that part final yet, or maybe I missed, I apologize if so. Overall, let us make sure all are safe and celebrate the outcome with strong coffee and future targets."""

all_texts = [text1, text2]

# --- Pipeline 1: spaCy cleanup ---
nlp = spacy.load("en_core_web_sm")
def spacy_reconstruct(text):
    doc = nlp(text)
    sentences = [sent.text.strip().capitalize() for sent in doc.sents]
    return " ".join(sentences)

# --- Pipeline 2: Sumy TextRank summarizer ---
def textrank_reconstruct(text):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, 5)
    return " ".join(str(sentence) for sentence in summary)

# --- Pipeline 3: Transformers paraphrase/summarization ---
summarizer_hf = pipeline("summarization", model="facebook/bart-large-cnn")
def hf_reconstruct(text):
    result = summarizer_hf(text, max_length=120, min_length=40, do_sample=False)
    return result[0]['summary_text']

# Ανακατασκευή για κάθε κείμενο
reconstructed_texts = []
for i, txt in enumerate(all_texts, start=1):
    print(f"===== Κείμενο {i} =====")
    print("--- spaCy ---")
    spa_out = spacy_reconstruct(txt)
    print(spa_out)
    print("\n--- Sumy TextRank ---")
    sumy_out = textrank_reconstruct(txt)
    print(sumy_out)
    print("\n--- Transformers ---")
    hf_out = hf_reconstruct(txt)
    print(hf_out)
    print("\n")
    reconstructed_texts.append({
        "spacy": spa_out,
        "sumy": sumy_out,
        "transformers": hf_out
    })

# --- Εκτύπωση τελικού συγκεντρωτικού κειμένου ---
print("===== Τελική Ανακατασκευή Κειμένων =====")
for i, recon in enumerate(reconstructed_texts, start=1):
    print(f"Κείμενο {i} (spaCy): {recon['spacy']}\n")
    print(f"Κείμενο {i} (Sumy TextRank): {recon['sumy']}\n")
    print(f"Κείμενο {i} (Transformers): {recon['transformers']}\n")

# --- Σύγκριση ---
print("===== Συγκριτική Αξιολόγηση =====")
print("spaCy: Διατηρεί σχεδόν όλο το κείμενο, με καλύτερη στίξη και καθαρότητα.")
print("Sumy TextRank: Περιορίζει το κείμενο σε λίγες βασικές προτάσεις, αλλά μπορεί να χάσει λεπτομέρειες.")
print("Transformers: Δίνει αναδιατυπωμένη, πιο συνοπτική εκδοχή με καλύτερη συνοχή και ροή.")


ModuleNotFoundError: No module named 'sumy'