In [16]:
import os

# Sample English article used as the input document for the summarizer.
english_text = """Artificial intelligence (AI) is rapidly transforming industries worldwide.
From healthcare to finance and transportation, AI systems are being used to analyze massive datasets,
recognize complex patterns, and assist humans in making critical decisions.
Breakthroughs in deep learning and natural language processing have led to impressive advances
in image recognition, speech synthesis, and text generation.

However, building and deploying AI responsibly requires more than high accuracy.
Modern AI faces three major challenges: explainability, efficiency, and security.
Explainability ensures that models are transparent and that their decisions can be trusted by doctors,
policymakers, and end users. Efficiency focuses on reducing computational costs through model compression,
pruning, and quantization so that AI can run effectively even on limited hardware.
Security involves defending models against adversarial attacks, data poisoning, and distribution shifts
that could degrade performance in real-world environments.

To address these challenges, researchers and engineers are developing new techniques such as interpretable
machine learning, knowledge distillation, and adversarial training.
These methods not only improve technical performance but also enhance the reliability and fairness of AI applications.
Ultimately, responsible AI will combine cutting-edge innovation with practical safeguards,
enabling technology to deliver long-term value for society."""

# Destination of the TXT file inside the "Belgeler" folder.
txt_path = "/content/drive/MyDrive/Belgeler/ai_article.txt"

# Make sure the target folder exists; exist_ok=True turns this into a no-op when it already does.
os.makedirs("/content/drive/MyDrive/Belgeler", exist_ok=True)

# Persist the article as UTF-8, overwriting any previous copy at the same path.
with open(txt_path, mode="w", encoding="utf-8") as article_file:
    article_file.write(english_text)

print("TXT dosyası başarıyla oluşturuldu:", txt_path)


TXT dosyası başarıyla oluşturuldu: /content/drive/MyDrive/Belgeler/ai_article.txt


In [38]:
!pip -q install -U requests==2.32.4
!pip -q install transformers==4.44.2 accelerate==0.34.2 nltk==3.9.1


In [48]:
import re, heapq, nltk, torch
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed

# Fetch the NLTK resources used below (sentence/word tokenizers and the stopword list).
for _nltk_pkg in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_nltk_pkg, quiet=True)

# Run on the GPU when torch can see one, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
set_seed(42)

# >>> MODEL: gpt2-medium (noted as 345M parameters by the original author)
MODEL_NAME = "gpt2-medium"
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tok.pad_token = tok.eos_token

# Load the weights in float16 on GPU; on CPU torch_dtype stays None.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=(torch.float16 if DEVICE == "cuda" else None),
)
model = model.to(DEVICE)
model.eval()

# English stopwords, kept as a set for fast membership checks.
EN_STOP = set(stopwords.words("english"))

# -------- Girdi temizliği --------
def sanitize_email_forums(text: str) -> str:
    """Remove e-mail / mailing-list noise from ``text`` and collapse whitespace.

    Each of the following is replaced by a single space, in this order:
    header lines starting with From/To/Cc/Subject/Date/Reply-To, a
    ``------ next part ------`` separator together with everything after it,
    "An HTML attachment was scrubbed" notices up to the end of their line,
    URLs, angle-bracket tags, tokens containing ``@``, and hexadecimal runs
    of 16 or more characters. Finally all whitespace runs are collapsed to
    one space and the result is stripped.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, single-line text.
    """
    multiline_icase = re.MULTILINE | re.IGNORECASE
    # (pattern, flags) pairs; order matters because later rules see earlier results.
    scrub_rules = (
        (r'^(From|To|Cc|Subject|Date|Reply-To):.*$', multiline_icase),
        # DOTALL lets ".*" run across newlines, i.e. to the end of the text.
        (r'-{6,}\s*next part\s*-{6,}.*$', multiline_icase | re.DOTALL),
        (r'An HTML attachment was scrubbed.*$', multiline_icase),
        (r'https?://\S+|www\.\S+', 0),
        (r'<[^>\n]+>', 0),
        (r'\S+@\S+', 0),
        (r'\b[0-9a-f]{16,}\b', re.IGNORECASE),
    )

    cleaned = text
    for pattern, flags in scrub_rules:
        cleaned = re.sub(pattern, ' ', cleaned, flags=flags)

    return re.sub(r'\s+', ' ', cleaned).strip()

# -------- Ekstraktif özet (önemli cümle seçimi) --------
def clean_text(t: str) -> str:
    """Drop zero-width spaces (U+200B) and squeeze whitespace runs to one space.

    Args:
        t: Text to normalize.

    Returns:
        The normalized text with leading/trailing whitespace removed.
    """
    without_zero_width = re.sub(r'\u200b', '', t)
    return re.sub(r'\s+', ' ', without_zero_width).strip()

def extractive_summary(text: str, max_sents: int = 8) -> str:
    """Pick the ``max_sents`` highest-scoring sentences of ``text``.

    The text is normalized with ``clean_text`` and split into sentences. If it
    has at most ``max_sents`` sentences the normalized text is returned as is.
    Otherwise each sentence is scored by summing the normalized frequencies of
    its tokens (frequencies are counted over alphabetic, non-stopword tokens of
    the whole text and divided by the largest count), and the best sentences
    are returned in their original order, joined by single spaces.

    Args:
        text: Source text.
        max_sents: Maximum number of sentences to keep.

    Returns:
        The extractive summary.
    """
    normalized = clean_text(text)
    sentences = sent_tokenize(normalized, language="english")
    # Short inputs need no selection.
    if not len(sentences) > max_sents:
        return normalized

    # Raw term counts over alphabetic, non-stopword tokens of the lower-cased text.
    counts = {}
    for token in word_tokenize(normalized.lower(), language="english"):
        if token.isalpha() and token not in EN_STOP:
            counts[token] = counts.get(token, 0) + 1

    # Scale so that the most frequent term weighs 1.0.
    peak = max(counts.values()) if counts else 1
    weights = {term: count / peak for term, count in counts.items()}

    # (score, position, sentence) triples; position breaks score ties deterministically.
    ranked = []
    for position, sentence in enumerate(sentences):
        sentence_tokens = word_tokenize(sentence.lower(), language="english")
        score = sum(weights.get(word, 0) for word in sentence_tokens)
        ranked.append((score, position, sentence))

    best = heapq.nlargest(max_sents, ranked)
    # Restore document order before joining.
    best.sort(key=lambda item: item[1])
    return " ".join(sentence for _, _, sentence in best)

# -------- Bullet üretimi (ekstraktif, garanti) --------
def extractive_bullets(text: str, n: int = 4) -> list[str]:
    """Return the first ``n`` sentences of ``text`` as stripped bullet strings.

    Args:
        text: Text to split into sentences.
        n: Maximum number of bullets.

    Returns:
        A list with at most ``n`` sentences, in their original order.
    """
    bullets = []
    for sentence in sent_tokenize(text, language="english")[:n]:
        bullets.append(sentence.strip())
    return bullets

# -------- Çıktı temizliği --------
def strip_links_emails(text: str) -> str:
    """Delete URLs, angle-bracket tags and ``@`` tokens from ``text``.

    Matches are removed outright (not replaced by a space), then spaces/tabs
    directly before a newline are trimmed and the result is stripped.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    # Removal order: URLs first, then tags, then e-mail-like tokens.
    for junk_pattern in (r'https?://\S+|www\.\S+', r'<[^>\n]+>', r'\S+@\S+'):
        text = re.sub(junk_pattern, '', text)
    return re.sub(r'[ \t]+\n', '\n', text).strip()

# -------- GPT-2 Medium ile paragraf özet (strict prompt) --------
def gpt2_paragraph(text: str,
                   max_new_tokens: int = 160) -> str:
    """Generate a one-paragraph summary of ``text`` with the loaded causal LM.

    A strict instruction prompt is wrapped around ``text`` and decoded greedily
    (``do_sample=False``). Only the newly generated tokens are decoded; links,
    tags and e-mail-like tokens are then stripped. If the generated paragraph is
    shorter than 40 characters or contains fewer than two sentences, it is
    replaced by up to four sentences taken from the extractive summary.

    If the prompt plus ``max_new_tokens`` would not fit into the model's
    context window, only the source text is truncated (token-wise, from the
    end) so that the instructions and the trailing "Summary:" cue are kept.

    Args:
        text: Source text to summarize.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The summary paragraph.

    Raises:
        ValueError: If ``max_new_tokens`` is so large that no source text fits
            into the model's context window.
    """
    instructions = (
        "Write ONE short paragraph (3-5 sentences) summarizing the text below. "
        "ONLY use facts from the text. Do not add new topics, names, or predictions. "
        "Keep it factual, concise, and neutral.\n\n"
    )
    text_header = "Text:\n"
    summary_cue = "\n\nSummary:"
    prompt = f"{instructions}{text_header}{text}{summary_cue}"

    inputs = tok(prompt, return_tensors="pt")

    # Prompt tokens + generated tokens must fit into the model's position embeddings;
    # exceeding them makes generate() fail with an indexing error.
    context_limit = getattr(model.config, "max_position_embeddings", None) or tok.model_max_length
    prompt_budget = context_limit - max_new_tokens

    if inputs["input_ids"].shape[1] > prompt_budget:
        # Too long: rebuild the prompt from token ids, shortening only the source text.
        head_ids = tok(instructions + text_header)["input_ids"]
        tail_ids = tok(summary_cue)["input_ids"]
        text_budget = prompt_budget - len(head_ids) - len(tail_ids)
        if text_budget <= 0:
            raise ValueError(
                f"max_new_tokens={max_new_tokens} leaves no room for the source text "
                f"within the model context of {context_limit} tokens"
            )
        text_ids = tok(text)["input_ids"][:text_budget]
        input_ids = torch.tensor([head_ids + text_ids + tail_ids], device=DEVICE)
        inputs = {"input_ids": input_ids, "attention_mask": torch.ones_like(input_ids)}
    else:
        inputs = inputs.to(DEVICE)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,              # deterministic (greedy) decoding
            no_repeat_ngram_size=3,
            repetition_penalty=1.12,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
            return_dict_in_generate=True
        )

    # Keep only the generated continuation (skip the prompt tokens).
    gen_ids = out.sequences[0][inputs["input_ids"].shape[1]:]
    para = tok.decode(gen_ids, skip_special_tokens=True).strip()
    para = strip_links_emails(para)

    # Fallback: if the paragraph is empty, very short, or a single sentence,
    # build it from up to four extractive sentences instead.
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', para) if s.strip()]
    if len(para) < 40 or len(sents) < 2:
        base = extractive_summary(sanitize_email_forums(text), max_sents=6)
        base_sents = sent_tokenize(base, language="english")[:4]
        para = " ".join(base_sents)
        para = strip_links_emails(para)

    return para



# -------- Hybrid summarizer --------
def summarize_hybrid(text: str, bullets: int = 4, max_sents: int = 8) -> str:
    """Build a bullet list plus a paragraph summary for ``text``.

    The input is sanitized, reduced to an extractive summary of at most
    ``max_sents`` sentences, and that summary feeds both the bullet list
    (first ``bullets`` sentences) and the generated paragraph.

    Args:
        text: Raw source text.
        bullets: Maximum number of bullet points.
        max_sents: Maximum number of sentences kept by the extractive step.

    Returns:
        A string of the form ``"BULLETS:\\n- ...\\n\\nPARAGRAPH:\\n..."`` with
        links, tags and e-mail-like tokens removed.
    """
    cleaned = sanitize_email_forums(text)
    base = extractive_summary(cleaned, max_sents=max_sents)
    blts = extractive_bullets(base, n=bullets)
    para = gpt2_paragraph(base)
    # Prefix every bullet individually so that an empty bullet list yields no
    # stray "-" line (the previous f-string always emitted a leading "- ").
    bullets_block = "\n".join(f"- {b}" for b in blts)
    merged = f"BULLETS:\n{bullets_block}\n\nPARAGRAPH:\n{para}"
    return strip_links_emails(merged)


In [49]:
# Path of the document to summarize -- replace with your own path.
txt_path = "/content/drive/MyDrive/Belgeler/ai_article.txt"

# errors="ignore" silently drops bytes that are not valid UTF-8.
with open(txt_path, mode="r", encoding="utf-8", errors="ignore") as source_file:
    text_data = source_file.read()

print("Characters:", len(text_data))


Characters: 1488


In [50]:
# Print a banner, run the hybrid summarizer on the loaded article, then show the result.
print("\n--- HYBRID SUMMARY (gpt2-medium, strict & robust) ---\n")
summary = summarize_hybrid(text=text_data, bullets=4, max_sents=8)
print(summary)



--- HYBRID SUMMARY (gpt2-medium, strict & robust) ---

BULLETS:
- From healthcare to finance and transportation, AI systems are being used to analyze massive datasets, recognize complex patterns, and assist humans in making critical decisions.
- Breakthroughs in deep learning and natural language processing have led to impressive advances in image recognition, speech synthesis, and text generation.
- Modern AI faces three major challenges: explainability, efficiency, and security.
- Efficiency focuses on reducing computational costs through model compression, pruning, and quantization so that AI can run effectively even on limited hardware.

PARAGRAPH:
From healthcare to finance and transportation, AI systems are being used to analyze massive datasets, recognize complex patterns, and assist humans in making critical decisions. Modern AI faces three major challenges: explainability, efficiency, and security. Efficiency focuses on reducing computational costs through model compression, 

In [51]:
import os

# File the summary will be written to.
out_path = "/content/drive/MyDrive/Belgeler/summary.txt"

# Create the Belgeler folder if needed (no error when it already exists).
os.makedirs("/content/drive/MyDrive/Belgeler", exist_ok=True)

# Write the summary as UTF-8, replacing any earlier file at this path.
with open(out_path, mode="w", encoding="utf-8") as summary_file:
    summary_file.write(summary)

print("Saved:", out_path)


Saved: /content/drive/MyDrive/Belgeler/summary.txt


In [52]:
# Writes the summary once more to summary.txt in the Belgeler folder,
# overwriting whatever is already stored at that path.
out_path = "/content/drive/MyDrive/Belgeler/summary.txt"
with open(out_path, mode="w", encoding="utf-8") as summary_file:
    summary_file.write(summary)
print("Saved:", out_path)


Saved: /content/drive/MyDrive/Belgeler/summary.txt
