In [None]:
import os
# Load every .txt file of the source folder into memory.
folder_path = '/Users/ivana/Desktop/DSBA/2nd semester/NLP/project/einstein_files/einsteincleaned'

# The preprocessing step of this notebook writes its result into this same
# folder under this name; skip it so that a re-run does not ingest its own
# output as if it were a source document.
generated_corpus_name = 'einstein_cleaned_final.txt'

all_texts = []

# os.listdir() returns entries in arbitrary order; sort them so the corpus is
# assembled in the same order on every run and on every machine.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt') or filename == generated_corpus_name:
        continue
    file_path = os.path.join(folder_path, filename)
    try:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        except UnicodeDecodeError:
            # latin1 maps every possible byte to a character, so this fallback
            # cannot itself fail with UnicodeDecodeError.
            with open(file_path, 'r', encoding='latin1') as f:
                text = f.read()
            print(f"⚠ File {filename} letto con encoding latin1")
    except OSError:
        # Entry exists but cannot be read (permissions, a directory whose name
        # ends in .txt, ...): report it and keep loading the other files.
        print(f"❌ ATTENZIONE: Impossibile leggere {filename}, saltato")
        continue
    all_texts.append(text)
    print(f"✅ Caricato: {filename}")

# Single-string view of the whole corpus, one document per line break.
full_corpus = '\n'.join(all_texts)

print(f"\n✅ Totale file caricati: {len(all_texts)}")
print(f"✅ Numero totale di caratteri: {len(full_corpus)}")


In [None]:
import re
import nltk
import unicodedata
from langdetect import detect
from tqdm import tqdm

# nltk.sent_tokenize needs the Punkt sentence models. Recent NLTK releases
# load them from the 'punkt_tab' package instead of the older pickled 'punkt'
# package, so request both to work with either generation of the library.
nltk.download('punkt')
nltk.download('punkt_tab')

# Typographic punctuation has no NFKD decomposition into ASCII, so the ASCII
# fold in clean_text() would silently delete it ("don’t" -> "dont",
# "word—word" -> "wordword"). Map it to the ASCII equivalent beforehand.
_ASCII_PUNCTUATION = str.maketrans({
    '\u2018': "'", '\u2019': "'", '\u201a': "'", '\u201b': "'",
    '\u201c': '"', '\u201d': '"', '\u201e': '"', '\u201f': '"',
    '\u2010': '-', '\u2011': '-', '\u2012': '-', '\u2013': '-',
    '\u2014': '-', '\u2015': '-', '\u2212': '-',
})


def clean_text(text):
    """Normalize a raw document into a single line of plain ASCII text.

    Steps, in order:
      1. curly quotes and Unicode dashes are mapped to ', " and -;
      2. the text is NFKD-normalized and every remaining non-ASCII
         character (e.g. combining accents) is dropped;
      3. bracketed spans such as "[12]" or "[note]" are removed;
      4. runs of two or more dashes become a space;
      5. all whitespace runs collapse to one space and repeated double
         quotes collapse to one.

    Args:
        text: the raw document as a str.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    text = text.translate(_ASCII_PUNCTUATION)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'\n\s*\n', '\n', text)
    # Numeric references first, then any other single-line bracketed span
    # ('.' does not match a newline, so spans never cross a line break).
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    # Only ASCII hyphens can be present at this point.
    text = re.sub(r'-{2,}', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'"+', '"', text)
    return text.strip()

def is_clean_sentence(s):
    """Tell whether a sentence looks like usable natural-language text.

    A sentence is accepted when all of the following hold:
      * its length is between 30 and 1000 characters inclusive;
      * at least 40% of its characters are alphabetic;
      * it contains at least one vowel (a, e, i, o, u; any case).

    Returns:
        True when the sentence passes every check, False otherwise.
    """
    length = len(s)
    # The lower bound also guarantees length > 0 for the ratio below.
    if not 30 <= length <= 1000:
        return False

    letter_count = sum(1 for ch in s if ch.isalpha())
    if letter_count / length < 0.4:
        return False

    return re.search(r'[aeiou]', s, re.IGNORECASE) is not None

def _pack_paragraphs(sentences, max_paragraph_len):
    """Greedily join consecutive sentences into space-separated paragraphs.

    A sentence is appended to the current paragraph while the result stays
    within max_paragraph_len characters; otherwise a new paragraph is started.
    A sentence longer than the limit becomes a paragraph of its own.
    """
    paragraphs = []
    current = ""
    for s in sentences:
        if not current:
            # Nothing accumulated yet: start with this sentence. Never flush
            # the empty accumulator, or an empty paragraph would be emitted
            # when the very first sentence exceeds the limit.
            current = s
        elif len(current) + len(s) + 1 <= max_paragraph_len:
            current += " " + s
        else:
            paragraphs.append(current)
            current = s
    if current:
        paragraphs.append(current)
    return paragraphs


def preprocess_corpus(all_texts, lang='en', to_paragraphs=True, max_paragraph_len=800):
    """Clean documents and return their unique, filtered sentences.

    Each document is cleaned with clean_text(), split into sentences with
    nltk.sent_tokenize, and every sentence is kept only if it passes
    is_clean_sentence() and langdetect identifies it as `lang`. Duplicates
    are removed across the whole corpus and the survivors are sorted.

    Args:
        all_texts: iterable of raw document strings.
        lang: language code that detect() must return for a sentence to be kept.
        to_paragraphs: when True, pack the sorted sentences into paragraphs;
            when False, return the sorted sentences themselves.
        max_paragraph_len: maximum paragraph length in characters.

    Returns:
        A list of paragraphs (to_paragraphs=True) or of sentences.
    """
    unique_sentences = set()
    for doc in tqdm(all_texts, desc='Preprocessing documents'):
        for s in nltk.sent_tokenize(clean_text(doc)):
            s = s.strip()
            # Run the cheap structural filter first so the much more
            # expensive language detection only sees plausible sentences.
            if not is_clean_sentence(s):
                continue
            try:
                if detect(s) == lang:
                    unique_sentences.add(s)
            except Exception:
                # detect() failed on this sentence: skip it (best effort).
                # Unlike a bare `except:`, this lets KeyboardInterrupt and
                # SystemExit through, so a long run can still be aborted.
                continue

    # NOTE(review): sorting makes the output deterministic but orders the
    # sentences alphabetically, so paragraphs do not follow document order.
    # Confirm this is intended for the downstream use of the corpus.
    ordered = sorted(unique_sentences)

    if not to_paragraphs:
        return ordered
    return _pack_paragraphs(ordered, max_paragraph_len)

import random

# Build the cleaned corpus as paragraphs of at most 800 characters.
preprocessed_sentences = preprocess_corpus(
    all_texts,
    to_paragraphs=True,
    max_paragraph_len=800
)

print(f"✅ Frasi/Paragrafi puliti: {len(preprocessed_sentences)}")
print(f"Esempi:\n{preprocessed_sentences[:5]}")

# Persist the result, one paragraph per line.
output_path = '/Users/ivana/Desktop/DSBA/2nd semester/NLP/project/einstein_files/einsteincleaned/einstein_cleaned_final.txt'
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in preprocessed_sentences)

print(f"✅ Corpus salvato in {output_path}")

print("Esempi casuali:")
# random.sample raises ValueError when asked for more items than exist, so
# cap the sample size for corpora that yield fewer than 5 paragraphs.
print(random.sample(preprocessed_sentences, min(5, len(preprocessed_sentences))))


