In [6]:
import pandas as pd
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

df_docs = pd.read_csv('../../data/documents.csv')
df_docs = df_docs.dropna(subset=['Text'])
df_docs['Text'] = df_docs['Text'].astype(str)
total_docs = len(df_docs)

# --- ΑΝΑΛΥΣΗ A: TOP-50 ΜΕΜΟΝΩΜΕΝΕΣ ΛΕΞΕΙΣ ---
print("="*60)
print("ΑΝΑΛΥΣΗ A: TOP-50 ΜΕΜΟΝΩΜΕΝΕΣ ΛΕΞΕΙΣ (SINGLE WORDS)")
print("="*60)

standard_stopwords = set([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 
    'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 
    'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 
    'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'
])

word_counts = Counter()
for text in df_docs['Text']:
    text_clean = text.lower().strip()
    words = re.findall(r'\b[a-z]{3,}\b', text_clean) 
    filtered_words = [w for w in words if w not in standard_stopwords]
    word_counts.update(filtered_words)

for word, count in word_counts.most_common(50):
    percentage = (count / total_docs) * 100
    print(f"{word}: {count} ({percentage:.1f}%)")


# --- ΑΝΑΛΥΣΗ B: ΑΥΤΟΜΑΤΟΣ ΕΝΤΟΠΙΣΜΟΣ BOILERPLATE PHRASES ---


print("\n" + "="*60)
print("ΕΝΤΟΠΙΣΜΟΣ BOILERPLATE ")
print("="*60)

#
vectorizer = CountVectorizer(
    ngram_range=(3, 6),  
    stop_words=None,      
    min_df=10,            
    lowercase=True
)

X = vectorizer.fit_transform(df_docs['Text'])

doc_freqs = (X > 0).sum(axis=0).A1 

boilerplate_df = pd.DataFrame({
    'phrase': vectorizer.get_feature_names_out(),
    'doc_count': doc_freqs
})

boilerplate_df['length'] = boilerplate_df['phrase'].apply(lambda x: len(x.split()))
boilerplate_df = boilerplate_df.sort_values(by=['length', 'doc_count'], ascending=[False, False])


final_boilerplate = []
seen_phrases = set()

for _, row in boilerplate_df.iterrows():
    candidate = row['phrase']
    count = row['doc_count']
    
    is_subset = False
    for kept_phrase in final_boilerplate:
        if candidate in kept_phrase:
            is_subset = True
            break
    
    if not is_subset:
        final_boilerplate.append(candidate)
        percentage = (count / len(df_docs)) * 100
        if len(final_boilerplate) <= 50: 
            print(f" Κρατήθηκε: '{candidate}' (σε {count} έγγραφα - {percentage:.1f}%)")

print("\n" + "="*60)
print(f"Συνολικά εντοπίστηκαν {len(final_boilerplate)} μοναδικές boilerplate φράσεις.")

boilerplate_phrases = final_boilerplate[:50] 

ΑΝΑΛΥΣΗ A: TOP-50 ΜΕΜΟΝΩΜΕΝΕΣ ΛΕΞΕΙΣ (SINGLE WORDS)
project: 22337 (122.0%)
new: 17531 (95.7%)
research: 16790 (91.7%)
based: 11743 (64.1%)
high: 10238 (55.9%)
data: 10238 (55.9%)
development: 10091 (55.1%)
energy: 9592 (52.4%)
system: 9144 (49.9%)
market: 8885 (48.5%)
european: 8807 (48.1%)
technology: 8566 (46.8%)
use: 8345 (45.6%)
systems: 8211 (44.8%)
develop: 7322 (40.0%)
also: 7137 (39.0%)
novel: 7049 (38.5%)
time: 6745 (36.8%)
using: 6726 (36.7%)
innovation: 6437 (35.1%)
provide: 6356 (34.7%)
innovative: 6323 (34.5%)
well: 6303 (34.4%)
approach: 6241 (34.1%)
study: 6239 (34.1%)
europe: 6206 (33.9%)
cell: 5880 (32.1%)
potential: 5773 (31.5%)
design: 5635 (30.8%)
different: 5630 (30.7%)
process: 5425 (29.6%)
first: 5413 (29.6%)
one: 5305 (29.0%)
analysis: 5211 (28.5%)
knowledge: 5210 (28.4%)
understanding: 5164 (28.2%)
key: 5151 (28.1%)
impact: 5124 (28.0%)
two: 5063 (27.6%)
management: 5061 (27.6%)
cost: 5047 (27.6%)
production: 5043 (27.5%)
materials: 4958 (27.1%)
technologies: 