In [1]:
import re
from collections import Counter

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower()

def analyze_speech(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    cleaned_text = clean_text(text)

    words = re.findall(r'\b\w+\b', cleaned_text)
    sentences = re.split(r'[.!?]', text)

    total_word_count = len(words)

    total_character_count = sum(len(word) for word in words)

    avg_word_length = total_character_count / total_word_count if total_word_count > 0 else 0

    avg_sentence_length = total_word_count / len(sentences) if len(sentences) > 0 else 0

    word_distribution = Counter(words)

    unique_words = set(words)
    longest_words = sorted(unique_words, key=len, reverse=True)[:10]

    return {
        "total_word_count": total_word_count,
        "total_character_count": total_character_count,
        "avg_word_length": avg_word_length,
        "avg_sentence_length": avg_sentence_length,
        "word_distribution": word_distribution,
        "longest_words": longest_words,
    }

def display_results(results):
    """Display the analysis results."""
    print("Speech Analysis Results")
    print("========================")
    print(f"Total Word Count: {results['total_word_count']}")
    print(f"Total Character Count: {results['total_character_count']}")
    print(f"Average Word Length: {results['avg_word_length']:.2f}")
    print(f"Average Sentence Length: {results['avg_sentence_length']:.2f} words\n")
    
    print("Top 10 Longest Words:")
    for word in results['longest_words']:
        print(word)
    
    print("\nWord Frequency Distribution (Top 20):")
    for word, count in results['word_distribution'].most_common(20):
        print(f"{word}: {count}")

input_file = "state_of_the_union.txt" 

results = analyze_speech(input_file)

display_results(results)

Speech Analysis Results
Total Word Count: 2863
Total Character Count: 14165
Average Word Length: 4.95
Average Sentence Length: 34.49 words

Top 10 Longest Words:
representatives
disappointments
establishments
procrastinated
communications
michelimackina
appropriations
extinguishment
administration
inconveniences

Word Frequency Distribution (Top 20):
the: 262
of: 192
to: 112
and: 94
in: 67
a: 57
be: 40
for: 35
that: 31
our: 28
which: 26
it: 26
with: 23
is: 23
will: 21
have: 20
by: 20
i: 19
as: 19
on: 19
