In [2]:
import re
import numpy as np
from collections import Counter

def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def count_sentences(text):
    sentences = re.split(r'[.!?]', text)
    return len([s for s in sentences if s.strip()])

def count_paragraphs(text):
    paragraphs = text.split('\n\n')
    return len([p for p in paragraphs if p.strip()])

def count_words_with_numpy(text):
    return np.char.count(text, ' ') + 1

def word_frequencies(words):
    dictionary = {}
    for word in words:
        word = word.strip(string.punctuation).lower()
        if word:
            dictionary[word] = dictionary.get(word, 0) + 1
    return dictionary

def text_analysis(file_path):
    stats = {
        'amount': 0,
        'length': 0,
        'word_count': {},
        'initial_count': {},
        'sentence_count': 0,
        'paragraph_count': 0
    }

    text = read_file(file_path)

    # Count sentences and paragraphs
    stats['sentence_count'] = count_sentences(text)
    stats['paragraph_count'] = count_paragraphs(text)

    # Count words using numpy
    stats['amount'] = int(count_words_with_numpy(text))

    # Process each line for word statistics
    words = re.findall(r'\b\w+\b', text)
    stats['length'] = sum(len(word) for word in words)

    # Count word frequencies
    stats['word_count'] = word_frequencies(words)

    # Count initial letters
    for word in words:
        initial = word[0].lower()
        stats['initial_count'][initial] = stats['initial_count'].get(initial, 0) + 1

    # Calculate average word length
    stats['average_length'] = stats['length'] / stats['amount'] if stats['amount'] else 0

    # Identify the most common words
    common_words = Counter(stats['word_count']).most_common(10)

    # Print the analysis
    print(f"Number of words: {stats['amount']}")
    print(f"Number of sentences: {stats['sentence_count']}")
    print(f"Number of paragraphs: {stats['paragraph_count']}")
    print(f"Average word length: {stats['average_length']:.2f}")
    print("Most common words and their frequencies:")
    for word, freq in common_words:
        print(f"{word}: {freq}")

# Example usage
file_path = '/content/sample_data/README.md'
text_analysis(file_path)



Number of words: 113
Number of sentences: 22
Number of paragraphs: 6
Average word length: 6.19
Most common words and their frequencies:
anscombe: 5
is: 4
https: 4
a: 3
data: 3
the: 3
com: 3
mnist: 3
vega_datasets: 3
sample: 2
