In [21]:
import os
import re
from collections import Counter

In [22]:
# Folder where the sample review files are written and later read back.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
directory_path = r"C:\Users\arpit\Desktop\pytth\movies"
# exist_ok=True keeps this cell safe to re-run once the folder already exists.
os.makedirs(directory_path, exist_ok=True)


In [23]:
# Sample corpus: ten one-sentence movie reviews, each saved to its own text file below.
mov = [
    "Inception is a mind-bending thriller with stunning visual effects and a complex narrative structure.",
    "The Godfather is a classic film that portrays the rise and fall of a mafia family in America.",
    "The Shawshank Redemption is an inspiring story of hope and friendship set in a prison.",
    "The Dark Knight features an iconic performance by Heath Ledger as the Joker in a gripping superhero film.",
    "Pulp Fiction is a nonlinear crime drama with sharp dialogue and memorable characters.",
    "The Lord of the Rings: The Return of the King is an epic conclusion to a beloved fantasy trilogy.",
    "Forrest Gump tells the story of a simple man with an extraordinary life, capturing the heart of America.",
    "The Matrix revolutionized the sci-fi genre with its groundbreaking special effects and philosophical themes.",
    "Fight Club is a provocative film that explores themes of identity and consumerism in modern society.",
    "Titanic is a tragic love story set against the backdrop of a historic maritime disaster."
]
# Save every review as review_<n>.txt (numbered from 1) inside directory_path.
for i, review_text in enumerate(mov, start=1):
    target_path = os.path.join(directory_path, f"review_{i}.txt")
    with open(target_path, mode='w', encoding='utf-8') as handle:
        handle.write(review_text)

Prompt 1 (given): Develop a Python program that reads all the text files under a directory and returns the top-5 words with the most occurrences.

In [35]:
def read_files(directory_path):
    """Return the lower-cased text of every ``.txt`` file under ``directory_path``.

    The tree is walked recursively with ``os.walk``. Each file's contents are
    followed by a single space so that the last word of one file never fuses
    with the first word of the next. Files are decoded as UTF-8.

    Returns an empty string when no ``.txt`` file is found.
    """
    pieces = []
    for folder, _, names in os.walk(directory_path):
        for name in names:
            # Only plain-text files take part in the word count.
            if not name.endswith(".txt"):
                continue
            with open(os.path.join(folder, name), 'r', encoding='utf-8') as handle:
                pieces.append(handle.read().lower() + " ")
    return "".join(pieces)

def count_words(text):
    """Tally how often each word occurs in ``text``.

    A word is a maximal run of regex word characters (letters, digits,
    underscore), so hyphenated terms such as "mind-bending" count as two
    words. No case folding happens here; callers pass lower-cased text.

    Note: a later cell in this notebook defines another ``count_words`` that
    takes file paths instead; once that cell runs, it replaces this one.

    Returns a ``collections.Counter`` mapping word -> occurrences.
    """
    tally = Counter()
    for match in re.finditer(r'\b\w+\b', text):
        tally[match.group()] += 1
    return tally

In [37]:
def topfive_words(directory_path, top_n=5):
    """Return the ``top_n`` most frequent words in the text files under ``directory_path``.

    The text is gathered with ``read_files`` and tallied with ``count_words``.

    Returns a list of ``(word, count)`` tuples, most frequent first
    (five entries by default).
    """
    return count_words(read_files(directory_path)).most_common(top_n)


# Folder holding the review files generated earlier in the notebook.
directory_path = r"C:\Users\arpit\Desktop\pytth\movies"

# Ask for five entries explicitly (this is also the function's default).
top_words = topfive_words(directory_path, top_n=5)

print("Top-5 words with the most occurrences:")
for word, count in top_words:
    print(f"{word}: {count}")

Top-5 words with the most occurrences:
the: 14
a: 12
of: 8
is: 7
and: 6


Prompt 2: Can you parallelize it?

Approach: change `read_file` so that it reads a single file, collect the paths of all text files with `get_files`,
and pass those paths to `count_words`, which processes the files in parallel using a thread pool.



In [38]:
from concurrent.futures import ThreadPoolExecutor


In [39]:
def read_file(file_path):
    """Read one UTF-8 text file and return its contents in lower case."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.lower()
    
def get_files(directory_path):
    """List the paths of all ``.txt`` files found under ``directory_path``.

    Sub-directories are searched recursively via ``os.walk``. The returned
    list is empty when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(directory_path)
        for name in names
        if name.endswith(".txt")
    ]


In [41]:
def count_words(file_paths):
    """Count word occurrences across several text files, loading them concurrently.

    Each path is handled by a worker thread that reads the file with
    ``read_file`` and tallies its words; the per-file tallies are then merged
    in the order the paths were given. Counting per file avoids building one
    ever-growing string for the whole corpus, while producing the same totals
    (and the same first-seen key order) as tokenising the space-joined text.

    Note: this definition replaces the earlier text-based ``count_words`` in
    this notebook; it expects file paths, not a string of text.

    Args:
        file_paths: iterable of paths to UTF-8 text files.

    Returns:
        A ``collections.Counter`` mapping word -> total occurrences; empty
        when ``file_paths`` is empty.

    Raises:
        Whatever ``read_file`` raises (e.g. ``OSError``, ``UnicodeDecodeError``);
        the error surfaces when that file's result is merged.
    """
    def _count_one(file_path):
        # Runs in a worker thread: load a single file and tally its words.
        return Counter(re.findall(r'\b\w+\b', read_file(file_path)))

    totals = Counter()
    with ThreadPoolExecutor() as executor:
        # map() yields results in input order, so the merge is deterministic
        # no matter which worker finishes first.
        for per_file in executor.map(_count_one, file_paths):
            totals.update(per_file)
    return totals

In [44]:
def topfive_words(directory_path, top_n=5):
    """Return the ``top_n`` most frequent words in the text files under ``directory_path``.

    File paths are collected with ``get_files`` and handed to the
    path-based ``count_words``, which reads the files using a thread pool.

    Returns a list of ``(word, count)`` tuples, most frequent first
    (five entries by default).
    """
    txt_paths = get_files(directory_path)
    return count_words(txt_paths).most_common(top_n)

# Re-run the ranking with the thread-pool implementation; five entries
# requested explicitly (this is also the function's default).
top_words = topfive_words(directory_path, top_n=5)

print("Top-5 words with the most occurrences:")
for word, count in top_words:
    print(f"{word}: {count}")

Top-5 words with the most occurrences:
the: 14
a: 12
of: 8
is: 7
and: 6


From the list of top-5 words, count how many times each character occurs.

In [49]:
def count_characters(words):
    """Count how many times each character appears across ``words``.

    Every word contributes its characters once, regardless of how often the
    word itself occurred in the corpus.

    Returns a ``collections.Counter`` mapping character -> occurrences, with
    keys in first-seen order.
    """
    return Counter(char for word in words for char in word)

# Keep only the words. They are bound to a new name so that `top_words` still
# holds its (word, count) pairs: overwriting it made this cell fail on a
# second run, because unpacking a bare string into two names raises ValueError.
top_word_list = [word for word, _ in top_words]

character_counts = count_characters(top_word_list)
# The leading "\n" separates this report from any output printed above it.
print("\nCharacter counts in the top-5 words:")
for char, count in character_counts.items():
    print(f"{char}: {count}")


Character counts in the top-5 words:
t: 1
h: 1
e: 1
a: 2
o: 1
f: 1
i: 1
s: 1
n: 1
d: 1
