**1.** Write a Python program to read a Hadoop configuration file and display the core
components of Hadoop.

In [None]:
from configparser import ConfigParser

def read_hadoop_config(config_file):
    """Return the section names of an INI-style configuration file.

    The file is parsed with ``ConfigParser`` and every section header found
    in it is reported as one "component".

    Args:
        config_file: Path of the configuration file to parse.

    Returns:
        list: Section names in the order ``ConfigParser`` reports them. The
        list is empty when the file cannot be opened, because
        ``ConfigParser.read`` skips unreadable files instead of raising.
    """
    parser = ConfigParser()
    parser.read(config_file)
    return parser.sections()

# Example usage: print every section name found in a local 'hadoop.conf'.
config_file = 'hadoop.conf'
components = read_hadoop_config(config_file)
print("Core Components of Hadoop:")
for section_name in components:
    print(section_name)

**2.** Implement a Python function that calculates the total file size in a Hadoop
Distributed File System (HDFS) directory.

In [None]:
import subprocess

def calculate_directory_size(directory):
    """Return the size reported by ``hdfs dfs -du -s`` for an HDFS path.

    The command is handed to ``subprocess`` as an argument list, not as a
    shell string, so a path containing spaces or shell metacharacters
    reaches ``hdfs`` as one literal argument and is never interpreted by a
    shell.

    Args:
        directory: HDFS path to measure.

    Returns:
        int: The first whitespace-separated field of the command output.

    Raises:
        subprocess.CalledProcessError: If ``hdfs`` exits with a non-zero
            status.
        FileNotFoundError: If no ``hdfs`` executable is found on ``PATH``.
        IndexError: If the command prints nothing.
        ValueError: If the first output field is not an integer.
    """
    command = ["hdfs", "dfs", "-du", "-s", directory]
    output = subprocess.check_output(command).decode('utf-8').strip()
    return int(output.split()[0])

# Example usage: measure one HDFS directory and report its size.
directory = '/user/hadoop/data'
total_size = calculate_directory_size(directory)
size_report = "Total file size in directory '{}': {} bytes".format(directory, total_size)
print(size_report)

**3.** Create a Python program that extracts and displays the top N most frequent words
from a large text file using the MapReduce approach.

In [None]:
from mrjob.job import MRJob
from heapq import nlargest

class TopNWords(MRJob):
    """Two-step MapReduce job that emits the N most frequent words.

    Step 1 counts every whitespace-separated word. Step 2 routes all
    ``(count, word)`` pairs to a single key so that one reducer sees every
    candidate and can pick the overall top N. Selecting the top N inside the
    counting reducer alone yields one independent list per reduce task
    whenever the job runs with more than one reducer.

    N defaults to 10 and can be changed with ``--top-n``.
    """

    def configure_args(self):
        """Register the ``--top-n`` command-line option."""
        super(TopNWords, self).configure_args()
        self.add_passthru_arg('--top-n', type=int, default=10,
                              help='Number of most frequent words to output')

    def steps(self):
        """Return the counting step followed by the global top-N step."""
        # Imported locally so the class needs no extra module-level import.
        from mrjob.step import MRStep
        return [
            MRStep(mapper=self.mapper,
                   combiner=self.combiner,
                   reducer=self.reducer),
            MRStep(reducer=self.reducer_top_n),
        ]

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for each whitespace-separated token of a line."""
        for word in line.strip().split():
            yield word, 1

    def combiner(self, word, counts):
        """Pre-sum the counts of a word before they are shuffled."""
        yield word, sum(counts)

    def reducer(self, word, counts):
        """Emit each word's total under one shared key for the next step."""
        # Count first so that pairs compare by frequency, then by word.
        yield None, (sum(counts), word)

    def reducer_top_n(self, _, count_word_pairs):
        """Emit ``(word, count)`` for the N largest totals, highest first."""
        for count, word in nlargest(self.options.top_n, count_word_pairs):
            yield word, count

# Example usage
input_file = 'large_text_file.txt'
mr_job = TopNWords(args=[input_file])

# MRJob.run() is the command-line entry point: it builds a fresh job from
# sys.argv and returns nothing. To run this instance and read its results,
# drive it through a runner and decode the runner's output instead.
with mr_job.make_runner() as runner:
    runner.run()
    top_words = list(mr_job.parse_output(runner.cat_output()))

print("Top 10 most frequent words:")
for word, count in top_words:
    print(word, count)

**4.** Write a Python script that checks the health status of the NameNode and DataNodes
in a Hadoop cluster using Hadoop's REST API.

In [None]:
import requests

def check_health_status(namenode_address='<namenode_hostname>:50070',
                        datanode_address='<datanode_hostname>:50075',
                        timeout=10):
    """Print the NameNode state and a DataNode failed-volume value from JMX.

    Both values are read from the daemons' ``/jmx`` HTTP endpoints.

    Args:
        namenode_address: ``host:port`` of the NameNode web endpoint. The
            default is a placeholder that must be replaced.
        datanode_address: ``host:port`` of the DataNode web endpoint. The
            default is a placeholder that must be replaced.
        timeout: Seconds to wait for each HTTP request before giving up, so
            an unreachable daemon cannot block the check forever.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
        KeyError, IndexError: If a response lacks the expected fields.
    """
    nn_url = 'http://{}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus'.format(namenode_address)
    dn_url = 'http://{}/jmx?qry=Hadoop:service=DataNode,name=FSDatasetState'.format(datanode_address)

    nn_http = requests.get(nn_url, timeout=timeout)
    nn_http.raise_for_status()  # surface HTTP errors instead of parsing an error page
    nn_response = nn_http.json()

    dn_http = requests.get(dn_url, timeout=timeout)
    dn_http.raise_for_status()
    dn_response = dn_http.json()

    nn_status = nn_response['beans'][0]['State']
    # NOTE(review): assumes the first DataNode bean has a 'VolumeInfo' list
    # whose entries carry 'FailedVolumes' -- confirm against the JMX output
    # of the Hadoop version in use.
    dn_status = dn_response['beans'][0]['VolumeInfo'][0]['FailedVolumes']

    print("NameNode status:", nn_status)
    print("DataNode status:", dn_status)

# Example usage: query both JMX endpoints and print the two status values.
check_health_status()

**5.** Develop a Python program that lists all the files and directories in a specific HDFS
path.

In [None]:
import subprocess

def list_hdfs_path(path):
    """Print the recursive ``hdfs dfs -ls -R`` listing of an HDFS path.

    The command is handed to ``subprocess`` as an argument list, not as a
    shell string, so a path containing spaces or shell metacharacters
    reaches ``hdfs`` as one literal argument and is never interpreted by a
    shell.

    Args:
        path: HDFS path to list.

    Raises:
        subprocess.CalledProcessError: If ``hdfs`` exits with a non-zero
            status.
        FileNotFoundError: If no ``hdfs`` executable is found on ``PATH``.
    """
    command = ["hdfs", "dfs", "-ls", "-R", path]
    output = subprocess.check_output(command).decode('utf-8').strip()
    # splitlines() yields no entries for empty output, so an empty listing
    # prints nothing rather than a single blank line.
    for file_info in output.splitlines():
        print(file_info)

# Example usage: print a heading, then the recursive listing of one path.
path = '/user/hadoop/data'
listing_header = "Contents of HDFS path '{}':".format(path)
print(listing_header)
list_hdfs_path(path)

**6.** Implement a Python program that analyzes the storage utilization of DataNodes in a
Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.

In [None]:
import requests

def _print_volume_group(heading, volumes):
    """Print a heading followed by the storage figures of each entry."""
    print(heading)
    for volume in volumes:
        print("Datanode:", volume['datanodeInfo'])
        print("Storage Capacity:", volume['capacity'])
        print("Used Space:", volume['usedSpace'])
        print("Free Space:", volume['freeSpace'])
        print()


def analyze_storage_utilization(datanode_address='<datanode_hostname>:50075',
                                timeout=10):
    """Print the five most-used and five least-used storage entries.

    The entries come from the ``VolumeInfo`` field of the first bean that
    the DataNode ``/jmx`` endpoint returns for ``FSDatasetState``. They are
    ordered by ``usedSpace``, largest first.

    Args:
        datanode_address: ``host:port`` of the DataNode web endpoint. The
            default is a placeholder that must be replaced.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
        KeyError, IndexError: If the response lacks the expected fields.
    """
    dn_url = 'http://{}/jmx?qry=Hadoop:service=DataNode,name=FSDatasetState'.format(datanode_address)

    dn_http = requests.get(dn_url, timeout=timeout)
    dn_http.raise_for_status()  # surface HTTP errors instead of parsing an error page
    dn_response = dn_http.json()

    # NOTE(review): assumes each 'VolumeInfo' entry is a dict with
    # 'datanodeInfo', 'capacity', 'usedSpace' and 'freeSpace' -- confirm
    # against the JMX output of the Hadoop version in use.
    volumes = dn_response['beans'][0]['VolumeInfo']

    # NOTE(review): the headings below say "capacities" but the ordering is
    # by used space; confirm which of the two is intended.
    sorted_volumes = sorted(volumes, key=lambda x: x['usedSpace'], reverse=True)

    # With fewer than ten entries the two groups overlap.
    _print_volume_group("DataNodes with highest storage capacities:", sorted_volumes[:5])
    _print_volume_group("DataNodes with lowest storage capacities:", sorted_volumes[-5:])

# Example usage: fetch the DataNode JMX data and print both groups.
analyze_storage_utilization()

**7.** Create a Python script that interacts with YARN's ResourceManager API to submit
a Hadoop job, monitor its progress, and retrieve the final output.

In [None]:
import requests
import time

def submit_and_monitor_job(resourcemanager_address='<resourcemanager_hostname>:8088',
                           poll_interval=10, timeout=10):
    """Reserve a YARN application ID, poll its state, and print the outcome.

    Polling stops at any terminal state. ``FINISHED`` leads to printing the
    application's ``finalStatus``; ``FAILED`` and ``KILLED`` print a short
    message and return. Without the ``KILLED`` check a killed application
    would be polled forever.

    Args:
        resourcemanager_address: ``host:port`` of the ResourceManager web
            endpoint. The default is a placeholder that must be replaced.
        poll_interval: Seconds to sleep between state checks.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status (for example when the application ID is not
            yet known to the ResourceManager).
        KeyError: If a response lacks the expected fields.
    """
    apps_url = 'http://{}/ws/v1/cluster/apps'.format(resourcemanager_address)

    submit_response = requests.post(apps_url + '/new-application', timeout=timeout)
    submit_response.raise_for_status()
    app_id = submit_response.json()['application-id']
    print("Job submitted. Application ID:", app_id)

    # Submit your job using the obtained application ID
    # ...

    # Monitor job progress
    status_url = '{}/{}'.format(apps_url, app_id)
    while True:
        status_response = requests.get(status_url, timeout=timeout)
        status_response.raise_for_status()
        app = status_response.json()['app']
        status = app['state']
        print("Job status:", status)

        if status == 'FINISHED':
            break
        if status in ('FAILED', 'KILLED'):
            # Prints "Job failed." or "Job killed."
            print("Job {}.".format(status.lower()))
            return

        time.sleep(poll_interval)  # Wait before checking the status again

    # The application report itself carries the final status, so no extra
    # request is needed once the state is FINISHED.
    final_output = app['finalStatus']
    print("Final output:", final_output)

# Example usage: request an application ID and poll it until it finishes.
submit_and_monitor_job()

**8.** Create a Python script that interacts with YARN's ResourceManager API to submit
a Hadoop job, set resource requirements, and track resource usage during job
execution.

In [None]:
import requests
import time

def submit_and_track_resources(resourcemanager_address='<resourcemanager_hostname>:8088',
                               poll_interval=10, timeout=10):
    """Reserve a YARN application ID, request resources, and poll its usage.

    On every poll the application report is fetched once. Its allocation
    figures and state are printed. Polling stops at any terminal state
    (``FINISHED``, ``FAILED`` or ``KILLED``); without the ``KILLED`` check a
    killed application would be polled forever.

    Args:
        resourcemanager_address: ``host:port`` of the ResourceManager web
            endpoint. The default is a placeholder that must be replaced.
        poll_interval: Seconds to sleep between polls.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status from the ID request or the status polls.
        KeyError: If a response lacks the expected fields.
    """
    apps_url = 'http://{}/ws/v1/cluster/apps'.format(resourcemanager_address)

    submit_response = requests.post(apps_url + '/new-application', timeout=timeout)
    submit_response.raise_for_status()
    app_id = submit_response.json()['application-id']
    print("Job submitted. Application ID:", app_id)

    app_url = '{}/{}'.format(apps_url, app_id)

    # Set your resource requirements for the job
    resource_request = {
        "application-id": app_id,
        "resource": {
            "memory": 2048,
            "vCores": 2
        }
    }
    # NOTE(review): '/resource-requests' does not appear to be part of the
    # documented ResourceManager REST API, which takes the 'resource' block
    # in the application submission body -- confirm. The call is kept
    # best-effort: its response is deliberately not checked.
    requests.post(app_url + '/resource-requests', json=resource_request, timeout=timeout)

    # Submit your job using the obtained application ID
    # ...

    # Track resource usage during job execution
    while True:
        status_response = requests.get(app_url, timeout=timeout)
        status_response.raise_for_status()
        app = status_response.json()['app']

        # The application report carries the current allocation; .get() is
        # used because these fields may be absent from some reports.
        print("Allocated memory (MB):", app.get('allocatedMB'))
        print("Allocated vCores:", app.get('allocatedVCores'))
        print("Running containers:", app.get('runningContainers'))

        status = app['state']
        print("Job status:", status)

        if status == 'FINISHED':
            break
        if status in ('FAILED', 'KILLED'):
            # Prints "Job failed." or "Job killed."
            print("Job {}.".format(status.lower()))
            return

        time.sleep(poll_interval)  # Wait before checking the status and resource usage again

# Example usage: request an application ID, post the resource request, and poll.
submit_and_track_resources()

**9.** Write a Python program that compares the performance of a MapReduce job with
different input split sizes, showcasing the impact on overall job execution time.

In [None]:
from mrjob.job import MRJob
import time

class SplitSizeComparison(MRJob):
    """Skeleton job whose Hadoop input split size is set by ``--split-size``.

    The option value (in megabytes) is converted to bytes and passed to
    Hadoop through the job configuration. The mapper and reducer are
    placeholders to be filled in with the workload being measured.
    """

    def configure_args(self):
        """Register the ``--split-size`` command-line option."""
        super(SplitSizeComparison, self).configure_args()
        self.add_passthru_arg('--split-size', type=int, default=64, help='Input split size in megabytes')

    def jobconf(self):
        """Return the job configuration with the requested split size added.

        NOTE(review): these properties only influence runs on a Hadoop
        cluster, and how strictly they are honoured depends on the input
        format in use -- confirm on the target cluster.
        """
        # Copy so that the dictionary returned by the base class is never
        # mutated in place.
        conf = dict(super(SplitSizeComparison, self).jobconf())
        split_bytes = str(self.options.split_size * 1024 * 1024)
        conf['mapreduce.input.fileinputformat.split.minsize'] = split_bytes
        conf['mapreduce.input.fileinputformat.split.maxsize'] = split_bytes
        return conf

    def mapper(self, _, line):
        # Your mapper implementation
        pass

    def reducer(self, word, counts):
        # Your reducer implementation
        pass

# Example usage
input_file = 'large_text_file.txt'
split_sizes = [64, 128, 256, 512]

print("Comparing MapReduce job performance with different input split sizes:")

for split_size in split_sizes:
    start_time = time.time()

    # Command-line style arguments must be strings; passing the integer
    # itself makes argument parsing fail before the job starts.
    mr_job = SplitSizeComparison(args=[input_file, '--split-size', str(split_size)])
    with mr_job.make_runner() as runner:
        runner.run()

    # The measured time covers runner set-up as well as the job itself.
    elapsed_time = time.time() - start_time
    print("Input Split Size: {} MB, Elapsed Time: {:.2f} seconds".format(split_size, elapsed_time))