**1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.**

In [None]:
import configparser

def display_core_components(config_path='path/to/hadoop/conf/hadoop-core-site.xml'):
    """Print the ``fs.defaultFS`` value found in a Hadoop XML configuration file.

    Hadoop site files are XML documents made of ``<property>`` elements that
    hold ``<name>`` and ``<value>`` children. They are parsed here with
    ElementTree because ``configparser`` only understands INI syntax.

    Args:
        config_path: Location of the configuration file. The default is the
            placeholder path this example has always used.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        KeyError: If the file defines no ``fs.defaultFS`` property.
    """
    # Local import keeps the function self-contained within its notebook cell.
    import xml.etree.ElementTree as ET

    wanted = 'fs.defaultFS'
    root = ET.parse(config_path).getroot()

    core_components = None
    for prop in root.iter('property'):
        if (prop.findtext('name') or '').strip() == wanted:
            core_components = (prop.findtext('value') or '').strip()
            break

    if core_components is None:
        raise KeyError(f"{wanted} is not defined in {config_path}")

    print("Core Components of Hadoop:")
    print(core_components)


# Run the demo. The configuration path used by display_core_components() is a
# placeholder, so this call only succeeds once it points at a real file.
display_core_components()


**2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.**

In [None]:
from hdfs import InsecureClient

def calculate_total_file_size(hdfs_url, directory_path, client=None):
    """Return the combined size in bytes of the files directly inside a directory.

    Sub-directories are skipped and are not descended into.

    Args:
        hdfs_url: WebHDFS base URL used to build a client when ``client`` is
            not supplied.
        directory_path: HDFS directory whose immediate children are summed.
        client: Optional ready-made client exposing ``list(path, status=True)``
            (for example one configured for a secured cluster). When omitted
            an ``InsecureClient`` is created from ``hdfs_url``.

    Returns:
        The sum of the ``length`` field of every entry whose type is ``FILE``;
        ``0`` for an empty directory.
    """
    if client is None:
        client = InsecureClient(hdfs_url)

    # With status=True each entry is a (name, status_dict) pair, and the
    # status dict reports the kind of entry under 'type'.
    entries = client.list(directory_path, status=True)

    return sum(
        status['length']
        for _name, status in entries
        if status['type'] == 'FILE'
    )


# Example usage: sum the file sizes under a sample directory and print the total.
# NOTE(review): 50070 is the NameNode web port on Hadoop 2.x; Hadoop 3.x
# defaults to 9870 — confirm which one your cluster exposes.
hdfs_url = 'http://localhost:50070'
directory_path = '/user/hadoop/data'
total_size = calculate_total_file_size(hdfs_url, directory_path)
print(f"Total File Size: {total_size} bytes")


**3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.**

In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import heapq


class TopNWords(MRJob):
    """Two-step mrjob job that emits the N most frequent words of its input.

    Step 1 counts the occurrences of each lower-cased, whitespace-separated
    token. Step 2 receives every ``(count, word)`` pair under a single key and
    keeps the N largest.
    """

    def configure_args(self):
        """Register the ``--N`` option (default 10) on top of mrjob's own options."""
        super().configure_args()
        self.add_passthru_arg(
            "--N", type=int, default=10, help="Number of top words to retrieve"
        )

    def steps(self):
        """Describe the pipeline: word count, then top-N selection."""
        return [
            MRStep(mapper=self.mapper, combiner=self.combiner, reducer=self.reducer),
            MRStep(reducer=self.top_n_reducer)
        ]

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for every whitespace-separated token of ``line``."""
        for word in line.split():
            yield word.lower(), 1

    def combiner(self, word, counts):
        """Pre-sum the counts of ``word`` before they reach the reducer."""
        yield word, sum(counts)

    def reducer(self, word, counts):
        """Emit the total for ``word`` under the key ``None``.

        Using one shared key routes every pair to the same call of
        ``top_n_reducer``. The count comes first so pairs order by frequency.
        """
        yield None, (sum(counts), word)

    def top_n_reducer(self, _, word_count_pairs):
        """Yield ``(word, count)`` for the N pairs with the highest counts.

        Pairs with equal counts are ordered by the word itself, because whole
        pairs are compared.
        """
        N = self.options.N
        top_n = heapq.nlargest(N, word_count_pairs)
        for count, word in top_n:
            yield word, count


# Script entry point: let the job class parse the command line and run itself.
if __name__ == "__main__":
    TopNWords.run()

**4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.**

In [None]:
import requests

def check_hadoop_cluster_health(namenode_url):
    """Print the NameNode state and the DataNodes known to that NameNode.

    Both pieces of information come from the NameNode's ``/jmx`` servlet. The
    ``NameNodeStatus`` bean supplies the state, and the ``NameNodeInfo`` bean
    supplies the live and dead DataNode lists.

    Args:
        namenode_url: Base URL of the NameNode web interface, without a
            trailing slash.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
    """
    # Local import keeps the function self-contained within its notebook cell.
    import json

    timeout_seconds = 10

    # Check the health status of the NameNode
    namenode_health_url = f"{namenode_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"
    response = requests.get(namenode_health_url, timeout=timeout_seconds)
    response.raise_for_status()
    namenode_health = response.json()['beans'][0]['State']

    print("NameNode Health Status:", namenode_health)

    # The DataNodeInfo bean is only published by each DataNode's own JMX
    # endpoint, so the NameNode's view of its DataNodes is read from the
    # NameNodeInfo bean instead.
    datanode_health_url = f"{namenode_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"
    response = requests.get(datanode_health_url, timeout=timeout_seconds)
    response.raise_for_status()
    namenode_info = response.json()['beans'][0]

    # LiveNodes and DeadNodes are JSON documents embedded as strings, each
    # keyed by DataNode host.
    live_nodes = json.loads(namenode_info.get('LiveNodes') or '{}')
    dead_nodes = json.loads(namenode_info.get('DeadNodes') or '{}')

    print("DataNode Health Status:")
    for hostname, details in live_nodes.items():
        # lastContact is reported in seconds since the last heartbeat.
        health = details['lastContact']
        print(f"DataNode: {hostname}, Last Contact: {health} s")
    for hostname in dead_nodes:
        print(f"DataNode: {hostname}, DEAD")


# Example usage: report the health of a NameNode running on this machine.
# NOTE(review): 50070 is the NameNode web port on Hadoop 2.x; Hadoop 3.x
# defaults to 9870 — confirm which one your cluster exposes.
namenode_url = 'http://localhost:50070'
check_hadoop_cluster_health(namenode_url)


**5. Develop a Python program that lists all the files and directories in a specific HDFS path.**

In [None]:
from hdfs import InsecureClient

def list_hdfs_path(hdfs_url, path, client=None):
    """Print every immediate child of an HDFS path, tagged as file or directory.

    Args:
        hdfs_url: WebHDFS base URL used to build a client when ``client`` is
            not supplied.
        path: HDFS directory to list.
        client: Optional ready-made client exposing ``list(path, status=True)``.
            When omitted an ``InsecureClient`` is created from ``hdfs_url``.
    """
    if client is None:
        client = InsecureClient(hdfs_url)

    # With status=True each entry is a (name, status_dict) pair, and the
    # status dict reports the kind of entry under 'type'.
    for file_name, status in client.list(path, status=True):
        is_directory = status['type'] == 'DIRECTORY'
        print(f"{'[DIR]' if is_directory else '[FILE]'} {file_name}")


# Example usage: list the contents of a sample HDFS directory.
# NOTE(review): 50070 is the NameNode web port on Hadoop 2.x; Hadoop 3.x
# defaults to 9870 — confirm which one your cluster exposes.
hdfs_url = 'http://localhost:50070'
path = '/user/hadoop/data'
list_hdfs_path(hdfs_url, path)

**6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.**

In [None]:
import requests

def analyze_storage_utilization(namenode_url):
    """Compute per-DataNode storage utilization and find the extremes.

    Args:
        namenode_url: Base URL of the NameNode web interface, without a
            trailing slash.

    Returns:
        A 5-tuple ``(storage_utilization, highest_node, highest_value,
        lowest_node, lowest_value)``. ``storage_utilization`` maps each live
        DataNode to the percentage of its capacity that is not remaining. The
        "highest" and "lowest" entries are selected by that percentage, not by
        raw capacity. When the NameNode reports no live DataNodes the dict is
        empty and the other four items are ``None``.

    Raises:
        requests.RequestException: If the request fails, times out, or returns
            an HTTP error status.
    """
    # Local import keeps the function self-contained within its notebook cell.
    import json

    # Get the DataNodes information from the NameNode's JMX servlet
    datanodes_url = f"{namenode_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"
    response = requests.get(datanodes_url, timeout=10)
    response.raise_for_status()

    # LiveNodes is a JSON document embedded as a string, keyed by DataNode host.
    datanodes = json.loads(response.json()['beans'][0]['LiveNodes'] or '{}')

    # Calculate storage utilization for each DataNode
    storage_utilization = {}
    for hostname, stats in datanodes.items():
        capacity = stats['capacity']
        remaining = stats['remaining']
        # A node reporting zero capacity would otherwise divide by zero.
        utilization = (capacity - remaining) / capacity * 100 if capacity else 0.0
        storage_utilization[hostname] = utilization

    if not storage_utilization:
        return storage_utilization, None, None, None, None

    # Identify the DataNode with the highest utilization
    highest_capacity_node = max(storage_utilization, key=storage_utilization.get)
    highest_capacity = storage_utilization[highest_capacity_node]

    # Identify the DataNode with the lowest utilization
    lowest_capacity_node = min(storage_utilization, key=storage_utilization.get)
    lowest_capacity = storage_utilization[lowest_capacity_node]

    return storage_utilization, highest_capacity_node, highest_capacity, lowest_capacity_node, lowest_capacity


# Example usage: analyse a NameNode on this machine and print the findings.
# NOTE(review): 50070 is the NameNode web port on Hadoop 2.x; Hadoop 3.x
# defaults to 9870 — confirm which one your cluster exposes.
namenode_url = 'http://localhost:50070'
utilization, highest_node, highest_capacity, lowest_node, lowest_capacity = analyze_storage_utilization(namenode_url)

print("Storage Utilization:")
# The loop variable has its own name so the `utilization` dict is not
# overwritten by the last percentage once the loop finishes.
for node, node_utilization in utilization.items():
    print(f"{node}: {node_utilization}%")

print("\nDataNode with the highest storage capacity:")
print(f"Node: {highest_node}, Utilization: {highest_capacity}%")

print("\nDataNode with the lowest storage capacity:")
print(f"Node: {lowest_node}, Utilization: {lowest_capacity}%")

**7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.**

In [None]:
import requests
import time

def submit_hadoop_job(resource_manager_url, jar_path, class_name, input_path, output_path):
    """Submit an application through the YARN ResourceManager REST API and wait for it.

    The function asks the ResourceManager for a new application id, posts a
    submission whose ApplicationMaster command is ``hadoop jar ...``, and polls
    the application report every five seconds until ``finalStatus`` is
    terminal. It then prints where to find the results.

    Args:
        resource_manager_url: Base URL of the ResourceManager web interface,
            without a trailing slash.
        jar_path: Path of the job jar placed on the command line.
        class_name: Main class placed on the command line.
        input_path: Input path placed on the command line.
        output_path: Output path placed on the command line.

    Raises:
        requests.RequestException: If any request fails, times out, or returns
            an HTTP error status (for example a rejected submission).
    """
    timeout_seconds = 30
    apps_url = f"{resource_manager_url}/ws/v1/cluster/apps"

    # Reserve an application id
    response = requests.post(f"{apps_url}/new-application", timeout=timeout_seconds)
    response.raise_for_status()
    application_id = response.json()['application-id']

    # Submit the Hadoop job. Submissions are posted to the applications
    # collection itself; the id travels inside the payload.
    payload = {
        'application-id': application_id,
        'application-name': 'Hadoop Job',
        'am-container-spec': {
            'commands': {
                'command': f"hadoop jar {jar_path} {class_name} {input_path} {output_path}"
            }
        },
        'application-type': 'MAPREDUCE'
    }
    headers = {'Content-Type': 'application/json'}
    response = requests.post(apps_url, json=payload, headers=headers, timeout=timeout_seconds)
    # Without this check a rejected submission would only surface later, as a
    # confusing failure inside the polling loop.
    response.raise_for_status()

    print("Hadoop job submitted.")
    print("Application ID:", application_id)

    # Monitor job progress
    status_url = f"{apps_url}/{application_id}"
    while True:
        response = requests.get(status_url, timeout=timeout_seconds)
        response.raise_for_status()
        app = response.json()['app']
        status = app['finalStatus']
        if status in ['SUCCEEDED', 'FAILED', 'KILLED']:
            break
        time.sleep(5)

    # Report the outcome. The ResourceManager API does not return job output
    # itself, so point at the output path and the ApplicationMaster log URL
    # from the application report.
    if status == 'SUCCEEDED':
        print("Hadoop job completed successfully. Output:")
        print(f"Job output path: {output_path}")
        print(f"ApplicationMaster logs: {app.get('amContainerLogs')}")
    else:
        print("Hadoop job failed or was killed.")
        if app.get('diagnostics'):
            print("Diagnostics:", app['diagnostics'])

# Example usage: submit a job to a ResourceManager on this machine.
# The jar, class and HDFS paths below are placeholders to replace with real values.
# NOTE(review): 8088 is the usual ResourceManager web port — confirm for your cluster.
resource_manager_url = 'http://localhost:8088'
jar_path = '/path/to/your/hadoop-job.jar'
class_name = 'com.example.HadoopJob'
input_path = '/input/path'
output_path = '/output/path'

submit_hadoop_job(resource_manager_url, jar_path, class_name, input_path, output_path)

**8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.**

In [None]:
import requests
import time

def submit_hadoop_job(resource_manager_url, jar_path, class_name, input_path, output_path, num_containers, container_memory, container_vcores):
    """Submit a YARN application with resource requirements and track its usage.

    The function asks the ResourceManager for a new application id, posts a
    submission carrying the requested memory and vCores, and polls the
    application report every five seconds. While the application is running it
    prints the currently allocated memory and vCores; once ``finalStatus`` is
    terminal it prints where to find the results.

    Args:
        resource_manager_url: Base URL of the ResourceManager web interface,
            without a trailing slash.
        jar_path: Path of the job jar placed on the command line.
        class_name: Main class placed on the command line.
        input_path: Input path placed on the command line.
        output_path: Output path placed on the command line.
        num_containers: Sent as ``resource.instances`` in the payload.
        container_memory: Requested memory in MB.
        container_vcores: Requested number of virtual cores.

    Raises:
        requests.RequestException: If any request fails, times out, or returns
            an HTTP error status (for example a rejected submission).
    """
    timeout_seconds = 30
    apps_url = f"{resource_manager_url}/ws/v1/cluster/apps"

    # Reserve an application id
    response = requests.post(f"{apps_url}/new-application", timeout=timeout_seconds)
    response.raise_for_status()
    application_id = response.json()['application-id']

    # Submit the Hadoop job. Submissions are posted to the applications
    # collection itself; the id travels inside the payload.
    # NOTE(review): the documented submission schema describes `resource` as
    # {memory, vCores} at the top level only; the `instances` field and the
    # `resource` entry inside `am-container-spec` are kept from the original
    # payload — confirm the ResourceManager accepts or ignores them.
    payload = {
        'application-id': application_id,
        'application-name': 'Hadoop Job',
        'am-container-spec': {
            'commands': {
                'command': f"hadoop jar {jar_path} {class_name} {input_path} {output_path}"
            },
            'resource': {
                'memory': container_memory,
                'vCores': container_vcores
            }
        },
        'application-type': 'MAPREDUCE',
        'resource': {
            'memory': container_memory,
            'vCores': container_vcores,
            'instances': num_containers
        }
    }
    headers = {'Content-Type': 'application/json'}
    response = requests.post(apps_url, json=payload, headers=headers, timeout=timeout_seconds)
    # Without this check a rejected submission would only surface later, as a
    # confusing failure inside the polling loop.
    response.raise_for_status()

    print("Hadoop job submitted.")
    print("Application ID:", application_id)

    # Monitor job progress and resource usage
    status_url = f"{apps_url}/{application_id}"
    while True:
        response = requests.get(status_url, timeout=timeout_seconds)
        response.raise_for_status()
        app = response.json()['app']
        status = app['finalStatus']

        if status in ['SUCCEEDED', 'FAILED', 'KILLED']:
            break

        # `amContainerLogs` is a plain URL string; the live allocation is
        # reported by the allocatedMB / allocatedVCores fields of the report.
        print(f"Resource Usage: {app.get('allocatedMB')}MB Memory, {app.get('allocatedVCores')} vCores")
        time.sleep(5)

    # Report the outcome. The ResourceManager API does not return job output
    # itself, so point at the output path and the ApplicationMaster log URL
    # from the application report.
    if status == 'SUCCEEDED':
        print("Hadoop job completed successfully. Output:")
        print(f"Job output path: {output_path}")
        print(f"ApplicationMaster logs: {app.get('amContainerLogs')}")
    else:
        print("Hadoop job failed or was killed.")
        if app.get('diagnostics'):
            print("Diagnostics:", app['diagnostics'])

# Example usage: submit a job with explicit resource requirements to a
# ResourceManager on this machine.
# The jar, class and HDFS paths below are placeholders to replace with real values.
# NOTE(review): 8088 is the usual ResourceManager web port — confirm for your cluster.
resource_manager_url = 'http://localhost:8088'
jar_path = '/path/to/your/hadoop-job.jar'
class_name = 'com.example.HadoopJob'
input_path = '/input/path'
output_path = '/output/path'
num_containers = 2
container_memory = 1024  # in MB
container_vcores = 1

submit_hadoop_job(resource_manager_url, jar_path, class_name, input_path, output_path, num_containers, container_memory, container_vcores)

**9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.**

In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import time

class MapReducePerformance(MRJob):
    """Pass-through job for timing runs under different input split sizes.

    Run the job several times with different ``--split-size`` values and
    compare the wall-clock time of each run. The split size is handed to
    Hadoop as step-level job configuration. The job keeps mrjob's default
    input protocol, because ``input_protocol()`` must return a protocol object
    and cannot carry Hadoop arguments.
    """

    def configure_args(self):
        """Register ``--split-size`` (megabytes, default 64) on top of mrjob's options."""
        super().configure_args()
        self.add_passthru_arg("--split-size", type=int, default=64,
                              help="Input split size in megabytes")

    def mapper(self, _, line):
        """Forward every input line under a single shared key."""
        yield None, line

    def reducer(self, key, lines):
        """Echo each line back after a one-second pause."""
        time.sleep(1)  # Simulate processing time
        for line in lines:
            yield key, line

    def steps(self):
        """Return the single map/reduce step, configured with the requested split size."""
        split_bytes = str(self.options.split_size * 1024 * 1024)
        return [
            MRStep(
                mapper=self.mapper,
                reducer=self.reducer,
                # NOTE(review): these properties are only meaningful when the
                # job runs on Hadoop, and which of them is honoured depends on
                # the input format in use — confirm on the target cluster.
                jobconf={
                    'mapreduce.input.fileinputformat.split.minsize': split_bytes,
                    'mapreduce.input.fileinputformat.split.maxsize': split_bytes,
                },
            )
        ]

# Script entry point: let the job class parse the command line and run itself.
if __name__ == '__main__':
    MapReducePerformance.run()