```
1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.
2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.
3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.
4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.
5. Develop a Python program that lists all the files and directories in a specific HDFS path.
6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.
7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.
8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.
9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.
```

In [None]:
def read_hadoop_config(config_file):
    """Extract core Hadoop component settings from a ``key=value`` text file.

    Every line is stripped and split at its first ``=``. If the key starts
    with one of the known property prefixes, the value is stored under the
    matching component label. When several lines match the same component,
    the last one wins. Lines without an ``=`` are ignored.

    Args:
        config_file: Path of the configuration file to read.

    Returns:
        dict mapping a component label (``'NameNode'``, ``'DataNode'``,
        ``'MapReduce'``) to the stripped value found for it. Components
        with no matching line are absent from the result.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Property-name prefix -> component label, checked in this order.
    component_prefixes = (
        ('dfs.namenode', 'NameNode'),
        ('dfs.datanode', 'DataNode'),
        ('mapreduce.framework.name', 'MapReduce'),
        # Add more core components as needed
    )

    hadoop_config = {}
    with open(config_file, 'r') as file:
        for line in file:
            # partition() splits at the FIRST '=' only, so values that contain
            # '=' themselves (e.g. JVM options) are kept whole.
            key, separator, value = line.strip().partition('=')
            if not separator:
                # No '=' on this line: nothing to record (previously IndexError).
                continue
            for prefix, component in component_prefixes:
                if key.startswith(prefix):
                    hadoop_config[component] = value.strip()
                    break
    return hadoop_config

# Example usage: parse a sample configuration file and print each component
# that was found, one "label: value" pair per line.
config_file = 'hadoop_config.txt'
hadoop_components = read_hadoop_config(config_file)
print("Hadoop Core Components:")
for component, address in hadoop_components.items():
    print(f"{component}: {address}")

In [None]:
import subprocess

def calculate_directory_size(directory):
    """Return the human-readable total size of an HDFS directory.

    Runs ``hadoop fs -du -s -h <directory>`` and returns the leading size
    field of its output. With ``-h`` the number and its unit are printed as
    two whitespace-separated tokens (for example ``1.2 G``), so the unit token
    is kept when present. Plain byte counts have no unit and are returned
    as-is.

    Args:
        directory: HDFS path to summarize.

    Returns:
        str such as ``'1.2 G'`` or ``'512'``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
        FileNotFoundError: If the ``hadoop`` executable is not on ``PATH``.
        IndexError: If the command produced no output.
    """
    # An argument list (no shell) keeps spaces or shell metacharacters in the
    # path from being interpreted by a shell.
    cmd = ["hadoop", "fs", "-du", "-s", "-h", str(directory)]
    output = subprocess.check_output(cmd).decode('utf-8')

    tokens = output.split()
    size = tokens[0]
    # Binary-prefix letters that may follow the number in -h output.
    if len(tokens) > 1 and tokens[1] in ("K", "M", "G", "T", "P", "E"):
        size = size + " " + tokens[1]
    return size

# Example usage: report the summarized size of one HDFS directory.
hdfs_directory = '/user/data'
total_size = calculate_directory_size(hdfs_directory)
print(f"Total file size in directory {hdfs_directory} is {total_size}")


In [None]:
from collections import Counter
import multiprocessing
import itertools

def mapper(chunk):
    """Map step: count the whitespace-separated words of one text chunk.

    Args:
        chunk: A string of text.

    Returns:
        collections.Counter mapping each word to its occurrences in ``chunk``.
    """
    tally = Counter()
    tally.update(chunk.split())
    return tally

def reducer(counters):
    """Reduce step: merge per-chunk word counts into one Counter.

    Counts are accumulated in place. ``sum(counters, Counter())`` built a
    brand-new Counter for every addition, which re-copies the growing total
    each time. The in-place ``+=`` has the same semantics as ``+`` (entries
    whose count is not positive are dropped) without the repeated copying.

    Args:
        counters: Iterable of ``collections.Counter`` objects. The inputs are
            not modified.

    Returns:
        A new ``collections.Counter`` holding the combined counts; empty if
        ``counters`` is empty.
    """
    total = Counter()
    for partial in counters:
        total += partial
    return total

def get_top_words(file_path, num_words, lines_per_chunk=10000):
    """Return the most frequent words of a text file, counted in parallel.

    The file is cut into chunks of ``lines_per_chunk`` lines. Each chunk is
    counted by ``mapper`` in a worker process and the partial counts are
    merged by ``reducer``. Chunks are processed with the order-preserving
    ``Pool.map``, so the result does not depend on worker scheduling.

    The file content is still held in memory as a list of chunks.

    Args:
        file_path: Path of the text file to analyze.
        num_words: How many of the most common words to return.
        lines_per_chunk: Number of lines handed to each map task. Must be
            at least 1.

    Returns:
        list of ``(word, count)`` tuples, most frequent first.

    Raises:
        ValueError: If ``lines_per_chunk`` is smaller than 1.
        OSError: If the file cannot be opened.
    """
    if lines_per_chunk < 1:
        raise ValueError("lines_per_chunk must be at least 1")

    # Group whole lines into chunks. Line breaks are whitespace, so no word is
    # ever split across two chunks. The original passed one word per task,
    # which made task dispatch dominate the run time.
    chunks = []
    batch = []
    with open(file_path, 'r') as file:
        for line in file:
            batch.append(line)
            if len(batch) >= lines_per_chunk:
                chunks.append("".join(batch))
                batch = []
    if batch:
        chunks.append("".join(batch))

    # The context manager shuts the worker processes down once the map step is
    # done; previously the pool was never closed.
    with multiprocessing.Pool() as pool:
        chunk_counts = pool.map(mapper, chunks)

    word_counts = reducer(chunk_counts)
    return word_counts.most_common(num_words)

# Example usage
# The guard matters because get_top_words starts worker processes. On
# platforms that start workers by re-importing the main module, unguarded
# top-level code would run again inside every worker.
if __name__ == "__main__":
    text_file = 'large_text_file.txt'
    top_words_count = 10
    top_words = get_top_words(text_file, top_words_count)
    print("Top", top_words_count, "words:")
    for word, count in top_words:
        print(word + ": " + str(count))


In [None]:
import requests

def check_namenode_status(timeout=10):
    """Return the NameNode state reported by its JMX endpoint.

    Queries the ``NameNodeStatus`` bean on ``namenode:50070`` and returns the
    ``State`` attribute of the first bean in the JSON response.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            health check against an unresponsive node would block forever.

    Returns:
        The value of the bean's ``State`` field.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        KeyError, IndexError: If the response lacks the expected bean data.
    """
    response = requests.get(
        'http://namenode:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus',
        timeout=timeout,
    )
    # Surface HTTP errors directly instead of failing later while parsing an
    # error page as JSON.
    response.raise_for_status()
    data = response.json()
    return data['beans'][0]['State']

def check_datanode_status(timeout=10):
    """Return the DataNode state reported by its JMX endpoint.

    Queries the ``DataNodeInfo`` bean on ``datanode:50075`` and returns the
    ``State`` attribute of the first bean in the JSON response.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            health check against an unresponsive node would block forever.

    Returns:
        The value of the bean's ``State`` field.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        KeyError, IndexError: If the response lacks the expected bean data.
    """
    response = requests.get(
        'http://datanode:50075/jmx?qry=Hadoop:service=DataNode,name=DataNodeInfo',
        timeout=timeout,
    )
    # Surface HTTP errors directly instead of failing later while parsing an
    # error page as JSON.
    response.raise_for_status()
    data = response.json()
    # NOTE(review): confirm that the DataNodeInfo bean exposes a 'State'
    # attribute on the target Hadoop version; if it does not, this lookup
    # raises KeyError and a different attribute must be chosen.
    return data['beans'][0]['State']

# Example usage: query both daemons first, then report their states.
namenode_status = check_namenode_status()
datanode_status = check_datanode_status()
print(f"NameNode status: {namenode_status}")
print(f"DataNode status: {datanode_status}")


In [None]:
import subprocess

def list_hdfs_path(hdfs_path):
    """List the entries found under an HDFS path.

    Runs ``hadoop fs -ls <hdfs_path>`` and extracts the path column from each
    entry line. The optional ``Found N items`` header is skipped by content,
    not by position, so listing a single file (which prints no header) still
    returns that file.

    Args:
        hdfs_path: HDFS path to list.

    Returns:
        list of str: Entry paths in the order printed by the command; empty
        if nothing was listed.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
        FileNotFoundError: If the ``hadoop`` executable is not on ``PATH``.
    """
    # An argument list (no shell) keeps spaces or shell metacharacters in the
    # path from being interpreted by a shell.
    cmd = ["hadoop", "fs", "-ls", str(hdfs_path)]
    output = subprocess.check_output(cmd).decode('utf-8')

    files_and_directories = []
    for line in output.splitlines():
        if not line.strip() or line.startswith("Found "):
            continue
        # Entry lines have 8 columns (permissions, replication, owner, group,
        # size, date, time, path). Splitting at most 7 times keeps a path that
        # contains spaces in one piece.
        fields = line.split(None, 7)
        files_and_directories.append(fields[-1])
    return files_and_directories

# Example usage: print a heading followed by one listed entry per line.
hdfs_path = '/user/data'
items = list_hdfs_path(hdfs_path)
print(f"Items in {hdfs_path}:")
for item in items:
    print(item)


In [None]:
import requests

def analyze_datanode_storage(timeout=10):
    """Find the most and least used storage entries reported by a DataNode.

    Fetches the ``FSDatasetState-UndefinedStorageId`` bean from
    ``datanode:50075``, reads its ``VolumeInfo`` and orders the entries by
    their ``usedSpace`` value, largest first.

    NOTE(review): this assumes ``VolumeInfo`` is a list of dicts that each
    carry a ``usedSpace`` key. Confirm the bean name and payload shape
    against the target Hadoop version.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        tuple ``(highest_storage, lowest_storage)``: the entries with the
        largest and the smallest ``usedSpace``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        KeyError, IndexError: If the response lacks the expected data or
            ``VolumeInfo`` is empty.
    """
    response = requests.get(
        'http://datanode:50075/jmx?qry=Hadoop:service=DataNode,name=FSDatasetState-UndefinedStorageId',
        timeout=timeout,
    )
    # Surface HTTP errors directly instead of failing later while parsing an
    # error page as JSON.
    response.raise_for_status()
    data = response.json()
    volumes = data['beans'][0]['VolumeInfo']
    sorted_volumes = sorted(volumes, key=lambda x: x['usedSpace'], reverse=True)
    highest_storage = sorted_volumes[0]
    lowest_storage = sorted_volumes[-1]
    return highest_storage, lowest_storage

# Example usage: print the same three fields for both extremes, with a blank
# line between the two reports.
highest_storage, lowest_storage = analyze_datanode_storage()
storage_reports = (
    ("DataNode with the highest storage capacity:", highest_storage),
    ("DataNode with the lowest storage capacity:", lowest_storage),
)
for report_index, (heading, storage) in enumerate(storage_reports):
    if report_index > 0:
        print()
    print(heading)
    print("Storage ID:", storage['storageID'])
    print("Used Space:", storage['usedSpace'])
    print("Remaining Space:", storage['remainingSpace'])


In [None]:
import requests
import time

def submit_hadoop_job(jar_file, input_path, output_path, timeout=10):
    """Submit a MapReduce job through the YARN ResourceManager REST API.

    Two requests are made against ``resourcemanager:8088``. The first asks
    for a new application id. The second submits an application whose
    ApplicationMaster command is ``hadoop jar <jar_file> <input_path>
    <output_path>``.

    Args:
        jar_file: Jar to run.
        input_path: Input path passed to the job.
        output_path: Output path passed to the job.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The application id assigned by the ResourceManager.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status from either request.
        KeyError: If the first response carries no ``application-id``.
    """
    submit_url = 'http://resourcemanager:8088/ws/v1/cluster/apps/new-application'
    response = requests.post(submit_url, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    application_id = data['application-id']

    submit_job_url = 'http://resourcemanager:8088/ws/v1/cluster/apps'
    headers = {'Content-Type': 'application/json'}
    payload = {
        'application-id': application_id,
        'application-name': 'My Hadoop Job',
        'am-container-spec': {
            'commands': {
                'command': 'hadoop jar {} {} {}'.format(jar_file, input_path, output_path)
            }
        },
        'application-type': 'MAPREDUCE'
    }
    response = requests.post(
        submit_job_url, headers=headers, json=payload, timeout=timeout
    )
    # Previously the submission response was ignored, so a rejected job still
    # returned an application id as if it had been accepted.
    response.raise_for_status()
    return application_id

def monitor_hadoop_job(application_id, poll_interval=5, timeout=10):
    """Poll the ResourceManager until the application reaches a terminal state.

    Args:
        application_id: Id of the application to watch.
        poll_interval: Seconds to sleep between two status requests.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        True if the application finished successfully. False if it ended in
        state ``FAILED`` or ``KILLED``, or finished with a ``finalStatus``
        other than ``SUCCEEDED``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        KeyError: If the response lacks the expected ``app``/``state`` fields.
    """
    status_url = 'http://resourcemanager:8088/ws/v1/cluster/apps/{}'.format(application_id)
    while True:
        response = requests.get(status_url, timeout=timeout)
        response.raise_for_status()
        app = response.json()['app']
        state = app['state']
        if state == 'FINISHED':
            # FINISHED only says the application stopped running; the outcome
            # is reported separately in finalStatus. A missing finalStatus is
            # treated as success to keep the previous behavior.
            return app.get('finalStatus', 'SUCCEEDED') == 'SUCCEEDED'
        if state in ('FAILED', 'KILLED'):
            return False
        time.sleep(poll_interval)

def retrieve_hadoop_job_output(application_id, timeout=10):
    """Return the logs link of the application's first recorded attempt.

    Fetches the application's attempts from ``resourcemanager:8088`` and
    returns the ``logsLink`` field of the first entry in the response.

    Args:
        application_id: Id of the application to look up.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``logsLink`` value of the first listed attempt.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        KeyError, IndexError: If the response contains no attempt data.
    """
    output_url = 'http://resourcemanager:8088/ws/v1/cluster/apps/{}/appattempts'.format(application_id)
    response = requests.get(output_url, timeout=timeout)
    # Surface HTTP errors directly instead of failing later while parsing an
    # error page as JSON.
    response.raise_for_status()
    data = response.json()
    final_output = data['appAttempts']['appAttempt'][0]['logsLink']
    return final_output

# Example usage
jar_file = 'my_job.jar'
input_path = '/user/input
