In [3]:
# 1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.
#Sol
import configparser

def display_core_components(config_file_path):
    """Print the ``fs.defaultFS`` entries found in a Hadoop core-site file.

    The file is first parsed as Hadoop-style XML (``<configuration>`` holding
    ``<property><name>..</name><value>..</value></property>`` elements). If
    it is not well-formed XML, it is read as an INI file with a
    ``[core-site]`` section instead.

    The ``fs.defaultFS`` value is split on commas and each piece is printed
    on its own line under a ``Core Components:`` header. Nothing is printed
    when the property (XML) or the section (INI) is absent.

    Args:
        config_file_path: Path of the configuration file to read.

    Returns:
        None. All results are written to standard output.
    """
    # Imported locally so this cell keeps working on its own.
    import os
    import xml.etree.ElementTree as ET

    if not os.path.isfile(config_file_path):
        print(f"Configuration file not found: {config_file_path}")
        return

    raw_value = None
    try:
        root = ET.parse(config_file_path).getroot()
    except ET.ParseError:
        # Not XML: fall back to the INI layout with a [core-site] section.
        config = configparser.ConfigParser()
        try:
            config.read(config_file_path)
        except configparser.Error as exc:
            print(f"Could not parse configuration file: {exc}")
            return
        if 'core-site' in config:
            raw_value = config['core-site'].get('fs.defaultFS', '')
    else:
        for prop in root.iter('property'):
            if (prop.findtext('name') or '').strip() == 'fs.defaultFS':
                raw_value = prop.findtext('value') or ''
                break

    if raw_value is None:
        return

    print("Core Components:")
    for component in raw_value.split(','):
        print(component.strip())

# Example usage: the path below is a placeholder; point it at a real
# core-site file before running this cell.
config_file = '/path/to/hadoop/core-site.xml'
display_core_components(config_file)


In [None]:
# 2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.
#sol:
import pyarrow.hdfs

def calculate_directory_size(hdfs_host, hdfs_port, directory_path):
    """Return the combined size, in bytes, of every file under an HDFS directory.

    Args:
        hdfs_host: Host name passed to ``pyarrow.hdfs.connect``.
        hdfs_port: Port passed to ``pyarrow.hdfs.connect``.
        directory_path: HDFS directory whose tree is traversed.

    Returns:
        The sum of the ``size`` reported by ``stat`` for each file found.
    """
    # HDFS paths always use forward slashes, independent of the local OS.
    import posixpath

    hdfs_client = pyarrow.hdfs.connect(host=hdfs_host, port=hdfs_port)

    total_size = 0

    # walk() behaves like os.walk: it yields (directory, subdirectory names,
    # file names) tuples rather than per-file metadata dicts, so each file
    # must be stat'ed to learn its size.
    for current_dir, _subdirs, file_names in hdfs_client.walk(directory_path):
        for file_name in file_names:
            file_path = posixpath.join(current_dir, file_name)
            total_size += hdfs_client.stat(file_path)['size']

    return total_size

# Example usage: connection details and target directory are placeholders.
hdfs_host, hdfs_port = 'localhost', 9000
directory_path = '/path/to/hdfs/directory'

size = calculate_directory_size(
    hdfs_host=hdfs_host,
    hdfs_port=hdfs_port,
    directory_path=directory_path,
)
print(f"Total file size: {size} bytes")


In [None]:
#3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.
#Sol:
from mrjob.job import MRJob
from mrjob.step import MRStep
import heapq

class TopNWords(MRJob):
    """Two-step MapReduce job that emits the N most frequent words.

    Step 1 counts occurrences of each lower-cased, whitespace-separated word.
    Step 2 receives every ``(count, word)`` pair under a single key and keeps
    the N largest.
    """

    # Used when --topN is not given on the command line. Without a default the
    # option would be None and heapq.nlargest would raise a TypeError.
    DEFAULT_TOP_N = 10

    def configure_args(self):
        """Register the ``--topN`` command-line option."""
        super(TopNWords, self).configure_args()
        self.add_passthru_arg('--topN', type=int, default=self.DEFAULT_TOP_N,
                              help='Number of top N words to display')

    def steps(self):
        """Describe the two steps: word counting, then top-N selection."""
        return [
            MRStep(mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.reducer_find_topN_words)
        ]

    def mapper_get_words(self, _, line):
        """Emit ``(word, 1)`` for each whitespace-separated word of a line."""
        for word in line.strip().split():
            yield word.lower(), 1

    def combiner_count_words(self, word, counts):
        """Pre-aggregate the counts of one word before the reduce phase."""
        yield word, sum(counts)

    def reducer_count_words(self, word, counts):
        """Emit ``(None, (total, word))``.

        The shared ``None`` key routes every pair to the same reducer call in
        the next step, which needs to see all words to rank them.
        """
        yield None, (sum(counts), word)

    def reducer_find_topN_words(self, _, counts_words):
        """Emit the N largest ``(count, word)`` pairs, largest first."""
        N = self.options.topN
        if N is None:
            # Defensive: covers an option object built without the default.
            N = self.DEFAULT_TOP_N
        topN_words = heapq.nlargest(N, counts_words)
        for count, word in topN_words:
            yield count, word

# Run the job only when this code is executed as the main module,
# not when it is imported.
if __name__ == '__main__':
    TopNWords.run()


In [None]:
#4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.
#Sol:
import requests

# Base URL that the status checks below append their /jmx query to.
# <hadoop_host> and <hadoop_port> are placeholders; replace them before running.
hadoop_url = 'http://<hadoop_host>:<hadoop_port>'

def check_namenode_status():
    """Print the ``State`` reported by the NameNodeStatus JMX bean.

    Reads the module-level ``hadoop_url``. Connection problems, HTTP error
    statuses and non-JSON bodies are reported on standard output instead of
    being raised.

    Returns:
        None.
    """
    # Endpoint to check NameNode status
    namenode_url = f"{hadoop_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"
    try:
        # A timeout keeps an unreachable host from blocking the cell forever.
        response = requests.get(namenode_url, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"Failed to retrieve NameNode status: {exc}")
        return

    # Extract the health status of the NameNode
    beans = data.get('beans') or []
    if beans and 'State' in beans[0]:
        status = beans[0]['State']
        print(f"NameNode Health Status: {status}")
    else:
        print("Failed to retrieve NameNode status.")

def check_datanode_status():
    """Print the admin state of every DataNode known to the NameNode.

    The live/dead DataNode lists are attributes of the NameNode's
    ``NameNodeInfo`` JMX bean, so the module-level ``hadoop_url`` must point
    at the NameNode web endpoint. Request and decoding failures are reported
    on standard output instead of being raised.

    Returns:
        None.
    """
    # Imported locally so this cell keeps working on its own.
    import json

    # LiveNodes / DeadNodes are published by the NameNode, not by a DataNode.
    datanode_url = f"{hadoop_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"
    try:
        response = requests.get(datanode_url, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"Failed to retrieve DataNode status: {exc}")
        return

    def _decode(raw):
        # The bean stores these attributes as JSON text inside the JSON
        # document, so they need a second decoding pass.
        if isinstance(raw, str):
            try:
                return json.loads(raw)
            except ValueError:
                return {}
        return raw or {}

    beans = data.get('beans') or []
    if beans and 'LiveNodes' in beans[0]:
        live_nodes = _decode(beans[0]['LiveNodes'])
        dead_nodes = _decode(beans[0].get('DeadNodes'))
        print("DataNode Health Status:")
        for node, info in live_nodes.items():
            # Accept either spelling of the key.
            status = info.get('adminState', info.get('AdminState', 'Unknown'))
            print(f"{node}: {status}")
        for node in dead_nodes:
            print(f"{node}: Dead")
    else:
        print("Failed to retrieve DataNode status.")

# Example usage: report the NameNode first, then the DataNodes, with a blank
# line between the two reports.
check_namenode_status()
print()
check_datanode_status()


In [None]:
#5. Develop a Python program that lists all the files and directories in a specific HDFS path.
#Sol:
import pyarrow.hdfs

def list_hdfs_path(hdfs_host, hdfs_port, hdfs_path):
    """Print each entry directly under ``hdfs_path`` with a directory flag.

    Args:
        hdfs_host: Host name passed to ``pyarrow.hdfs.connect``.
        hdfs_port: Port passed to ``pyarrow.hdfs.connect``.
        hdfs_path: HDFS path whose entries are listed.
    """
    client = pyarrow.hdfs.connect(host=hdfs_host, port=hdfs_port)

    # detail=True is requested so each entry exposes 'name' and 'kind'.
    listing = client.ls(hdfs_path, detail=True)

    described = ((entry['name'], entry['kind'] == 'directory') for entry in listing)
    for name, is_directory in described:
        print(f"{name} (Directory: {is_directory})")

# Example usage: connection details and target path are placeholders.
hdfs_host, hdfs_port = 'localhost', 9000
hdfs_path = '/path/to/hdfs/directory'

list_hdfs_path(hdfs_host=hdfs_host, hdfs_port=hdfs_port, hdfs_path=hdfs_path)


In [None]:
#6: Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.
#sol:
import requests

# Base URL that analyze_storage_utilization() appends its /jmx query to.
# <hadoop_host> and <hadoop_port> are placeholders; replace them before running.
hadoop_url = 'http://<hadoop_host>:<hadoop_port>'

def analyze_storage_utilization():
    """Print per-DataNode utilisation and the largest/smallest nodes by capacity.

    Data comes from the ``LiveNodes`` attribute of the NameNode's
    ``NameNodeInfo`` JMX bean, so the module-level ``hadoop_url`` must point
    at the NameNode web endpoint. Request and decoding failures are reported
    on standard output instead of being raised.

    Returns:
        None.
    """
    # Imported locally so this cell keeps working on its own.
    import json

    # The cluster-wide DataNode list is published by the NameNode.
    datanodes_url = f"{hadoop_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"

    try:
        response = requests.get(datanodes_url, timeout=10)
        response.raise_for_status()
        data = response.json()
        beans = data.get('beans') or []
        live_nodes = beans[0].get('LiveNodes') if beans else None
        if isinstance(live_nodes, str):
            # The attribute is JSON text embedded in the JSON document.
            live_nodes = json.loads(live_nodes)
    except (requests.RequestException, ValueError) as exc:
        print(f"Failed to retrieve DataNode information: {exc}")
        return

    if live_nodes is None:
        print("Failed to retrieve DataNode information.")
        return

    capacities = {}
    storage_utilization = {}
    for node_name, info in live_nodes.items():
        capacity = info.get('capacity', 0)
        used = info.get('used', 0)
        capacities[node_name] = capacity
        # A node reporting zero capacity would otherwise divide by zero.
        storage_utilization[node_name] = (used / capacity) * 100 if capacity else 0.0

    if not capacities:
        print("No live DataNodes reported.")
        return

    # Rank by raw capacity, not by the utilisation percentage.
    max_capacity_node = max(capacities, key=capacities.get)
    min_capacity_node = min(capacities, key=capacities.get)

    print("Storage Utilization:")
    for node, utilization in storage_utilization.items():
        print(f"{node}: {utilization:.2f}%")

    print(f"\nNode with highest storage capacity: {max_capacity_node} "
          f"({capacities[max_capacity_node]} bytes)")
    print(f"Node with lowest storage capacity: {min_capacity_node} "
          f"({capacities[min_capacity_node]} bytes)")

# Example usage: prints the report for the cluster addressed by hadoop_url.
analyze_storage_utilization()


In [None]:
# 7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.
#Sol:
import requests
import time

# Base URL that the functions below append their /ws/v1/cluster/... paths to.
# <yarn_host> and <yarn_port> are placeholders; replace them before running.
yarn_url = 'http://<yarn_host>:<yarn_port>'

def submit_hadoop_job(jar_path, input_path, output_path):
    """Submit ``hadoop jar <jar> <input> <output>`` through the ResourceManager.

    First requests a new application id, then posts the submission context
    for that id. Reads the module-level ``yarn_url``.

    Args:
        jar_path: Path of the job jar.
        input_path: Input path given to the job.
        output_path: Output path given to the job.

    Returns:
        The application id when the ResourceManager answers 202 to the
        submission, otherwise ``None``.
    """
    # Imported locally so this cell keeps working on its own.
    import shlex

    # Endpoint that hands out a fresh application id
    new_app_url = f"{yarn_url}/ws/v1/cluster/apps/new-application"

    try:
        response = requests.post(new_app_url, timeout=30)

        if response.status_code == 200:
            application_id = response.json()['application-id']

            # Quote every path so spaces or shell metacharacters cannot
            # change the command that the container runs.
            command = "hadoop jar " + " ".join(
                shlex.quote(str(part)) for part in (jar_path, input_path, output_path))

            # NOTE(review): application-type "YARN" is the generic value;
            # confirm the type and AM command your cluster expects.
            payload = {
                "application-id": application_id,
                "application-name": "HadoopJob",
                "application-type": "YARN",
                "am-container-spec": {
                    "commands": {
                        "command": command
                    }
                }
            }

            # Submissions are POSTed to the apps collection; json= serialises
            # the dict and sets the JSON content type.
            submit_job_url = f"{yarn_url}/ws/v1/cluster/apps"
            response = requests.post(submit_job_url, json=payload, timeout=30)

            if response.status_code == 202:
                print("Hadoop job submitted successfully.")
                return application_id
    except (requests.RequestException, ValueError, KeyError) as exc:
        print(f"Error while submitting Hadoop job: {exc}")

    print("Failed to submit Hadoop job.")
    return None

def monitor_hadoop_job(application_id):
    """Poll the application's status every five seconds until it finishes.

    Args:
        application_id: Identifier of the application to watch.

    Returns:
        ``True`` when the final status is ``SUCCEEDED``, ``False`` when it is
        ``FAILED`` or ``KILLED``.
    """
    status_url = f"{yarn_url}/ws/v1/cluster/apps/{application_id}"
    finished_states = ('SUCCEEDED', 'FAILED', 'KILLED')

    while True:
        reply = requests.get(status_url)
        if reply.status_code != 200:
            # Unsuccessful poll: wait and ask again.
            time.sleep(5)
            continue

        data = reply.json()
        app_status = data['app']['finalStatus']
        if app_status in finished_states:
            break

        print(f"Hadoop job is still running. Progress: {data['app']['progress']}")
        time.sleep(5)

    succeeded = app_status == 'SUCCEEDED'
    if succeeded:
        print("Hadoop job completed successfully.")
    else:
        print("Hadoop job failed or was killed.")
    return succeeded

def retrieve_hadoop_job_output(application_id, output_path):
    """Fetch and print text found at the latest attempt's log link plus ``output_path``.

    Looks up the application's attempts, takes the ``logsLink`` of the last
    one, appends ``output_path`` and downloads that URL. Reads the
    module-level ``yarn_url``.

    Args:
        application_id: Identifier of the finished application.
        output_path: Path suffix appended to the attempt's ``logsLink``.

    Returns:
        The response body as text, or ``None`` when any step fails.
    """
    # NOTE(review): logsLink presumably addresses the attempt's container
    # logs, not files the job wrote to its output directory. Confirm against
    # your cluster, and consider reading output_path through an HDFS client.
    output_url = f"{yarn_url}/ws/v1/cluster/apps/{application_id}/appattempts"

    try:
        response = requests.get(output_url, timeout=30)

        if response.status_code == 200:
            # Tolerate a missing or empty attempt list instead of raising.
            attempts = (response.json().get('appAttempts') or {}).get('appAttempt') or []
            logs_link = attempts[-1].get('logsLink') if attempts else None

            if logs_link:
                output_location = logs_link + output_path

                # Retrieve the output file contents
                response = requests.get(output_location, timeout=30)
                if response.status_code == 200:
                    output = response.text
                    print(f"Hadoop job output:\n{output}")
                    return output
    except (requests.RequestException, ValueError) as exc:
        print(f"Error while retrieving Hadoop job output: {exc}")

    print("Failed to retrieve Hadoop job output.")
    return None

# Example usage: submit, wait for completion, then fetch the output. Each
# stage runs only if the previous one succeeded. All paths are placeholders.
jar_path, input_path, output_path = (
    '/path/to/hadoop/job.jar',
    '/input/path',
    '/output/path',
)

application_id = submit_hadoop_job(jar_path, input_path, output_path)
if application_id and monitor_hadoop_job(application_id):
    retrieve_hadoop_job_output(application_id, output_path)





In [None]:
# 8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.
#Sol:
import requests
import time

# Base URL that the functions below append their /ws/v1/cluster/... paths to.
# <yarn_host> and <yarn_port> are placeholders; replace them before running.
yarn_url = 'http://<yarn_host>:<yarn_port>'

def submit_hadoop_job(jar_path, input_path, output_path, vcores, memory):
    """Submit ``hadoop jar <jar> <input> <output>`` with explicit resources.

    First requests a new application id, then posts the submission context
    for that id, including the requested vcores and memory. Reads the
    module-level ``yarn_url``.

    Args:
        jar_path: Path of the job jar.
        input_path: Input path given to the job.
        output_path: Output path given to the job.
        vcores: Number of virtual cores to request.
        memory: Amount of memory to request (presumably MB — confirm against
            the cluster's scheduler configuration).

    Returns:
        The application id when the ResourceManager answers 202 to the
        submission, otherwise ``None``.
    """
    # Imported locally so this cell keeps working on its own.
    import shlex

    # Endpoint that hands out a fresh application id
    new_app_url = f"{yarn_url}/ws/v1/cluster/apps/new-application"

    try:
        response = requests.post(new_app_url, timeout=30)

        if response.status_code == 200:
            application_id = response.json()['application-id']

            # Quote every path so spaces or shell metacharacters cannot
            # change the command that the container runs.
            command = "hadoop jar " + " ".join(
                shlex.quote(str(part)) for part in (jar_path, input_path, output_path))

            # NOTE(review): application-type "YARN" is the generic value;
            # confirm the type and AM command your cluster expects.
            payload = {
                "application-id": application_id,
                "application-name": "HadoopJob",
                "application-type": "YARN",
                "am-container-spec": {
                    "commands": {
                        "command": command
                    }
                },
                # "resource" sits at the top level of the submission context
                # and spells the core count "vCores".
                "resource": {
                    "memory": memory,
                    "vCores": vcores
                }
            }

            # Submissions are POSTed to the apps collection; json= serialises
            # the dict and sets the JSON content type.
            submit_job_url = f"{yarn_url}/ws/v1/cluster/apps"
            response = requests.post(submit_job_url, json=payload, timeout=30)

            if response.status_code == 202:
                print("Hadoop job submitted successfully.")
                return application_id
    except (requests.RequestException, ValueError, KeyError) as exc:
        print(f"Error while submitting Hadoop job: {exc}")

    print("Failed to submit Hadoop job.")
    return None

def track_resource_usage(application_id):
    """Poll an application every five seconds, printing its allocated resources.

    Reads the module-level ``yarn_url``. Polling stops once ``finalStatus``
    is ``SUCCEEDED``, ``FAILED`` or ``KILLED``.

    Args:
        application_id: Identifier of the application to watch.

    Returns:
        ``True`` when the final status is ``SUCCEEDED``, otherwise ``False``.
    """
    # Endpoint to get the status of a YARN application
    status_url = f"{yarn_url}/ws/v1/cluster/apps/{application_id}"

    # Track the resource usage of the Hadoop job until it is completed
    while True:
        # A timeout keeps one stalled request from freezing the loop.
        response = requests.get(status_url, timeout=30)
        if response.status_code == 200:
            app = response.json()['app']
            app_status = app['finalStatus']
            if app_status in ['SUCCEEDED', 'FAILED', 'KILLED']:
                break
            # Allocation is reported as flat fields on the app object, not
            # as a nested "allocatedResources" dict.
            allocated_vcores = app.get('allocatedVCores')
            allocated_memory = app.get('allocatedMB')
            print(f"Allocated vCores: {allocated_vcores}, Allocated Memory: {allocated_memory}")
        time.sleep(5)

    if app_status == 'SUCCEEDED':
        print("Hadoop job completed successfully.")
        return True
    else:
        print("Hadoop job failed or was killed.")
        return False

# Example usage: submit with explicit resources, then track allocation until
# the job ends. Paths are placeholders.
jar_path, input_path, output_path = (
    '/path/to/hadoop/job.jar',
    '/input/path',
    '/output/path',
)
vcores, memory = 2, 2048

application_id = submit_hadoop_job(jar_path, input_path, output_path, vcores, memory)
if application_id and track_resource_usage(application_id):
    print("Resource tracking completed.")


In [None]:
# 9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.
#Sol:
from mrjob.job import MRJob
from mrjob.step import MRStep
import time

class WordCountJob(MRJob):
    """Single-step word count over whitespace-separated, lower-cased words."""

    def configure_args(self):
        """Register the ``--split_size`` option (an integer, default 128)."""
        super().configure_args()
        self.add_passthru_arg(
            '--split_size',
            type=int,
            default=128,
            help='Input split size in MB',
        )

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for every word on the line."""
        tokens = line.strip().split()
        yield from ((token.lower(), 1) for token in tokens)

    def combiner(self, word, counts):
        """Pre-aggregate the counts of one word."""
        partial_total = sum(counts)
        yield word, partial_total

    def reducer(self, word, counts):
        """Emit the total count of one word."""
        total = sum(counts)
        yield word, total

    def steps(self):
        """Wire mapper, combiner and reducer into the job's only step."""
        word_count_step = MRStep(
            mapper=self.mapper,
            combiner=self.combiner,
            reducer=self.reducer,
        )
        return [word_count_step]

if __name__ == '__main__':
    # Split sizes to compare, in MB.
    split_sizes = [128, 256, 512]

    for split_size in split_sizes:
        split_bytes = split_size * 1024 * 1024

        # --split_size alone is only a passthrough option of the job class.
        # The jobconf entries ask Hadoop for the requested split size.
        # NOTE(review): local/inline runners are expected to ignore these
        # properties; run on a Hadoop runner for a meaningful comparison.
        job = WordCountJob(args=[
            'large_text_file.txt',
            f'--split_size={split_size}',
            '--jobconf', f'mapreduce.input.fileinputformat.split.maxsize={split_bytes}',
            '--jobconf', f'mapreduce.input.fileinputformat.split.minsize={split_bytes}',
        ])

        # Start the clock per run so each figure covers only its own job,
        # not the time of the runs before it.
        start_time = time.perf_counter()
        with job.make_runner() as runner:
            runner.run()
        elapsed_time = time.perf_counter() - start_time

        print(f"Split Size: {split_size} MB, Elapsed Time: {elapsed_time:.2f} seconds")
