# Install HDFS

In [None]:
# Refresh the apt package index, then install the headless OpenJDK 11 JDK.
# `-y` auto-confirms the install prompt.
!sudo apt-get update
!sudo apt-get install -y openjdk-11-jdk-headless

In [None]:
# Resolve the real path of the `java` binary found on PATH and strip the
# trailing "bin/java", printing the directory of the JDK installation.
# (Presumably used to look up the JAVA_HOME value set in the next cell.)
!readlink -f $(which java) | sed "s:bin/java::"

In [17]:
import os

# Location of the OpenJDK 11 install; exported so child processes launched
# from this kernel see it as JAVA_HOME.
JAVA_11_HOME = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["JAVA_HOME"] = JAVA_11_HOME

In [None]:
# Install the ssh and pdsh packages.
# `-y` auto-confirms the install prompt (as the JDK install above already
# does); without it apt-get waits for a Y/n answer the notebook cannot give.
!sudo apt-get install -y ssh
!sudo apt-get install -y pdsh

In [None]:
# Download the Hadoop 3.3.6 binary tarball into the current directory.
# `-c` resumes a partial download and skips the transfer when the file is
# already complete, so re-running the cell does not leave a duplicate
# `hadoop-3.3.6.tar.gz.1` next to the original.
!wget -c https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz

In [None]:
# Unpack the downloaded tarball (verbose: lists every extracted file), then
# move the extracted directory to /usr/local/hadoop, the path every later
# cell uses for the Hadoop binaries and configuration.
# NOTE(review): if /usr/local/hadoop already exists, `mv` places the directory
# inside it instead of replacing it — intended to be run once.
!tar -xzvf hadoop-3.3.6.tar.gz
!sudo mv hadoop-3.3.6 /usr/local/hadoop

In [17]:

# Record JAVA_HOME in Hadoop's own environment script, using the value
# already exported in this kernel's environment (requires `os` to be imported
# and JAVA_HOME to be set by an earlier cell).
hadoop_env_path = '/usr/local/hadoop/etc/hadoop/hadoop-env.sh'
java_home_export = f'export JAVA_HOME={os.environ["JAVA_HOME"]}'

# 'a+' keeps the original append semantics (writes always land at the end of
# the file, and the file is created if missing) while also allowing a read,
# so the export is only added when that exact line is not already present.
# This makes re-running the cell a no-op instead of stacking duplicate lines.
with open(hadoop_env_path, 'a+') as f:
    f.seek(0)
    if java_home_export not in f.read().splitlines():
        f.write(f'\n{java_home_export}\n')

In [None]:
!/usr/local/hadoop/bin/hadoop

In [3]:
# Minimal single-node Hadoop configuration, written over the stock files.

# core-site.xml: default filesystem URI (hdfs://localhost:9000)
core_site_xml = """
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>
"""

# hdfs-site.xml: replication factor of 1
hdfs_site_xml = """
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>
"""

# Destination path -> file content; each file is truncated and rewritten.
site_configs = {
    '/usr/local/hadoop/etc/hadoop/core-site.xml': core_site_xml,
    '/usr/local/hadoop/etc/hadoop/hdfs-site.xml': hdfs_site_xml,
}

for conf_path, conf_body in site_configs.items():
    with open(conf_path, 'w') as file:
        file.write(conf_body)

In [21]:
# Check that this user can open an SSH session to localhost without typing
# anything. BatchMode disables password and host-key prompts, so the cell
# fails fast with an error instead of waiting on input the notebook cannot
# supply; accept-new records localhost's host key on first contact but still
# refuses a changed key.
# If this reports "Permission denied", set up a passphrase-less key for
# localhost (ssh-keygen, then append the public key to ~/.ssh/authorized_keys)
# and run the cell again.
!ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new localhost exit

In [None]:
# Format the NameNode storage directory for a fresh HDFS instance.
# `-nonInteractive` makes the command abort when the name directory already
# exists instead of asking "Re-format filesystem? (Y or N)", so re-running the
# cell neither hangs on the prompt nor wipes an existing filesystem.
!/usr/local/hadoop/bin/hdfs namenode -format -nonInteractive

In [4]:
# Start the HDFS daemons; per the captured output below this launches the
# NameNode, the DataNode and the SecondaryNameNode.
!/usr/local/hadoop/sbin/start-dfs.sh

Starting namenodes on [localhost]
Starting datanodes
Starting secondary namenodes [emma-Inspiron-3501]


In [5]:
# List running JVM processes to confirm the HDFS daemons are up
# (expect NameNode, DataNode and SecondaryNameNode in the output).
!sudo jps

58391 SecondaryNameNode
58616 Jps
54456 SparkSubmit
57931 NameNode
58140 DataNode


In [6]:
# Create the /user/OBIS directory in HDFS; `-p` also creates missing parents
# and does not fail if the directory already exists.
!/usr/local/hadoop/bin/hdfs dfs -mkdir -p /user/OBIS

# Install Spark

In [None]:
# Download the Spark 3.4.3 build for Hadoop 3 into the current directory.
# The URL points at archive.apache.org, which keeps every past release; the
# dlcdn.apache.org mirror only carries current releases, so a link to a
# specific older version stops working once it is superseded.
# `-c` resumes a partial download and skips the transfer when the file is
# already complete.
!wget -v -c https://archive.apache.org/dist/spark/spark-3.4.3/spark-3.4.3-bin-hadoop3.tgz

In [None]:
# Unpack the Spark tarball into the current directory (verbose: lists every
# extracted file).
!tar xvf spark-3.4.3-bin-hadoop3.tgz

In [None]:
# Move the extracted Spark directory to /opt/spark, the path used for
# SPARK_HOME later in the notebook. `sudo` matches the Hadoop move above:
# /opt is normally writable only by root, so a plain `mv` fails with
# "Permission denied" for a regular user.
!sudo mv spark-3.4.3-bin-hadoop3 /opt/spark

In [None]:
# Install findspark into the environment of the running kernel.
# `%pip` targets the kernel's own interpreter, whereas `!pip3` runs whichever
# pip3 is first on PATH and may install into a different Python. The version
# is pinned so a re-run installs the same package.
%pip install findspark==2.0.1 --break-system-packages

In [15]:
import os
import findspark

# Export the Spark and Hadoop install locations, then let findspark
# initialise using that environment.
os.environ.update(
    SPARK_HOME="/opt/spark",
    HADOOP_HOME="/usr/local/hadoop",
)
findspark.init()

# Add data to HDFS

In [10]:
# List the root of the HDFS filesystem to confirm it is reachable and that
# the /user directory created earlier is present.
!/usr/local/hadoop/bin/hadoop fs -ls /

Found 1 items
drwxr-xr-x   - emma supergroup          0 2024-10-22 23:34 /user


In [18]:
from pyspark.sql import SparkSession

# Configure the application name, then obtain the session (an already
# running session is reused rather than recreated).
session_builder = SparkSession.builder.appName("Read HDFS Directory")
spark = session_builder.getOrCreate()

24/10/22 23:40:48 WARN Utils: Your hostname, emma-Inspiron-3501 resolves to a loopback address: 127.0.1.1; using 192.168.10.220 instead (on interface wlp0s20f3)
24/10/22 23:40:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/22 23:40:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [19]:
# Local CSV file to load (relative path)
csv_file_path = "./obis_20230208.csv"

# Load it as a DataFrame: first row is the header, column types are inferred
df = spark.read.options(header=True, inferSchema=True).csv(csv_file_path)

                                                                                

In [22]:
# Target directory in HDFS (same host and port as the fs.defaultFS value
# written to core-site.xml)
hdfs_output_path = "hdfs://localhost:9000/user/OBIS/data"

# Write the DataFrame to HDFS as CSV with a header row.
# mode="overwrite" replaces the output of a previous run; with the default
# save mode the write raises an error as soon as the directory already
# exists, so the cell could only ever be run once.
df.write.csv(hdfs_output_path, header=True, mode="overwrite")

                                                                                

In [23]:
# HDFS directory to read back (adjust as necessary); this is the same path
# the previous cell wrote to
hdfs_directory_path = "hdfs://localhost:9000/user/OBIS/data"

# Load every file in that directory into a DataFrame and preview it.
# Swap .csv for .parquet or .json if the data is stored in another format.
# Any failure is reported as a message rather than a traceback.
try:
    df = spark.read.csv(hdfs_directory_path, header=True)
    df.show()  # prints the first rows of the DataFrame
except Exception as err:
    print(f"Error reading from HDFS: {err}")

                                                                                

+--------------------+--------------------+----------------+---------------+-------------+-------------+-------------+---------+--------------------+----------------------+--------------------+--------------------+-------+-----------------------------+--------------------+-------+-------+-------------+----------+-----+-----+------+--------+----------+-----------+---------+-------+----------------+-----------+------+---------+------------+------------+-------------+---------------+---------------------+----------+-------------+------------+--------------+---------+--------------+-----------------+--------------------+-------------+-----------+---------------+---------------+------------+----------+---------+-----------+-----------------+------------------+----------+-----+--------+------------+--------+-------+----------+------+--------------------+----------+-----+-------+----------+-----+--------+----+--------------------+--------+-------+------------+------------+--------------------

# Stop Spark and HDFS

In [24]:
# Stop the Spark session created earlier in the notebook. Done before the
# HDFS daemons are shut down in the next cell.
spark.stop()

In [None]:
# Shut down the HDFS daemons started by start-dfs.sh.
!/usr/local/hadoop/sbin/stop-dfs.sh