# Install Java

In [None]:
# Refresh the apt package index, install the OpenJDK 11 JDK without prompting (-y),
# then print the version of the `java` binary found on PATH as a sanity check.
!sudo apt-get update
!sudo apt-get install openjdk-11-jdk -y
!java -version

In [None]:
# Install HBase

In [None]:
!wget https://downloads.apache.org/hbase/2.5.10/hbase-2.5.10-bin.tar.gz
!tar xvf hbase-2.5.10-bin.tar.gz

In [None]:
!sudo mv hbase-2.5.10 /opt/hbase

In [2]:
# Update HBase environment configuration
# Edits /opt/hbase/conf/hbase-env.sh in place: any line matching the commented-out
# `# export JAVA_HOME=...` is replaced by an active export pointing at the
# OpenJDK 11 directory. Once the line is uncommented the pattern no longer matches,
# so running the cell again changes nothing.
!sed -i 's|# export JAVA_HOME=.*|export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64|' /opt/hbase/conf/hbase-env.sh

In [None]:
# Verify JAVA_HOME and set if necessary\n",
import os

# Check if JAVA_HOME is set, if not set it to the correct path\n",
java_home = '/usr/lib/jvm/java-11-openjdk-amd64'
if not os.path.exists(java_home):
    raise FileNotFoundError(f'Java path not found: {java_home}')

# Set JAVA_HOME and update PATH\n",
os.environ['JAVA_HOME'] = java_home
os.environ['PATH'] = java_home + '/bin:' + os.environ['PATH']

# Verify the java version to confirm setup\n",
!java -version

In [None]:
# Start HBase
import os

original_directory = os.getcwd()

os.chdir('/opt/hbase')
!bin/start-hbase.sh

os.chdir(original_directory)

In [None]:
# Show the output of the JDK's `jps` tool (running JVM processes) to check the result of the start cell.
# NOTE(review): presumably an HMaster entry should be listed here if HBase came up — confirm.
!jps

In [None]:
# Create HBase Table
hbase_commands = '''
create 'my_table', 'cf1'
list
exit
'''

# Pass the commands to the HBase shell
with open('create_table.txt', 'w') as file:
    file.write(hbase_commands)

!cat create_table.txt | /opt/hbase/bin/hbase shell

# Install Spark

In [1]:
import os

# Directory the following cells run in; they download and unpack Spark relative to
# the current working directory.
work_dir = '/home/emma/Documents/KTH/ID2221/ID2221/Task 2/'

# This absolute path is specific to one machine. Only switch to it when it exists;
# elsewhere keep the current directory instead of aborting with FileNotFoundError.
if os.path.isdir(work_dir):
    os.chdir(work_dir)
else:
    print(f"Working directory not found, staying in {os.getcwd()}: {work_dir}")

In [None]:
!wget https://dlcdn.apache.org/spark/spark-3.4.3/spark-3.4.3-bin-hadoop3.tgz

In [None]:
!tar xvf spark-3.4.3-bin-hadoop3.tgz

In [None]:
!sudo mv spark-3.4.3-bin-hadoop3 /opt/spark

In [None]:
import glob
import os
import sys

# Set the SPARK_HOME environment variable
spark_home = '/opt/spark'
os.environ['SPARK_HOME'] = spark_home

# Add Spark's bin directory to PATH exactly once, so re-running the cell does not
# keep appending the same entry (and a missing PATH does not raise KeyError).
spark_bin = os.path.join(spark_home, 'bin')
current_path = os.environ.get('PATH', '')
if spark_bin not in current_path.split(os.pathsep):
    os.environ['PATH'] = f"{current_path}{os.pathsep}{spark_bin}" if current_path else spark_bin

# Find all .zip files in the Spark python/lib directory (sorted for a stable order)
zip_files = sorted(glob.glob(os.path.join(spark_home, 'python', 'lib', '*.zip')))
new_pythonpath = os.pathsep.join(zip_files)

# Export PYTHONPATH for child processes: the archives first, then whatever was
# already there. Empty and duplicate entries are dropped; an empty entry would
# stand for the current directory.
inherited_entries = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry and entry not in zip_files
]
os.environ['PYTHONPATH'] = os.pathsep.join(zip_files + inherited_entries)

# PYTHONPATH is only read when an interpreter starts, so changing it here does not
# affect this already-running kernel. Extend sys.path as well so the archives are
# importable from the next cell onwards.
for archive in reversed(zip_files):
    if archive not in sys.path:
        sys.path.insert(0, archive)

# Print environment variables to verify
print("SPARK_HOME:", os.environ['SPARK_HOME'])
print("PYTHONPATH:", os.environ['PYTHONPATH'])


In [2]:
# `!source ~/.bashrc` was a no-op here: every `!` command runs in its own
# short-lived subshell, so whatever the file exports disappears when the command
# returns and never reaches this kernel. Variables have to be set through
# os.environ instead. Show what the kernel actually holds so a missing value is
# noticed before Spark is started.
import os

kernel_env = {name: os.environ.get(name) for name in ('JAVA_HOME', 'SPARK_HOME', 'PYTHONPATH')}
for name, value in kernel_env.items():
    print(f"{name}:", value if value else '<not set>')

In [None]:
from pyspark.sql import SparkSession

# Smoke test: obtain a session named 'PySparkTest', report its version, shut it down.
spark = (
    SparkSession.builder
    .appName('PySparkTest')
    .getOrCreate()
)

# Show which Spark version the session reports
print("Spark version:", spark.version)

# Release the session again; a separate one is created for the HBase read
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# HBase-related settings handed to the session builder, applied in this order.
# NOTE(review): whether the connector actually reads the last two keys is not
# visible from this notebook — confirm against the connector's documentation.
hbase_session_conf = [
    ('spark.hadoop.hbase.zookeeper.quorum', 'localhost'),
    ('spark.hadoop.hbase.master', 'localhost:16000'),
    ('spark.hadoop.hbase.spark.sql.hbase.connection', 'localhost:2181'),
    ('spark.hadoop.hbase.table', 'my_table'),
]

# Create (or reuse) the Spark session used for reading from HBase
session_builder = SparkSession.builder.appName('HBaseReader')
for conf_key, conf_value in hbase_session_conf:
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

In [None]:
# Read the HBase table
# NOTE(review): no cell in this notebook adds a connector jar for the
# 'org.apache.hadoop.hbase.spark' source or defines a column mapping — presumably
# both are needed for load() to succeed; confirm.
hbase_read_options = {
    'hbase.table': 'my_table',
    'hbase.zookeeper.quorum': 'localhost',
    'hbase.zookeeper.property.clientPort': '2181',
}

table_reader = spark.read.format('org.apache.hadoop.hbase.spark')
for option_name, option_value in hbase_read_options.items():
    table_reader = table_reader.option(option_name, option_value)
df = table_reader.load()

# Show the DataFrame
df.show()

In [None]:
import os

original_directory = os.getcwd()

os.chdir('/opt/hbase')
!bin/stop-hbase.sh

os.chdir(original_directory)

In [None]:
# Show the output of the JDK's `jps` tool (running JVM processes) to check the result of the stop cell.
# NOTE(review): presumably no HBase process should be listed any more — confirm.
!jps