# Install HDFS

In [None]:
# Refresh the apt package index, then install the headless OpenJDK 11 JDK.
# -y answers apt's confirmation prompt automatically.
!sudo apt-get update
!sudo apt-get install -y openjdk-11-jdk-headless

In [None]:
# Resolve the symlink chain of the `java` executable found on PATH and remove
# the first "bin/java" from the result, printing the JDK directory.
# The printed path is the candidate value for JAVA_HOME.
!readlink -f $(which java) | sed "s:bin/java::"

In [1]:
import os
# Record the JDK location in this process's environment under JAVA_HOME.
os.environ.update(JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64")

In [None]:
!sudo apt-get install ssh
!sudo apt-get install pdsh

In [None]:
# Download the Hadoop 3.3.6 binary tarball into the current working directory.
!wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz

In [None]:
# Unpack the tarball (-v lists every extracted file) and move the resulting
# directory to /usr/local/hadoop, the install path used by the rest of the notebook.
# NOTE(review): if /usr/local/hadoop already exists, mv places hadoop-3.3.6
# inside it instead of replacing it — this cell is not safe to re-run as-is.
!tar -xzvf hadoop-3.3.6.tar.gz
!sudo mv hadoop-3.3.6 /usr/local/hadoop

In [17]:

# Make Hadoop's own scripts use the same JDK as this kernel by exporting
# JAVA_HOME from hadoop-env.sh.
hadoop_env_path = '/usr/local/hadoop/etc/hadoop/hadoop-env.sh'
java_home_export = f'export JAVA_HOME={os.environ["JAVA_HOME"]}'

# Only append when the exact export line is not present yet, so re-running this
# cell does not keep stacking duplicate lines at the end of the file.
try:
    with open(hadoop_env_path) as f:
        already_exported = any(line.strip() == java_home_export for line in f)
except FileNotFoundError:
    # Append mode below creates the file, exactly as the unconditional append did.
    already_exported = False

if not already_exported:
    with open(hadoop_env_path, 'a') as f:
        f.write(f'\n{java_home_export}\n')

In [None]:
# Invoke the hadoop launcher with no arguments.
# NOTE(review): presumably a smoke test that the install and JAVA_HOME are
# usable — confirm this is the intent.
!/usr/local/hadoop/bin/hadoop

In [3]:
# Single-node HDFS configuration.
# core-site.xml: the default filesystem is the HDFS instance at localhost:9000.
core_site_xml = """
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>
"""

# hdfs-site.xml: dfs.replication is set to 1.
hdfs_site_xml = """
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>
"""

# Overwrite both configuration files with the contents defined above.
for config_path, config_body in (
    ('/usr/local/hadoop/etc/hadoop/core-site.xml', core_site_xml),
    ('/usr/local/hadoop/etc/hadoop/hdfs-site.xml', hdfs_site_xml),
):
    with open(config_path, 'w') as file:
        file.write(config_body)

In [21]:
# Open an SSH connection to localhost and exit immediately.
# NOTE(review): presumably verifies that SSH to localhost works before the
# start-dfs.sh script is run — confirm; no key setup is shown in this notebook.
!ssh localhost exit

In [None]:
!/usr/local/hadoop/bin/hdfs namenode -format

In [5]:
# Start the HDFS daemons; the recorded output shows the namenode, datanodes
# and secondary namenode being started.
!/usr/local/hadoop/sbin/start-dfs.sh

Starting namenodes on [localhost]
Starting datanodes
Starting secondary namenodes [emma-Inspiron-3501]


In [6]:
# List running Java processes to check the daemons are up; the recorded output
# includes NameNode, DataNode and SecondaryNameNode.
!sudo jps

113157 SecondaryNameNode
112679 NameNode
113286 Jps
54456 SparkSubmit
112895 DataNode


In [6]:
# Create the /user/OBIS directory in HDFS; -p also creates missing parents and
# does not fail when the directory already exists.
!/usr/local/hadoop/bin/hdfs dfs -mkdir -p /user/OBIS

# Install Spark

In [None]:
# Download the Spark 3.4.3 build for Hadoop 3 into the current working directory.
# NOTE(review): confirm this release is still served from dlcdn.apache.org;
# the URL is pinned to an exact version.
!wget -v https://dlcdn.apache.org/spark/spark-3.4.3/spark-3.4.3-bin-hadoop3.tgz

In [None]:
# Unpack the Spark tarball into the current directory (v lists every file).
!tar xvf spark-3.4.3-bin-hadoop3.tgz

In [None]:
!mv spark-3.4.3-bin-hadoop3 /opt/spark

In [None]:
# Install findspark with the system pip3.
# --break-system-packages overrides pip's refusal to modify an externally
# managed system Python environment.
# NOTE(review): the version is not pinned, and pip3 may not target the
# kernel's own interpreter — confirm.
!pip3 install findspark --break-system-packages

In [4]:
import os
import findspark

# Publish the Spark and Hadoop install locations through the environment,
# then initialise findspark.
os.environ.update({
    "SPARK_HOME": "/opt/spark",
    "HADOOP_HOME": "/usr/local/hadoop",
})
findspark.init()

# Add data to HDFS

In [None]:
# List the contents of the HDFS root directory.
!/usr/local/hadoop/bin/hadoop fs -ls /

In [None]:
from pyspark.sql import SparkSession

# Reuse the session already active in this kernel, if any; otherwise build a
# local one that uses all cores with 4g for both driver and executor.
spark = SparkSession.getActiveSession()

if spark is not None:
    # An active session already exists
    print("Spark session is already running.")
else:
    # No active session, create a new one
    spark = (
        SparkSession.builder
        .appName("OBIS")
        .master("local[*]")
        .config("spark.driver.memory", "4g")
        .config("spark.executor.memory", "4g")
        .getOrCreate()
    )
    print("New Spark session created.")

In [None]:
# Local OBIS export, relative to the notebook's working directory.
csv_file_path = "./obis_20230208.csv"

# Load it as a DataFrame: the first line supplies the column names and the
# column types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv(csv_file_path)

In [None]:
# Destination directory inside HDFS.
hdfs_output_path = "hdfs://localhost:9000/user/OBIS/data"

# Persist the DataFrame there as CSV, with a header row in each output file.
# The writer's default save mode is kept, so an existing output directory is
# not overwritten.
df.write.option("header", True).csv(hdfs_output_path)

In [None]:
# Define the HDFS directory path (adjust as necessary)
hdfs_directory_path = "hdfs://localhost:9000/user/OBIS/data"

# Read files from the HDFS directory into a DataFrame
try:
    df = spark.read.option("header", "true").csv(hdfs_directory_path)  # Change .csv to .parquet or .json as needed
    df.show()  # Display the DataFrame content
except Exception as e:
    print("Error reading from HDFS:", str(e))
    # Re-raise after reporting: if the failure were swallowed, `df` would still
    # point at the DataFrame read from the local CSV and every later cell would
    # run silently on data that did not come from HDFS.
    raise

# Compute statistics about the data

In [None]:
from pyspark.sql.functions import mean, stddev, countDistinct

# Compute statistics
# Each result is a one-row, one-column DataFrame. The alias is applied to the
# aggregate *column* (inside select) so the output column carries the intended
# name; calling .alias() on the DataFrame only names the DataFrame and leaves
# the column with its auto-generated name.
species_count = df.select(countDistinct("scientificname").alias("species_count"))
mean_depth = df.select(mean("depth").alias("mean_depth"))
stddev_depth = df.select(stddev("depth").alias("stddev_depth"))

# Show results
species_count.show()
mean_depth.show()
stddev_depth.show()

In [23]:
# Unpack the single value held by each one-row, one-column result DataFrame.
# Note: `mean` and `stddev` rebind the names imported from
# pyspark.sql.functions; from here on they hold plain values, not functions.
(species,) = species_count.collect()[0]
(mean,) = mean_depth.collect()[0]
(stddev,) = stddev_depth.collect()[0]

In [None]:
# Report the three collected statistics, one per line.
print("Species Count:", species)
print("Mean Depth:", mean)
print("Standard Deviation of Depth:", stddev)

# Map the spatial distribution of species

In [22]:
from pyspark.sql import functions as F
from pyspark.sql.functions import count  # used by the grouping cell that follows

# Round latitude and longitude to 2 decimal places for spatial resolution.
# Spark's round is reached through the F namespace instead of being imported by
# name, so Python's built-in round() is not shadowed for the rest of the notebook.
df = df.withColumn('latitude_rounded', F.round(df['decimalLatitude'], 2)) \
       .withColumn('longitude_rounded', F.round(df['decimalLongitude'], 2))


In [23]:
# Count the rows for each (species, rounded latitude, rounded longitude)
# combination; the count is exposed as `observation_count`.
df_grouped = (
    df.groupBy(['scientificName', 'latitude_rounded', 'longitude_rounded'])
      .agg(count('*').alias('observation_count'))
)

In [None]:
# Preview the first 10 grouped rows.
df_grouped.show(n=10)

# Output Database (MongoDB)

In [None]:
# Install the MongoDB driver with the system pip3.
# --break-system-packages overrides pip's refusal to modify an externally
# managed system Python environment.
# NOTE(review): the version is not pinned — confirm.
!pip3 install pymongo --break-system-packages

In [None]:
# Install pandas with the system pip3 (same --break-system-packages override).
# NOTE(review): pandas is not imported anywhere in the visible cells — confirm
# it is still needed.
!pip3 install pandas --break-system-packages

In [14]:
from pymongo import MongoClient

import os

# Connect to MongoDB
# The connection string carries a username and password, so it must not live in
# the notebook (which gets shared and committed). It is read from the MONGO_URI
# environment variable instead, e.g.
#   export MONGO_URI='mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority'
# Any password that was previously stored in this cell should be rotated.
mongo_uri = os.environ.get("MONGO_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGO_URI environment variable to the MongoDB connection string "
        "before running this cell."
    )
client = MongoClient(mongo_uri)

### Insert stats

In [None]:
# Target: the "statistics" collection of the "biodiversity_db" database.
db = client["biodiversity_db"]
collection = db["statistics"]

# One document holding the three values collected earlier in the notebook.
stats_document = dict(
    species_count=species,
    mean_depth=mean,
    stddev_depth=stddev,
)

# Store it and confirm.
collection.insert_one(stats_document)
print("Statistics stored in MongoDB successfully.")

### Insert spatial distribution

In [None]:
import json

# Serialise every grouped row to a JSON string and bring the strings back to
# the driver, then parse each one into a plain dict.
json_records = df_grouped.toJSON().collect()
records = list(map(json.loads, json_records))


In [None]:
collection = db['distribution_data']

# Insert the processed data into MongoDB.
# insert_many rejects an empty list, so an empty grouping result is reported
# instead of failing the cell with an unrelated-looking error.
if records:
    insert_result = collection.insert_many(records)
    print(f"Inserted {len(insert_result.inserted_ids)} distribution records.")
else:
    print("No distribution records to insert.")

# Stop App

In [None]:
# Stop the Spark session now that all results have been written out.
# `spark` is the session obtained in the SparkSession cell above.
spark.stop()

In [4]:
# Shut down the HDFS daemons; the recorded output shows the namenode, datanodes
# and secondary namenode being stopped.
!/usr/local/hadoop/sbin/stop-dfs.sh

Stopping namenodes on [localhost]
Stopping datanodes
Stopping secondary namenodes [emma-Inspiron-3501]
