# Install HDFS

In [None]:
!sudo apt-get update
!sudo apt-get install -y openjdk-11-jdk-headless

In [None]:
!readlink -f $(which java) | sed "s:bin/java::"

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

In [None]:
!sudo apt-get install ssh
!sudo apt-get install pdsh

In [None]:
!wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz

In [None]:
!tar -xzvf hadoop-3.3.6.tar.gz
!sudo mv hadoop-3.3.6 /usr/local/hadoop

In [17]:

hadoop_env_path = '/usr/local/hadoop/etc/hadoop/hadoop-env.sh'

with open(hadoop_env_path, 'a') as f:
    f.write(f'\nexport JAVA_HOME={os.environ["JAVA_HOME"]}\n')

In [None]:
!/usr/local/hadoop/bin/hadoop

In [3]:
# Configure core-site.xml
core_site_xml = """
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>
"""

with open('/usr/local/hadoop/etc/hadoop/core-site.xml', 'w') as file:
    file.write(core_site_xml)

# Configure hdfs-site.xml
hdfs_site_xml = """
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>
"""

with open('/usr/local/hadoop/etc/hadoop/hdfs-site.xml', 'w') as file:
    file.write(hdfs_site_xml)

In [21]:
!ssh localhost exit

In [None]:
!/usr/local/hadoop/bin/hdfs namenode -format

In [3]:
!/usr/local/hadoop/sbin/start-dfs.sh

Starting namenodes on [localhost]
Starting datanodes
Starting secondary namenodes [emma-Inspiron-3501]


In [4]:
!sudo jps

123968 NameNode
124448 SecondaryNameNode
124183 DataNode
124582 Jps
54456 SparkSubmit


In [6]:
!/usr/local/hadoop/bin/hdfs dfs -mkdir -p /user/OBIS

# Install Spark

In [None]:
!wget -v https://dlcdn.apache.org/spark/spark-3.4.3/spark-3.4.3-bin-hadoop3.tgz

In [None]:
!tar xvf spark-3.4.3-bin-hadoop3.tgz

In [None]:
!mv spark-3.4.3-bin-hadoop3 /opt/spark

In [None]:
!pip3 install findspark --break-system-packages

In [5]:
import os
import findspark

# Set up environment variables
os.environ["SPARK_HOME"] = "/opt/spark"
os.environ["HADOOP_HOME"] = "/usr/local/hadoop"
findspark.init()

# Add data to HDFS

In [6]:
!/usr/local/hadoop/bin/hadoop fs -ls /

Found 1 items
drwxr-xr-x   - emma supergroup          0 2024-10-22 23:34 /user


In [7]:
from pyspark.sql import SparkSession

spark = SparkSession.getActiveSession()

if spark is None:
    # No active session, create a new one
    spark = SparkSession.builder \
        .appName("OBIS") \
        .getOrCreate()
    print("New Spark session created.")
else:
    # An active session already exists
    print("Spark session is already running.")

24/10/23 13:09:34 WARN Utils: Your hostname, emma-Inspiron-3501 resolves to a loopback address: 127.0.1.1; using 192.168.10.220 instead (on interface wlp0s20f3)
24/10/23 13:09:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/23 13:09:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


New Spark session created.


In [None]:
# Path to the local CSV file
csv_file_path = "./obis_20230208.csv"

# Read the CSV file into a DataFrame
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

In [None]:
# Path to HDFS directory
hdfs_output_path = "hdfs://localhost:9000/user/OBIS/data"

# Write the DataFrame to HDFS
df.write.csv(hdfs_output_path, header=True)

In [8]:
# Define the HDFS directory path (adjust as necessary)
hdfs_directory_path = "hdfs://localhost:9000/user/OBIS/data"

# Read files from the HDFS directory into a DataFrame
try:
    df = spark.read.option("header", "true").csv(hdfs_directory_path)  # Change .csv to .parquet or .json as needed
    # df.show()  # Display the DataFrame content
except Exception as e:
    print("Error reading from HDFS:", str(e))

                                                                                

# Get Stat about data

In [None]:
from pyspark.sql.functions import mean, stddev, countDistinct

# Compute statistics
species_count = df.select(countDistinct("scientificname")).alias("species_count")
mean_depth = df.select(mean("depth")).alias("mean_depth")
stddev_depth = df.select(stddev("depth")).alias("stddev_depth")

# Show results
species_count.show()
mean_depth.show()
stddev_depth.show()

In [23]:
species = species_count.collect()[0][0]
mean = mean_depth.collect()[0][0]
stddev = stddev_depth.collect()[0][0]

In [None]:
print(f"Species Count: {species}")
print(f"Mean Depth: {mean}")
print(f"Standard Deviation of Depth: {stddev}")

# Identification of biodiversity hotspots

In [31]:
from pyspark.sql.functions import col
cleaned_df = df.filter(col('decimalLatitude').isNotNull() & col('decimalLongitude').isNotNull() & col('scientificName').isNotNull())


In [35]:
from pyspark.sql.functions import col, countDistinct

grid_size = 5.0
species_richness_df = cleaned_df \
    .withColumn("lat_grid", (col("decimalLatitude") / grid_size).cast("int") * grid_size) \
    .withColumn("lon_grid", (col("decimalLongitude") / grid_size).cast("int") * grid_size) \
    .groupBy("lat_grid", "lon_grid") \
    .agg(countDistinct(col('scientificName')).alias('species_richness'))



In [36]:
pandas_df = species_richness_df.toPandas()

                                                                                

# Output Database (MongoDB)

In [None]:
!pip3 install pymongo --break-system-packages

In [None]:
!pip3 install pandas --break-system-packages

In [11]:
from pymongo import MongoClient

# Connect to MongoDB
mongo_uri = "mongodb+srv://obis:obis-project@obis-results.cer3o.mongodb.net/?retryWrites=true&w=majority&appName=OBIS-results"
client = MongoClient(mongo_uri)

### Insert stats

In [None]:
db = client["biodiversity_db"]
collection = db["statistics"]

# Create a document with the statistics
stats_document = {
    "species_count": species,
    "mean_depth": mean,
    "stddev_depth": stddev
}

# Insert the document into the collection
collection.insert_one(stats_document)
print("Statistics stored in MongoDB successfully.")

In [None]:
client.close()

### Insert biodiversity hotspots

In [37]:
hotspots_data = pandas_df[['lat_grid', 'lon_grid', 'species_richness']].to_dict(orient='records')

In [38]:
from pymongo import MongoClient
client = MongoClient("mongodb+srv://obis:obis-project@obis-results.cer3o.mongodb.net/?retryWrites=true&w=majority&appName=OBIS-results")
db = client['biodiversity_db']  # Name your database
collection = db['biodiversity_hotspots']

# Insert hotspots data into MongoDB
collection.insert_many(hotspots_data)

InsertManyResult([ObjectId('6718fa3bb7508a56aa5ce363'), ObjectId('6718fa3bb7508a56aa5ce364'), ObjectId('6718fa3bb7508a56aa5ce365'), ObjectId('6718fa3bb7508a56aa5ce366'), ObjectId('6718fa3bb7508a56aa5ce367'), ObjectId('6718fa3bb7508a56aa5ce368'), ObjectId('6718fa3bb7508a56aa5ce369'), ObjectId('6718fa3bb7508a56aa5ce36a'), ObjectId('6718fa3bb7508a56aa5ce36b'), ObjectId('6718fa3bb7508a56aa5ce36c'), ObjectId('6718fa3bb7508a56aa5ce36d'), ObjectId('6718fa3bb7508a56aa5ce36e'), ObjectId('6718fa3bb7508a56aa5ce36f'), ObjectId('6718fa3bb7508a56aa5ce370'), ObjectId('6718fa3bb7508a56aa5ce371'), ObjectId('6718fa3bb7508a56aa5ce372'), ObjectId('6718fa3bb7508a56aa5ce373'), ObjectId('6718fa3bb7508a56aa5ce374'), ObjectId('6718fa3bb7508a56aa5ce375'), ObjectId('6718fa3bb7508a56aa5ce376'), ObjectId('6718fa3bb7508a56aa5ce377'), ObjectId('6718fa3bb7508a56aa5ce378'), ObjectId('6718fa3bb7508a56aa5ce379'), ObjectId('6718fa3bb7508a56aa5ce37a'), ObjectId('6718fa3bb7508a56aa5ce37b'), ObjectId('6718fa3bb7508a56aa5ce3

In [39]:
client.close()

# Stop App

In [None]:
# Stop the Spark session
spark.stop()

In [None]:
!/usr/local/hadoop/sbin/stop-dfs.sh