# Install HDFS

In [None]:
!sudo apt-get update
!sudo apt-get install -y openjdk-11-jdk-headless

In [None]:
!readlink -f $(which java) | sed "s:bin/java::"

In [17]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

In [None]:
!sudo apt-get install ssh
!sudo apt-get install pdsh

In [None]:
!wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz

In [None]:
!tar -xzvf hadoop-3.3.6.tar.gz
!sudo mv hadoop-3.3.6 /usr/local/hadoop

In [17]:

hadoop_env_path = '/usr/local/hadoop/etc/hadoop/hadoop-env.sh'

with open(hadoop_env_path, 'a') as f:
    f.write(f'\nexport JAVA_HOME={os.environ["JAVA_HOME"]}\n')

In [None]:
!/usr/local/hadoop/bin/hadoop

In [3]:
# Configure core-site.xml
core_site_xml = """
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>
"""

with open('/usr/local/hadoop/etc/hadoop/core-site.xml', 'w') as file:
    file.write(core_site_xml)

# Configure hdfs-site.xml
hdfs_site_xml = """
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>
"""

with open('/usr/local/hadoop/etc/hadoop/hdfs-site.xml', 'w') as file:
    file.write(hdfs_site_xml)

In [20]:
!ssh localhost exit

In [None]:
!/usr/local/hadoop/bin/hdfs namenode -format

In [40]:
!/usr/local/hadoop/sbin/start-dfs.sh

Starting namenodes on [localhost]
Starting datanodes
Starting secondary namenodes [emma-Inspiron-3501]


In [41]:
!sudo jps

30897 NameNode
31524 Jps
31381 SecondaryNameNode
31110 DataNode
12599 SparkSubmit


In [25]:
!/usr/local/hadoop/bin/hdfs dfs -mkdir -p /user/OBIS

# Install Spark

In [None]:
!wget -v https://dlcdn.apache.org/spark/spark-3.4.3/spark-3.4.3-bin-hadoop3.tgz

In [None]:
!tar xvf spark-3.4.3-bin-hadoop3.tgz

In [None]:
!mv spark-3.4.3-bin-hadoop3 /opt/spark

In [None]:
!pip3 install findspark --break-system-packages

In [23]:
import os
import findspark

# Set up environment variables
os.environ["SPARK_HOME"] = "/opt/spark"
os.environ["HADOOP_HOME"] = "/usr/local/hadoop"
findspark.init()

# Add data to HDFS

In [42]:
!/usr/local/hadoop/bin/hadoop fs -ls /

Found 1 items
drwxr-xr-x   - emma supergroup          0 2024-10-23 18:07 /user


In [43]:
from pyspark.sql import SparkSession

spark = SparkSession.getActiveSession()

if spark is None:
    # No active session, create a new one
    spark = SparkSession.builder \
        .appName("OBIS") \
        .getOrCreate()
    print("New Spark session created.")
else:
    # An active session already exists
    print("Spark session is already running.")

New Spark session created.


In [28]:
# Path to the local CSV file
csv_file_path = "./obis_20230208.csv"

# Read the CSV file into a DataFrame
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

                                                                                

In [29]:
# Path to HDFS directory
hdfs_output_path = "hdfs://localhost:9000/user/OBIS/data"

# Write the DataFrame to HDFS
df.write.csv(hdfs_output_path, header=True)

24/10/23 18:16:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [44]:
# Define the HDFS directory path (adjust as necessary)
hdfs_directory_path = "hdfs://localhost:9000/user/OBIS/data"

# Read files from the HDFS directory into a DataFrame
try:
    df = spark.read.option("header", "true").csv(hdfs_directory_path)  # Change .csv to .parquet or .json as needed
    df.show(10)  # Display the DataFrame content
except Exception as e:
    print("Error reading from HDFS:", str(e))

                                                                                

+--------------------+--------------------+----------------+---------------+-------------+-------------+-------------+---------+--------------------+----------------------+--------------------+--------------------+-----+-----------------------------+----------+-------+-------+-------------+----------+-----+-----+------+--------+----------+-----------+---------+-------+----------------+-----------+------+--------+------------+------------+-------------+---------------+---------------------+---------+-----------+----------+---------+---------+--------------+------------+--------------------+-------------+-----------+---------------+---------------+------------+----------+---------+-----------+-----------------+------------------+----------+-----+--------+-----------+--------+-------+----------+------+--------------------+----------+-----+-------+----------+-----+--------+----+-------------------+--------+-------+------------+------------+---------------------+----------+-------------+---

# Get Stat about data

In [None]:
from pyspark.sql.functions import mean, stddev, countDistinct

# Compute statistics
species_count = df.select(countDistinct("scientificname")).alias("species_count")
mean_depth = df.select(mean("depth")).alias("mean_depth")
stddev_depth = df.select(stddev("depth")).alias("stddev_depth")

# Show results
species_count.show()
mean_depth.show()
stddev_depth.show()

In [23]:
species = species_count.collect()[0][0]
mean = mean_depth.collect()[0][0]
stddev = stddev_depth.collect()[0][0]

In [None]:
print(f"Species Count: {species}")
print(f"Mean Depth: {mean}")
print(f"Standard Deviation of Depth: {stddev}")

## Identification of biodiversity hotspots

In [32]:
from pyspark.sql.functions import col
cleaned_df = df.filter(col('decimalLatitude').isNotNull() & col('decimalLongitude').isNotNull() & col('scientificName').isNotNull())


In [33]:
from pyspark.sql.functions import col, countDistinct

grid_size = 1.0
species_richness_df = cleaned_df \
    .withColumn("lat_grid", (col("decimalLatitude") / grid_size).cast("int") * grid_size) \
    .withColumn("lon_grid", (col("decimalLongitude") / grid_size).cast("int") * grid_size) \
    .groupBy("lat_grid", "lon_grid") \
    .agg(countDistinct(col('scientificName')).alias('species_richness'))



In [34]:
pandas_df = species_richness_df.toPandas()

                                                                                

## Migratory phenomena

In [54]:
from pyspark.sql.functions import col
# Filter out rows with missing coordinates, species, year, or temperature data
filtered_df = df.filter(col('decimalLatitude').isNotNull() & 
                             col('decimalLongitude').isNotNull() & 
                             col('scientificName').isNotNull() & 
                             col('date_year').isNotNull() & 
                             col('sst').isNotNull())

In [58]:
filtered_df.select('sst').distinct().show()



+-----+
|  sst|
+-----+
|28.79|
|  8.5|
|13.87|
|17.42|
| 10.7|
|-1.77|
|26.63|
|28.37|
|19.06|
|20.64|
| 7.16|
|23.97|
|15.49|
| 20.5|
|11.83|
|12.85|
|26.25|
| 8.41|
| 2.43|
|18.96|
+-----+
only showing top 20 rows



                                                                                

In [60]:
# Classify regions based on sea surface temperature (sst)
def classify_temperature(sst):
    if sst is None:
        return 'unknown'  # Handle None values
    if sst < 10:
        return 'cold'
    elif 10 <= sst <= 25:
        return 'temperate'
    else:
        return 'warm'


from pyspark.sql.functions import udf, col
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType

filtered_df = filtered_df.withColumn(
    'sst', 
    F.when(F.col('sst').cast(DoubleType()).isNotNull(), F.col('sst').cast(DoubleType())).otherwise(None)
)

temperature_class_udf = udf(classify_temperature, StringType())
classified_df = filtered_df.withColumn('temperature_region', temperature_class_udf(col('sst')))


In [61]:
from pyspark.sql.functions import col, avg
# Calculate the average latitude and longitude for each species in each year
species_movement_df = classified_df.groupBy("scientificName", "date_year", "temperature_region") \
                                       .agg(avg(col('decimalLatitude')).alias('avg_latitude'),
                                            avg(col('decimalLongitude')).alias('avg_longitude'))

In [62]:
species_movement_pd = species_movement_df.toPandas()

                                                                                

# Output Database (MongoDB)

In [None]:
!pip3 install pymongo --break-system-packages

In [None]:
!pip3 install pandas --break-system-packages

In [11]:
from pymongo import MongoClient

# Connect to MongoDB
mongo_uri = "mongodb+srv://obis:obis-project@obis-results.cer3o.mongodb.net/?retryWrites=true&w=majority&appName=OBIS-results"
client = MongoClient(mongo_uri)

### Insert stats

In [None]:
db = client["biodiversity_db"]
collection = db["statistics"]

# Create a document with the statistics
stats_document = {
    "species_count": species,
    "mean_depth": mean,
    "stddev_depth": stddev
}

# Insert the document into the collection
collection.insert_one(stats_document)
print("Statistics stored in MongoDB successfully.")

In [None]:
client.close()

### Insert biodiversity hotspots

In [35]:
hotspots_data = pandas_df[['lat_grid', 'lon_grid', 'species_richness']].to_dict(orient='records')

In [36]:
from pymongo import MongoClient
client = MongoClient("mongodb+srv://obis:obis-project@obis-results.cer3o.mongodb.net/?retryWrites=true&w=majority&appName=OBIS-results")
db = client['biodiversity_db'] 
collection = db['biodiversity_hotspots']

# Insert hotspots data into MongoDB
collection.insert_many(hotspots_data)

InsertManyResult([ObjectId('67192caa9d2b8dbe6c0b26da'), ObjectId('67192caa9d2b8dbe6c0b26db'), ObjectId('67192caa9d2b8dbe6c0b26dc'), ObjectId('67192caa9d2b8dbe6c0b26dd'), ObjectId('67192caa9d2b8dbe6c0b26de'), ObjectId('67192caa9d2b8dbe6c0b26df'), ObjectId('67192caa9d2b8dbe6c0b26e0'), ObjectId('67192caa9d2b8dbe6c0b26e1'), ObjectId('67192caa9d2b8dbe6c0b26e2'), ObjectId('67192caa9d2b8dbe6c0b26e3'), ObjectId('67192caa9d2b8dbe6c0b26e4'), ObjectId('67192caa9d2b8dbe6c0b26e5'), ObjectId('67192caa9d2b8dbe6c0b26e6'), ObjectId('67192caa9d2b8dbe6c0b26e7'), ObjectId('67192caa9d2b8dbe6c0b26e8'), ObjectId('67192caa9d2b8dbe6c0b26e9'), ObjectId('67192caa9d2b8dbe6c0b26ea'), ObjectId('67192caa9d2b8dbe6c0b26eb'), ObjectId('67192caa9d2b8dbe6c0b26ec'), ObjectId('67192caa9d2b8dbe6c0b26ed'), ObjectId('67192caa9d2b8dbe6c0b26ee'), ObjectId('67192caa9d2b8dbe6c0b26ef'), ObjectId('67192caa9d2b8dbe6c0b26f0'), ObjectId('67192caa9d2b8dbe6c0b26f1'), ObjectId('67192caa9d2b8dbe6c0b26f2'), ObjectId('67192caa9d2b8dbe6c0b26

In [37]:
client.close()

## Insert migration

In [63]:
migration_data = species_movement_pd.to_dict(orient='records')

In [None]:
from pymongo import MongoClient
client = MongoClient("mongodb+srv://obis:obis-project@obis-results.cer3o.mongodb.net/?retryWrites=true&w=majority&appName=OBIS-results")
db = client['biodiversity_db'] 
collection = db['migration']

# Insert hotspots data into MongoDB
collection.insert_many(migration_data)

In [None]:
client.close()

# Stop App

In [38]:
# Stop the Spark session
spark.stop()

In [1]:
!/usr/local/hadoop/sbin/stop-dfs.sh

Stopping namenodes on [localhost]
Stopping datanodes
Stopping secondary namenodes [emma-Inspiron-3501]
