# Data processing

In [1]:
import os

# Point Spark, Hadoop and the JVM at their local installation directories.
# These must be in the environment before findspark / pyspark are initialised.
os.environ.update(
    {
        "SPARK_HOME": "/opt/spark",
        "HADOOP_HOME": "/usr/local/hadoop",
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    }
)

In [2]:
# Start the local HDFS daemons (namenode, datanodes, secondary namenode).
!/usr/local/hadoop/sbin/start-dfs.sh

Starting namenodes on [localhost]
Starting datanodes
Starting secondary namenodes [emma-Inspiron-3501]


In [6]:
# List /user/OBIS in HDFS to confirm the input data directory is present.
!/usr/local/hadoop/bin/hadoop fs -ls /user/OBIS

Found 1 items
drwxr-xr-x   - emma supergroup          0 2024-10-23 18:56 /user/OBIS/data


In [7]:
import findspark

# Locate the Spark installation so that `pyspark` can be imported in the next cell.
# Called without arguments, so it relies on the SPARK_HOME value set earlier.
findspark.init()

In [8]:
from pyspark.sql import SparkSession

# Reuse the session attached to this kernel when there is one; otherwise start
# a fresh session for the "OBIS" application.
spark = SparkSession.getActiveSession()

if spark is not None:
    print("Spark session is already running.")
else:
    spark = SparkSession.builder.appName("OBIS").getOrCreate()
    print("New Spark session created.")

24/10/25 15:06:28 WARN Utils: Your hostname, emma-Inspiron-3501 resolves to a loopback address: 127.0.1.1; using 192.168.247.102 instead (on interface wlp0s20f3)
24/10/25 15:06:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/25 15:06:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


New Spark session created.


In [9]:
hdfs_directory_path = "hdfs://localhost:9000/user/OBIS/data"

# Load every CSV file under the HDFS directory, using the first row of each file
# as column names. No schema is supplied or inferred here, so numeric columns
# are cast where they are used later on.
try:
    df = spark.read.option("header", "true").csv(hdfs_directory_path)
except Exception as e:
    # Report the failure, then re-raise. Swallowing it would leave `df`
    # undefined and make every later cell fail with an unrelated NameError.
    print("Error reading from HDFS:", str(e))
    raise

                                                                                

### Getting some stats

In [21]:
from pyspark.sql.functions import mean, stddev, countDistinct, count

# Number of occurrence records for each scientific name.
species_count_per_species = (
    df
    .groupBy("scientificname")
    .count()
    .withColumnRenamed("count", "count_per_species")
)
# Preview the first ten rows of the result.
species_count_per_species.show(10)



+--------------------+-----------------+
|      scientificname|count_per_species|
+--------------------+-----------------+
|  Rhina ancylostomus|              179|
|Argyripnus ephipp...|               34|
|    Photis parvidons|               25|
|Diastyloides bace...|               18|
|Peraclistus oophagus|                3|
|          Actinonema|              526|
|          Fistularia|              172|
|Spongicola andama...|               51|
|Callyspongia (Cla...|                1|
|Hydrobia substria...|                1|
+--------------------+-----------------+
only showing top 10 rows



                                                                                

### Biodiversity hotspots

In [None]:
from pyspark.sql.functions import col
# Keep only occurrences that have a latitude, a longitude and a species name.
has_latitude = col('decimalLatitude').isNotNull()
has_longitude = col('decimalLongitude').isNotNull()
has_species = col('scientificName').isNotNull()
cleaned_df = df.filter(has_latitude & has_longitude & has_species)

In [None]:
from pyspark.sql.functions import col, countDistinct, floor

# Side length of a grid cell, in the same unit as the coordinate columns.
grid_size = 1.0


def _grid_origin(column_name):
    """Return the lower edge of the grid cell containing the coordinate in `column_name`.

    `floor` is used instead of an integer cast: a cast truncates toward zero,
    which merges the cells on both sides of 0 into a single double-width cell
    and shifts every negative coordinate one cell toward zero.
    """
    return floor(col(column_name).cast("double") / grid_size) * grid_size


# Count the distinct species observed inside each grid cell.
species_richness_df = (
    cleaned_df
    .withColumn("lat_grid", _grid_origin("decimalLatitude"))
    .withColumn("lon_grid", _grid_origin("decimalLongitude"))
    .groupBy("lat_grid", "lon_grid")
    .agg(countDistinct(col('scientificName')).alias('species_richness'))
)

In [None]:
# Collect the per-cell species richness to the driver as a pandas DataFrame.
# The full result is held in local memory from here on.
pandas_df = species_richness_df.toPandas()

### Migratory phenomena

In [None]:
from pyspark.sql.functions import col
# Drop occurrences that lack a coordinate, a species name, an observation year
# or a sea surface temperature (sst) value.
filtered_df = (
    df
    .filter(col('decimalLatitude').isNotNull())
    .filter(col('decimalLongitude').isNotNull())
    .filter(col('scientificName').isNotNull())
    .filter(col('date_year').isNotNull())
    .filter(col('sst').isNotNull())
)

In [None]:
# Classify regions based on sea surface temperature (sst)
def classify_temperature(sst):
    """Map a sea surface temperature to a coarse region label.

    Args:
        sst: Temperature as a number, or None when the value is missing.
            NOTE(review): the thresholds suggest degrees Celsius; confirm
            against the dataset documentation.

    Returns:
        'unknown' for None or NaN, 'cold' below 10, 'temperate' from 10 to 25
        inclusive, and 'warm' above 25.
    """
    # NaN is the only value that is not equal to itself. Without this check a
    # NaN fails every comparison below and would be labelled 'warm'.
    if sst is None or sst != sst:
        return 'unknown'
    if sst < 10:
        return 'cold'
    if sst <= 25:
        return 'temperate'
    return 'warm'


from pyspark.sql.functions import udf, col
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType

# Convert sst to a double so the classifier receives numbers rather than text.
sst_as_double = F.col('sst').cast(DoubleType())
filtered_df = filtered_df.withColumn('sst', sst_as_double)

# Wrap the Python classifier as a Spark UDF and label every row with its region.
temperature_class_udf = udf(classify_temperature, StringType())
region_label = temperature_class_udf(col('sst'))
classified_df = filtered_df.withColumn('temperature_region', region_label)

In [None]:
from pyspark.sql.functions import col, avg
# Mean position of each species, per observation year and temperature region.
mean_latitude = avg(col('decimalLatitude')).alias('avg_latitude')
mean_longitude = avg(col('decimalLongitude')).alias('avg_longitude')
species_movement_df = (
    classified_df
    .groupBy("scientificName", "date_year", "temperature_region")
    .agg(mean_latitude, mean_longitude)
)

In [None]:
# Collect the yearly mean positions to the driver as a pandas DataFrame.
species_movement_pd = species_movement_df.toPandas()

### Shannon indices

In [12]:
from pyspark.sql import functions as F

# Keep only occurrences that carry both a species name and an observation year.
has_species = F.col("scientificName").isNotNull()
has_year = F.col("date_year").isNotNull()
valid_data_df = df.filter(has_species & has_year)

# Number of occurrence records of each species within each year.
species_counts_per_year = (
    valid_data_df
    .groupBy("date_year", "scientificName")
    .agg(F.count("*").alias("species_count"))
)

# Total number of occurrence records within each year.
total_counts_per_year = (
    species_counts_per_year
    .groupBy("date_year")
    .agg(F.sum("species_count").alias("total_count_per_year"))
)

# p_i: share of the year's records that belong to species i.
proportion_expr = F.col("species_count") / F.col("total_count_per_year")
species_with_proportion = (
    species_counts_per_year
    .join(total_counts_per_year, on="date_year")
    .withColumn("proportion", proportion_expr)
)

# Per-species Shannon term: -p_i * ln(p_i).
shannon_term_expr = -F.col("proportion") * F.log(F.col("proportion"))
species_with_shannon_component = species_with_proportion.withColumn(
    "shannon_component", shannon_term_expr
)

# Shannon index of a year = sum of that year's per-species terms.
shannon_indices = (
    species_with_shannon_component
    .groupBy("date_year")
    .agg(F.sum("shannon_component").alias("shannon_index"))
)


# Store output data in the database (MongoDB)

### Stats

In [25]:
# Collect the per-species counts to the driver and convert them to a list of
# one dict per row, ready to be passed to insert_many.
species_count_dict = species_count_per_species.toPandas().to_dict(orient='records')

                                                                                

In [26]:
import os

from pymongo import MongoClient

# Connect to MongoDB.
# The connection string embeds a username and password, so it is read from the
# environment instead of being committed with the notebook.
# NOTE(review): the password that used to be hardcoded here should be rotated.
mongo_uri = os.environ.get("OBIS_MONGO_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the OBIS_MONGO_URI environment variable to the MongoDB connection string."
    )
client = MongoClient(mongo_uri)

db = client["biodiversity_db"]
collection = db["statistics"]

# Insert one document per species (name and occurrence count).
# Re-running this cell inserts the same documents again.
collection.insert_many(species_count_dict)
print("Statistics stored in MongoDB successfully.")

Statistics stored in MongoDB successfully.


In [27]:
# Release the MongoDB connection used to store the statistics.
client.close()

### Hotspots

In [None]:
# Select the grid coordinates and richness value, then convert each row to a dict.
hotspot_columns = ['lat_grid', 'lon_grid', 'species_richness']
hotspots_data = pandas_df.loc[:, hotspot_columns].to_dict(orient='records')

In [None]:
import os

from pymongo import MongoClient

# The connection string embeds a username and password, so it is read from the
# environment instead of being committed with the notebook.
# NOTE(review): the password that used to be hardcoded here should be rotated.
mongo_uri = os.environ.get("OBIS_MONGO_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the OBIS_MONGO_URI environment variable to the MongoDB connection string."
    )
client = MongoClient(mongo_uri)
db = client['biodiversity_db']
collection = db['biodiversity_hotspots']

# Insert hotspots data into MongoDB (one document per grid cell).
# Re-running this cell inserts the same documents again.
collection.insert_many(hotspots_data)

In [None]:
# Release the MongoDB connection used to store the hotspots.
client.close()

### Migration

In [None]:
# Convert the yearly mean positions to a list of one dict per row, ready to be
# passed to insert_many.
migration_data = species_movement_pd.to_dict(orient='records')

In [None]:
import os

from pymongo import MongoClient

# The connection string embeds a username and password, so it is read from the
# environment instead of being committed with the notebook.
# NOTE(review): the password that used to be hardcoded here should be rotated.
mongo_uri = os.environ.get("OBIS_MONGO_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the OBIS_MONGO_URI environment variable to the MongoDB connection string."
    )
client = MongoClient(mongo_uri)
db = client['biodiversity_db']
collection = db['migration']

# Insert migration data into MongoDB (one document per species, year and region).
# Re-running this cell inserts the same documents again.
collection.insert_many(migration_data)

In [None]:
# Release the MongoDB connection used to store the migration data.
client.close()

### Shannon indices

In [15]:
# Collect the yearly Shannon indices to the driver and convert them to a list
# of one dict per row, ready to be passed to insert_many.
shannon_data = shannon_indices.toPandas().to_dict(orient='records')

                                                                                

In [16]:
import os

from pymongo import MongoClient

# Connect to MongoDB.
# The connection string embeds a username and password, so it is read from the
# environment instead of being committed with the notebook.
# NOTE(review): the password that used to be hardcoded here should be rotated.
mongo_uri = os.environ.get("OBIS_MONGO_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the OBIS_MONGO_URI environment variable to the MongoDB connection string."
    )
client = MongoClient(mongo_uri)
db = client['biodiversity_db']
collection = db['Shannon']

# Insert Shannon indices data into MongoDB (one document per year).
# Re-running this cell inserts the same documents again.
collection.insert_many(shannon_data)

InsertManyResult([ObjectId('671b9e7d069a19deb30240f5'), ObjectId('671b9e7d069a19deb30240f6'), ObjectId('671b9e7d069a19deb30240f7'), ObjectId('671b9e7d069a19deb30240f8'), ObjectId('671b9e7d069a19deb30240f9'), ObjectId('671b9e7d069a19deb30240fa'), ObjectId('671b9e7d069a19deb30240fb'), ObjectId('671b9e7d069a19deb30240fc'), ObjectId('671b9e7d069a19deb30240fd'), ObjectId('671b9e7d069a19deb30240fe'), ObjectId('671b9e7d069a19deb30240ff'), ObjectId('671b9e7d069a19deb3024100'), ObjectId('671b9e7d069a19deb3024101'), ObjectId('671b9e7d069a19deb3024102'), ObjectId('671b9e7d069a19deb3024103'), ObjectId('671b9e7d069a19deb3024104'), ObjectId('671b9e7d069a19deb3024105'), ObjectId('671b9e7d069a19deb3024106'), ObjectId('671b9e7d069a19deb3024107'), ObjectId('671b9e7d069a19deb3024108'), ObjectId('671b9e7d069a19deb3024109'), ObjectId('671b9e7d069a19deb302410a'), ObjectId('671b9e7d069a19deb302410b'), ObjectId('671b9e7d069a19deb302410c'), ObjectId('671b9e7d069a19deb302410d'), ObjectId('671b9e7d069a19deb30241

In [17]:
# Release the MongoDB connection used to store the Shannon indices.
client.close()

# Stop app

In [None]:
# Shut down the Spark session; DataFrames created from it cannot be evaluated afterwards.
spark.stop()

In [5]:
# Stop the local HDFS daemons (namenode, datanodes, secondary namenode).
!/usr/local/hadoop/sbin/stop-dfs.sh

Stopping namenodes on [localhost]
Stopping datanodes
Stopping secondary namenodes [emma-Inspiron-3501]
