# Volcano Eruptions Dataset for MongoDB Assignment

### By Callum O'Brien, C21306503

In [None]:
import pandas as pd
from pymongo import MongoClient
import json
from bson.son import SON
import copy

# Load the raw eruptions CSV (comma-separated, UTF-8) into a DataFrame.
df = pd.read_csv("eruptions.csv", sep=",", encoding="UTF-8")

The column names in this dataset contain spaces, so they need to be replaced.
I replace the spaces here with underscores.

In [None]:
# Swap every space in a column name for an underscore, then show the
# resulting column names together with their dtypes.
df.columns = [name.replace(" ", "_") for name in df.columns]
print(df.dtypes)

One column is called Elevation_(m). I will rename this to just 'Elevation'.

In [None]:
# Drop the "(m)" unit suffix from the elevation column name.
df = df.rename({"Elevation_(m)": "Elevation"}, axis="columns")
print(df.columns)

Next I clean the data, looking for null/N/A values.

In [None]:
# Print the number of missing values in each column, one column per line.
for column in (
    "Volcano_Number",
    "Volcano_Name",
    "Country",
    "Primary_Volcano_Type",
    "Activity_Evidence",
    "Last_Known_Eruption",
    "Region",
    "Subregion",
    "Latitude",
    "Longitude",
    "Elevation",
    "Dominant_Rock_Type",
    "Tectonic_Setting",
):
    print(column, df[column].isnull().sum())

In [None]:
# Show the rows that are missing a rock type, then the rows that are
# missing a tectonic setting.
for column in ("Dominant_Rock_Type", "Tectonic_Setting"):
    print(df[df[column].isnull()])

In [None]:
# Replace the missing categorical values with an explicit placeholder.
# The filled column is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that form is chained assignment through
# an inplace method, which pandas deprecates and which leaves `df` unchanged
# when Copy-on-Write is enabled.
for column in ("Dominant_Rock_Type", "Tectonic_Setting"):
    df[column] = df[column].fillna("Not Specified")

# Confirm that no column used later still contains a null; prints False when
# the data is clean.
print(df[['Volcano_Number',
          'Volcano_Name',
          'Country',
          'Primary_Volcano_Type',
          'Activity_Evidence',
          'Last_Known_Eruption',
          'Region',
          'Subregion',
          'Latitude',
          'Longitude',
          'Elevation',
          'Dominant_Rock_Type',
          'Tectonic_Setting']].isnull().values.any())

I'll check if Country, Region and Subregion are 1:1.

In [None]:
# For each country (in alphabetical order), report when its rows span more
# than one Region or more than one Subregion.
for country in sorted(df["Country"].unique()):
    country_rows = df[df["Country"] == country]
    region_count = len(country_rows["Region"].unique())
    subregion_count = len(country_rows["Subregion"].unique())

    if region_count != 1:
        print(country, "Not 1 region")
    if subregion_count != 1:
        print(country, "Not 1 subregion", subregion_count)

The country, region and subregion are not 1:1, so I will instead base the structure of the document on the Country itself.

In [None]:
# One row per distinct country (first occurrence kept); this drives the
# per-country documents built later.
volcano_df = df.drop_duplicates(subset=["Country"])[["Country"]]

# Print the country list, its summary statistics, and whether any null
# values are present, each surrounded by blank lines.
country_summary = volcano_df.describe(include="all")
has_nulls = volcano_df.isnull().values.any()
for report in (volcano_df["Country"], country_summary, has_nulls):
    print(f"\n{report}\n")

The Country list contains 95 unique values and no null values, and so is suitable for use.

In [None]:
# Connection string for the local MongoDB instance, authenticating against
# the admin database.
# NOTE(review): the credentials are hard-coded in the notebook; consider
# loading them from the environment before sharing this file.
uri = 'mongodb://admin:Sp00ky!@localhost:27017/?AuthSource=admin'
client = MongoClient(uri)

Validating the data structure about to be inserted.

In [None]:
# Start from a clean "Country" collection in the "Volcano" database.
mydb = client['Volcano']
mycol = mydb['Country']
existing_collections = mydb.list_collection_names()
if "Country" in existing_collections:
    mycol.drop()

# Schema applied to every inserted document: "Country" is required and must
# be a string matching the pattern below.
# NOTE(review): the pattern is unanchored and lowercase-only, so it accepts
# any string containing two consecutive lowercase letters or digits; it does
# not enforce the "2 to 20 char" limit stated in the description.
country_schema = {
    "required": ["Country"],
    "properties": {
        "Country": {
            "bsonType": "string",
            "pattern": "[a-z0-9]{2,20}",
            "description": "Name of country. 2 to 20 char",
        },
    },
}

# validationAction="error" makes the server reject documents that fail the
# schema.
mycol = mydb.create_collection(
    name="Country",
    validator={"$jsonSchema": country_schema},
    validationAction="error",
)

In [None]:
# Build one document per country, embedding that country's eruption records
# as the "Eruptions" array, and insert them all in a single batch.
eruption_fields = ['Volcano_Number',
                   'Volcano_Name',
                   'Primary_Volcano_Type',
                   'Region',
                   'Subregion',
                   'Activity_Evidence',
                   'Last_Known_Eruption',
                   'Latitude',
                   'Longitude',
                   'Elevation',
                   'Dominant_Rock_Type',
                   'Tectonic_Setting']

# A single groupby replaces re-filtering the whole DataFrame once per country.
# sort=False keeps the countries in order of first appearance in the data.
# to_dict('records') produces plain dicts that are passed to pymongo directly,
# so no JSON round-trip is needed.
country_documents = [
    {"Country": country,
     "Eruptions": country_rows[eruption_fields].to_dict('records')}
    for country, country_rows in df.groupby('Country', sort=False)
]

# insert_many rejects an empty list, so skip the call when there is no data.
if country_documents:
    mycol.insert_many(country_documents)



# Queries

1. All documents in JSON format: for nicer output, the object ID is excluded from the result.

In [None]:
# Fetch every country document, leaving out the _id field, and pretty-print
# each one as JSON.
all_countries = mycol.find(filter={}, projection={"_id": 0})
for document in all_countries:
    print(json.dumps(document, indent=2))

2. Embedded Array Data: All country documents that contain at least one eruption in the Western Europe subregion.

In [None]:
# Match on a field inside the embedded Eruptions array. Each country document
# with at least one eruption in the subregion is returned in full, so its
# whole Eruptions array is printed.
western_europe = mycol.find(
    filter={"Eruptions.Subregion": "Western Europe"},
    projection={"_id": 0},
)
for document in western_europe:
    print(json.dumps(document, indent=2))

3. Selection showing Projection: The names of the volcanoes in every country where at least one eruption was observed.

In [None]:
# Select the country documents with at least one observed eruption and
# project only the volcano names. The projection applies to every element of
# the Eruptions array, not just the elements that matched the filter.
observed_filter = {"Eruptions.Activity_Evidence": "Eruption Observed"}
names_only = {"_id": 0, "Eruptions.Volcano_Name": 1}
for document in mycol.find(observed_filter, names_only):
    print(json.dumps(document, indent=2))

4. Selection with Sorted Output using an Aggregation Pipeline: Volcanoes in New Zealand, sorted by their elevation.

In [None]:
# List the volcanoes of the "New Zealand" subregion ordered by ascending
# elevation, showing only each volcano's name and elevation.
subregion_filter = {"Eruptions.Subregion": "New Zealand"}

pipeline = [
    # Narrow down to the country documents that contain a matching eruption
    # before unwinding.
    {"$match": subregion_filter},
    # Produce one output document per embedded eruption.
    {"$unwind": "$Eruptions"},
    # The first $match keeps whole country documents, so after $unwind the
    # stream still holds that country's eruptions from other subregions.
    # Filter again to keep only the requested subregion.
    {"$match": subregion_filter},
    {"$sort": SON([("Eruptions.Elevation", 1)])},
    {"$project": {"_id": 0, "Eruptions.Volcano_Name": 1, "Eruptions.Elevation": 1}},
]

result = mycol.aggregate(pipeline)

for row in result:
    print(json.dumps(row, indent=2))

# Updating a Document

The volcano on La Palma in the Canary Islands erupted in 2021. I'll update the data to reflect that.

In [None]:
# Set the last known eruption of the La Palma volcano to 2021. The positional
# operator ($) targets the array element matched by the filter.
filter_criteria = {"Eruptions.Volcano_Name": "La Palma"}
update_criteria = {"$set": {"Eruptions.$.Last_Known_Eruption": "2021 CE"}}

# Let the update itself report whether a document matched, instead of running
# a separate find_one first: this saves a round trip and avoids acting on a
# check that may no longer hold.
update_result = mycol.update_one(filter_criteria, update_criteria)

if update_result.matched_count:
    print("Document updated successfully.")
else:
    print("Document not found.")

# Read back only the matched array element to show the updated value.
result = mycol.find_one(
    filter_criteria,
    {"_id": 0, "Eruptions.$": 1}
)

print(json.dumps(result, indent=2))

# Deleting a Document

In [None]:
# Delete the first country document whose Eruptions array contains the
# "Porak" volcano. The whole document is removed, not just the array element.
filter_criteria = {"Eruptions.Volcano_Name": "Porak"}

# Use the delete result to tell whether anything was removed, instead of
# running a separate find_one before the delete.
delete_result = mycol.delete_one(filter_criteria)

if delete_result.deleted_count:
    print("Document deleted successfully.")
else:
    print("Document not found.")

# Inserting a Document

There already exists an element in the subarray for Mount Etna, as it erupted in 2018. It also erupted in 2023, so I will copy the existing details and update the copy with the new details.

In [None]:
# Look up the existing Etna entry; the positional projection returns only
# the array element that matched the filter.
filter_criteria = {"Eruptions.Volcano_Name": "Etna"}
matched_element_only = {"_id": 0, "Eruptions.$": 1}
existing_eruption = mycol.find_one(filter_criteria, matched_element_only)

if not existing_eruption:
    print("Existing eruption details for Etna not found.")
else:
    # Copy the existing entry and override the fields that differ for the
    # new eruption.
    new_eruption = {
        **copy.deepcopy(existing_eruption["Eruptions"][0]),
        "Last_Known_Eruption": "2023 CE",
        "Volcano_Number": 123456,
    }

    # Append the new entry to the Eruptions array of the same document.
    mycol.update_one(filter_criteria, {"$push": {"Eruptions": new_eruption}})

    print("New eruption added for Mount Etna.")

# Print each eruption of the document that contains Etna, one per line of
# output, to show the appended entry.
pipeline = [
    {"$match": filter_criteria},
    {"$unwind": "$Eruptions"},
    {"$project": {"_id": 0, "Eruptions": 1}},
]

etna_rows = mycol.aggregate(pipeline)
for document in etna_rows:
    print(json.dumps(document, indent=2))


In [None]:
# Close the MongoDB client now that all queries have run.
client.close()