# Volcano Eruptions Dataset for MongoDB Assignment

### By Callum O'Brien, C21306503

In [204]:
import pandas as pd
from pymongo import MongoClient
import json
from bson.son import SON
import copy

df = pd.read_csv('eruptions.csv', sep=',', delimiter=None, encoding='UTF-8')

The column names in this dataset contain spaces and as such need to be replaced. 
I replace them here with underscores.

In [205]:
# Replace column name spaces with underscores
df.columns = list(map(lambda x: x.replace(" ", "_"), df.columns))
print(df.dtypes)

Volcano_Number            int64
Volcano_Name             object
Country                  object
Primary_Volcano_Type     object
Activity_Evidence        object
Last_Known_Eruption      object
Region                   object
Subregion                object
Latitude                float64
Longitude               float64
Elevation_(m)             int64
Dominant_Rock_Type       object
Tectonic_Setting         object
dtype: object


One column is called Elevation_(m). I will rename this to just 'Elevation'.

In [206]:
df = df.rename(columns={"Elevation_(m)": "Elevation"})
print(df.columns)

Index(['Volcano_Number', 'Volcano_Name', 'Country', 'Primary_Volcano_Type',
       'Activity_Evidence', 'Last_Known_Eruption', 'Region', 'Subregion',
       'Latitude', 'Longitude', 'Elevation', 'Dominant_Rock_Type',
       'Tectonic_Setting'],
      dtype='object')


Next I clean the data, looking for null/N/A values.

In [207]:
print("Volcano_Number", df.Volcano_Number.isnull().sum())
print("Volcano_Name", df.Volcano_Name.isnull().sum())
print("Country", df.Country.isnull().sum())
print("Primary_Volcano_Type", df.Primary_Volcano_Type.isnull().sum())
print("Activity_Evidence", df.Activity_Evidence.isnull().sum())
print("Last_Known_Eruption", df.Last_Known_Eruption.isnull().sum())
print("Region", df.Region.isnull().sum())
print("Subregion", df.Subregion.isnull().sum())
print("Latitude", df.Latitude.isnull().sum())
print("Longitude", df.Longitude.isnull().sum())
print("Elevation", df.Elevation.isnull().sum())
print("Dominant_Rock_Type", df.Dominant_Rock_Type.isnull().sum())
print("Tectonic_Setting", df.Tectonic_Setting.isnull().sum())

Volcano_Number 0
Volcano_Name 0
Country 0
Primary_Volcano_Type 0
Activity_Evidence 0
Last_Known_Eruption 0
Region 0
Subregion 0
Latitude 0
Longitude 0
Elevation 0
Dominant_Rock_Type 14
Tectonic_Setting 5


In [208]:
print(df[df.Dominant_Rock_Type.isnull()])
print(df[df.Tectonic_Setting.isnull()])

      Volcano_Number                        Volcano_Name  \
18            211080                             Marsili   
232           242005                      Havre Seamount   
254           243111                              Dugong   
255           243112                             Lobster   
258           243140                            Niuatahi   
557           283069                        Akandanayama   
642           284305  Mariana Back-Arc Segment at 15.5°N   
659           285083                           Tenchozan   
671           290072                Odamoisan [Tebenkov]   
990           331005                 West Valley Segment   
997           331032                     Jackson Segment   
999           331060              East Blanco Depression   
1342          370010                     Snaefellsjokull   
1435          390847                            Melville   

                             Country Primary_Volcano_Type   Activity_Evidence  \
18                

In [209]:
df['Dominant_Rock_Type'].fillna("Not Specified", inplace=True)
df['Tectonic_Setting'].fillna("Not Specified", inplace=True)

print(df[['Volcano_Number',
          'Volcano_Name',
          'Country',
          'Primary_Volcano_Type',
          'Activity_Evidence',
          'Last_Known_Eruption', 
          'Region', 
          'Subregion',
          'Latitude', 
          'Longitude',
          'Elevation',
          'Dominant_Rock_Type',
          'Tectonic_Setting']].isnull().values.any())

False


I'll check if Country, Region and Subregion are 1:1.

In [210]:
for i in sorted(df.Country.unique()):
     if len(df[df.Country==i].Region.unique())!=1:
        print(i,"Not 1 region")
     if len(df[df.Country==i].Subregion.unique())!=1:
        print(i,"Not 1 subregion", len(df[df.Country==i].Subregion.unique()))

Argentina Not 1 subregion 3
Australia Not 1 region
Australia Not 1 subregion 2
Canada Not 1 region
Canada Not 1 subregion 2
Chile Not 1 subregion 4
Chile-Argentina Not 1 subregion 3
China Not 1 region
China Not 1 subregion 3
Ecuador Not 1 subregion 2
France Not 1 region
France Not 1 subregion 7
Iceland Not 1 subregion 6
Indonesia Not 1 subregion 8
Iran Not 1 subregion 2
Japan Not 1 subregion 4
New Zealand Not 1 subregion 3
Norway Not 1 region
Norway Not 1 subregion 2
Papua New Guinea Not 1 subregion 6
Philippines Not 1 subregion 4
Russia Not 1 region
Russia Not 1 subregion 4
Solomon Islands Not 1 subregion 2
Spain Not 1 region
Spain Not 1 subregion 2
Undersea Features Not 1 region
Undersea Features Not 1 subregion 8
United Kingdom Not 1 region
United Kingdom Not 1 subregion 4
United States Not 1 region
United States Not 1 subregion 18
Yemen Not 1 region
Yemen Not 1 subregion 2


The country, region and subregion are not 1:1, so I will instead base the structure of the document on the Country itself.

In [211]:
volcano_df = df[['Country']].drop_duplicates()
print(f"\n{volcano_df.Country}\n")
print(f"\n{volcano_df.describe(include='all')}\n")
print(f"\n{volcano_df.isnull().values.any()}\n")


0          Germany
1           France
2            Spain
4            Italy
19          Greece
           ...    
1342       Iceland
1372        Norway
1375      Portugal
1395    Cape Verde
1403    Antarctica
Name: Country, Length: 95, dtype: object


        Country
count        95
unique       95
top     Germany
freq          1


False



The Country list contains 95 unique values and no null values, and so is suitable for use.

In [212]:
uri = uri = 'mongodb://admin:Sp00ky!@localhost:27017/?AuthSource=admin'
client = MongoClient(uri)

Validating the data structure about to be inserted.

In [213]:
mydb = client['Volcano']
mycol = mydb['Country']
if "Country" in mydb.list_collection_names():
    mycol.drop()

mycol = mydb.create_collection(
    name="Country",
    validator={"$jsonSchema": {
        "required": ["Country"],
        "properties": {
            "Country": {
                "bsonType": "string",
                "pattern": "[a-z0-9]{2,20}",
                "description": "Name of country. 2 to 20 char"
            },
        }
    }},
    validationAction="error",
)

In [214]:
for row in volcano_df.itertuples():
    eruptions = df[df.Country == row.Country][['Volcano_Number',
                                               'Volcano_Name',
                                               'Primary_Volcano_Type',
                                               'Region',
                                               'Subregion',
                                               'Activity_Evidence',
                                               'Last_Known_Eruption',
                                               'Latitude',
                                               'Longitude',
                                               'Elevation',
                                               'Dominant_Rock_Type',
                                               'Tectonic_Setting']]
    
    entries = json.dumps({"Country": row.Country,
                          "Eruptions": eruptions.to_dict('records')})

    x = mycol.insert_one(json.loads(entries))



# Queries

1. All documents in JSON format: For a nicer output, the object ID is excluded from the result

In [215]:
for row in mycol.find({}, {"_id": 0}):
    print(json.dumps(row, indent = 2))

{
  "Country": "Germany",
  "Eruptions": [
    {
      "Volcano_Number": 210010,
      "Volcano_Name": "West Eifel Volcanic Field",
      "Primary_Volcano_Type": "Maar(s)",
      "Region": "Mediterranean and Western Asia",
      "Subregion": "Western Europe",
      "Activity_Evidence": "Eruption Dated",
      "Last_Known_Eruption": "8300 BCE",
      "Latitude": 50.17,
      "Longitude": 6.85,
      "Elevation": 600,
      "Dominant_Rock_Type": "Foidite",
      "Tectonic_Setting": "Rift zone / Continental crust (>25 km)"
    }
  ]
}
{
  "Country": "France",
  "Eruptions": [
    {
      "Volcano_Number": 210020,
      "Volcano_Name": "Chaine des Puys",
      "Primary_Volcano_Type": "Lava dome(s)",
      "Region": "Mediterranean and Western Asia",
      "Subregion": "Western Europe",
      "Activity_Evidence": "Eruption Dated",
      "Last_Known_Eruption": "4040 BCE",
      "Latitude": 45.775,
      "Longitude": 2.97,
      "Elevation": 1464,
      "Dominant_Rock_Type": "Basalt / Picro-Ba

2. Embedded Array Data: All eruptions found in the Western Europe subregion.

In [216]:
for row in mycol.find({"Eruptions.Subregion": "Western Europe"}, {"_id": 0}):
    print(json.dumps(row, indent = 2))

{
  "Country": "Germany",
  "Eruptions": [
    {
      "Volcano_Number": 210010,
      "Volcano_Name": "West Eifel Volcanic Field",
      "Primary_Volcano_Type": "Maar(s)",
      "Region": "Mediterranean and Western Asia",
      "Subregion": "Western Europe",
      "Activity_Evidence": "Eruption Dated",
      "Last_Known_Eruption": "8300 BCE",
      "Latitude": 50.17,
      "Longitude": 6.85,
      "Elevation": 600,
      "Dominant_Rock_Type": "Foidite",
      "Tectonic_Setting": "Rift zone / Continental crust (>25 km)"
    }
  ]
}
{
  "Country": "France",
  "Eruptions": [
    {
      "Volcano_Number": 210020,
      "Volcano_Name": "Chaine des Puys",
      "Primary_Volcano_Type": "Lava dome(s)",
      "Region": "Mediterranean and Western Asia",
      "Subregion": "Western Europe",
      "Activity_Evidence": "Eruption Dated",
      "Last_Known_Eruption": "4040 BCE",
      "Latitude": 45.775,
      "Longitude": 2.97,
      "Elevation": 1464,
      "Dominant_Rock_Type": "Basalt / Picro-Ba

3. Selection showing Projection: The name of all volcanoes where an eruption was observed.

In [217]:
for row in mycol.find({"Eruptions.Activity_Evidence": "Eruption Observed"}, {"_id": 0, "Eruptions.Volcano_Name": 1}):
    print(json.dumps(row, indent = 2))

{
  "Eruptions": [
    {
      "Volcano_Name": "Chaine des Puys"
    },
    {
      "Volcano_Name": "Mayotte Island"
    },
    {
      "Volcano_Name": "Fournaise, Piton de la"
    },
    {
      "Volcano_Name": "Boomerang Seamount"
    },
    {
      "Volcano_Name": "Amsterdam Island"
    },
    {
      "Volcano_Name": "St. Paul"
    },
    {
      "Volcano_Name": "Kerguelen Islands"
    },
    {
      "Volcano_Name": "Est, Ile de l'"
    },
    {
      "Volcano_Name": "Possession, Ile de la"
    },
    {
      "Volcano_Name": "Cochons, Ile aux"
    },
    {
      "Volcano_Name": "Wallis Islands"
    },
    {
      "Volcano_Name": "Eastern Gemini Seamount"
    },
    {
      "Volcano_Name": "Matthew Island"
    },
    {
      "Volcano_Name": "Hunter Island"
    },
    {
      "Volcano_Name": "Teahitia"
    },
    {
      "Volcano_Name": "Rocard"
    },
    {
      "Volcano_Name": "Moua Pihaa"
    },
    {
      "Volcano_Name": "Mehetia"
    },
    {
      "Volcano_Name": "Soufriere Gu

4. Selection with Sorted Output using Aggregated Pipelines: Volcanoes in New Zealand, sorted by their Elevation

In [218]:
pipeline = [
    {"$match": {"Eruptions.Subregion": "New Zealand"}},
    {"$unwind": "$Eruptions"},
    {"$sort": SON([("Eruptions.Elevation", 1)])},
    {"$group": {"_id": "$_id", "Eruptions": {"$push": "$Eruptions"}}},
    {"$project": {"_id": 0, "Eruptions.Volcano_Name": 1, "Eruptions.Elevation": 1}},
]

result = mycol.aggregate(pipeline)

for row in result:
    print(json.dumps(row, indent=2))

{
  "Eruptions": [
    {
      "Volcano_Name": "Brothers",
      "Elevation": -1350
    },
    {
      "Volcano_Name": "Rumble II West",
      "Elevation": -1200
    },
    {
      "Volcano_Name": "Healy",
      "Elevation": -980
    },
    {
      "Volcano_Name": "Wright",
      "Elevation": -900
    },
    {
      "Volcano_Name": "Havre Seamount",
      "Elevation": -897
    },
    {
      "Volcano_Name": "Clark",
      "Elevation": -860
    },
    {
      "Volcano_Name": "Tangaroa",
      "Elevation": -600
    },
    {
      "Volcano_Name": "Rumble IV",
      "Elevation": -500
    },
    {
      "Volcano_Name": "Rumble V",
      "Elevation": -400
    },
    {
      "Volcano_Name": "Rumble III",
      "Elevation": -220
    },
    {
      "Volcano_Name": "Monowai",
      "Elevation": -132
    },
    {
      "Volcano_Name": "Giggenbach",
      "Elevation": -65
    },
    {
      "Volcano_Name": "Curtis Island",
      "Elevation": 47
    },
    {
      "Volcano_Name": "Macauley",
      

# Updating a Document

The volcano on La Palma in the Canary Islands erupted in 2021. I'll update the data to reflect that.

In [219]:
# Your filter criteria
filter_criteria = {"Eruptions.Volcano_Name": "La Palma"}

# Your update criteria
update_criteria = {"$set": {"Eruptions.$.Last_Known_Eruption": "2021 CE"}}

# Check if the document exists
existing_document = mycol.find_one(filter_criteria)

if existing_document:
    # Document exists, so update it
    mycol.update_one(filter_criteria, update_criteria)
    print("Document updated successfully.")
else:
    # Document does not exist
    print("Document not found.")

result = mycol.find_one(
    {"Eruptions.Volcano_Name": "La Palma"},
    {"_id": 0, "Eruptions.$": 1}
)

print(json.dumps(result, indent=2))

Document updated successfully.
{
  "Eruptions": [
    {
      "Volcano_Number": 383010,
      "Volcano_Name": "La Palma",
      "Primary_Volcano_Type": "Stratovolcano(es)",
      "Region": "Atlantic Ocean",
      "Subregion": "Canary Islands",
      "Activity_Evidence": "Eruption Observed",
      "Last_Known_Eruption": "2021 CE",
      "Latitude": 28.57,
      "Longitude": -17.83,
      "Elevation": 2426,
      "Dominant_Rock_Type": "Trachybasalt / Tephrite Basanite",
      "Tectonic_Setting": "Intraplate / Oceanic crust (< 15 km)"
    }
  ]
}


# Deleting a Document

In [220]:
# Your filter criteria
filter_criteria = {"Eruptions.Volcano_Name": "Porak"}

# Check if the document exists
existing_document = mycol.find_one(filter_criteria)

if existing_document:
    # Document exists, so update it
    mycol.delete_one(filter_criteria)
    print("Document deleted successfully.")
else:
    # Document does not exist
    print("Document not found.")

Document deleted successfully.


# Inserting a Document

There already exists an element in the subarray for Mount Etna, as it erupted in 2018. It also erupted in 2023, so I will copy the existing details and update the copy with the new details.

In [221]:
import copy

# Your filter criteria to identify the document for Mount Etna
filter_criteria = {"Eruptions.Volcano_Name": "Etna"}

# Query the database to retrieve the existing eruption details for Etna
existing_eruption = mycol.find_one(filter_criteria, {"_id": 0, "Eruptions.$": 1})

if existing_eruption:
    # Deep copy the existing eruption data
    new_eruption = copy.deepcopy(existing_eruption["Eruptions"][0])

    # Update the necessary fields for the new eruption
    new_eruption["Last_Known_Eruption"] = "2023 CE"
    new_eruption["Volcano_Number"] = 123456 

    # Insert the new eruption for Mount Etna
    mycol.update_one({"Eruptions.Volcano_Name": "Etna"}, {"$push": {"Eruptions": new_eruption}})

    print("New eruption added for Mount Etna.")
else:
    print("Existing eruption details for Etna not found.")

pipeline = [
    {"$match": {"Eruptions.Volcano_Name": "Etna"}},
    {"$unwind": "$Eruptions"},
    {"$match": {"Eruptions.Volcano_Name": "Etna"}},
    {"$project": {"_id": 0, "Eruptions": 1}}
]

for row in mycol.aggregate(pipeline):
    print(json.dumps(row, indent=2))


New eruption added for Mount Etna.
{
  "Eruptions": {
    "Volcano_Number": 211060,
    "Volcano_Name": "Etna",
    "Primary_Volcano_Type": "Stratovolcano(es)",
    "Region": "Mediterranean and Western Asia",
    "Subregion": "Italy",
    "Activity_Evidence": "Eruption Observed",
    "Last_Known_Eruption": "2018 CE",
    "Latitude": 37.748,
    "Longitude": 14.999,
    "Elevation": 3295,
    "Dominant_Rock_Type": "Trachybasalt / Tephrite Basanite",
    "Tectonic_Setting": "Subduction zone / Continental crust (>25 km)"
  }
}
{
  "Eruptions": {
    "Volcano_Number": 123456,
    "Volcano_Name": "Etna",
    "Primary_Volcano_Type": "Stratovolcano(es)",
    "Region": "Mediterranean and Western Asia",
    "Subregion": "Italy",
    "Activity_Evidence": "Eruption Observed",
    "Last_Known_Eruption": "2023 CE",
    "Latitude": 37.748,
    "Longitude": 14.999,
    "Elevation": 3295,
    "Dominant_Rock_Type": "Trachybasalt / Tephrite Basanite",
    "Tectonic_Setting": "Subduction zone / Continent

In [222]:
client.close()