# Volcano Eruptions Dataset for MongoDB Assignment

### By Callum O'Brien, C21306503

In [160]:
import json
import os
from pprint import pprint

import pandas as pd
from bson.son import SON
from pymongo import MongoClient

# Load the raw volcano table; a comma is already read_csv's default separator.
df = pd.read_csv("eruptions.csv", encoding="UTF-8")


The column names in this dataset contain spaces, which are awkward to work with as attribute names and MongoDB field names.
I replace the spaces here with underscores.

In [161]:
# Normalise the headers: swap every space in a column name for an underscore
df.columns = [name.replace(" ", "_") for name in df.columns]
print(df.dtypes)

Volcano_Number            int64
Volcano_Name             object
Country                  object
Primary_Volcano_Type     object
Activity_Evidence        object
Last_Known_Eruption      object
Region                   object
Subregion                object
Latitude                float64
Longitude               float64
Elevation_(m)             int64
Dominant_Rock_Type       object
Tectonic_Setting         object
dtype: object


One column is called Elevation_(m). I will rename this to just 'Elevation'.

In [162]:
# Drop the unit suffix so the column is simply called 'Elevation'
elevation_rename = {"Elevation_(m)": "Elevation"}
df = df.rename(columns=elevation_rename)
print(df.columns)

Index(['Volcano_Number', 'Volcano_Name', 'Country', 'Primary_Volcano_Type',
       'Activity_Evidence', 'Last_Known_Eruption', 'Region', 'Subregion',
       'Latitude', 'Longitude', 'Elevation', 'Dominant_Rock_Type',
       'Tectonic_Setting'],
      dtype='object')


Next I clean the data, looking for null/N/A values.

In [163]:
# Report the number of missing values in each column, one line per column
columns_to_check = (
    "Volcano_Number",
    "Volcano_Name",
    "Country",
    "Primary_Volcano_Type",
    "Activity_Evidence",
    "Last_Known_Eruption",
    "Region",
    "Subregion",
    "Latitude",
    "Longitude",
    "Elevation",
    "Dominant_Rock_Type",
    "Tectonic_Setting",
)
for column in columns_to_check:
    print(column, df[column].isnull().sum())

Volcano_Number 0
Volcano_Name 0
Country 0
Primary_Volcano_Type 0
Activity_Evidence 0
Last_Known_Eruption 0
Region 0
Subregion 0
Latitude 0
Longitude 0
Elevation 0
Dominant_Rock_Type 14
Tectonic_Setting 5


In [164]:
# Show the rows behind the two non-zero null counts
for column in ("Dominant_Rock_Type", "Tectonic_Setting"):
    missing_rows = df[df[column].isnull()]
    print(missing_rows)

      Volcano_Number                        Volcano_Name  \
18            211080                             Marsili   
232           242005                      Havre Seamount   
254           243111                              Dugong   
255           243112                             Lobster   
258           243140                            Niuatahi   
557           283069                        Akandanayama   
642           284305  Mariana Back-Arc Segment at 15.5°N   
659           285083                           Tenchozan   
671           290072                Odamoisan [Tebenkov]   
990           331005                 West Valley Segment   
997           331032                     Jackson Segment   
999           331060              East Blanco Depression   
1342          370010                     Snaefellsjokull   
1435          390847                            Melville   

                             Country Primary_Volcano_Type   Activity_Evidence  \
18                

In [165]:
# Replace the missing values in the two affected columns with a placeholder.
# The filled frame is assigned back to df instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form acts
# on an intermediate object, emits a FutureWarning in pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
df = df.fillna({"Dominant_Rock_Type": "Not Specified",
                "Tectonic_Setting": "Not Specified"})

# Confirm that no null values remain anywhere in the frame (prints False)
print(df.isnull().values.any())

False


I'll check if Country, Region and Subregion are 1:1.

In [166]:
# For each country (alphabetical order), flag it when its volcanoes span more
# than one region or more than one subregion.
for country in sorted(df.Country.unique()):
    country_rows = df[df.Country == country]
    region_count = len(country_rows.Region.unique())
    subregion_count = len(country_rows.Subregion.unique())
    if region_count != 1:
        print(country, "Not 1 region")
    if subregion_count != 1:
        print(country, "Not 1 subregion", subregion_count)

Argentina Not 1 subregion 3
Australia Not 1 region
Australia Not 1 subregion 2
Canada Not 1 region
Canada Not 1 subregion 2
Chile Not 1 subregion 4
Chile-Argentina Not 1 subregion 3
China Not 1 region
China Not 1 subregion 3
Ecuador Not 1 subregion 2
France Not 1 region
France Not 1 subregion 7
Iceland Not 1 subregion 6
Indonesia Not 1 subregion 8
Iran Not 1 subregion 2
Japan Not 1 subregion 4
New Zealand Not 1 subregion 3
Norway Not 1 region
Norway Not 1 subregion 2
Papua New Guinea Not 1 subregion 6
Philippines Not 1 subregion 4
Russia Not 1 region
Russia Not 1 subregion 4
Solomon Islands Not 1 subregion 2
Spain Not 1 region
Spain Not 1 subregion 2
Undersea Features Not 1 region
Undersea Features Not 1 subregion 8
United Kingdom Not 1 region
United Kingdom Not 1 subregion 4
United States Not 1 region
United States Not 1 subregion 18
Yemen Not 1 region
Yemen Not 1 subregion 2


The country, region and subregion are not 1:1, so I will instead base the structure of the document on the Country itself.

In [167]:
# One row per distinct country; this drives the document structure later on
volcano_df = df[['Country']].drop_duplicates()

# Print the country list, its summary statistics and a null check,
# each surrounded by blank lines.
country_summaries = (
    volcano_df.Country,
    volcano_df.describe(include='all'),
    volcano_df.isnull().values.any(),
)
for summary in country_summaries:
    print(f"\n{summary}\n")


0          Germany
1           France
2            Spain
4            Italy
19          Greece
           ...    
1342       Iceland
1372        Norway
1375      Portugal
1395    Cape Verde
1403    Antarctica
Name: Country, Length: 95, dtype: object


        Country
count        95
unique       95
top     Germany
freq          1


False



The Country list contains 95 unique values and no null values, and so is suitable for use.

In [168]:
# Read the connection string from the environment so that database credentials
# are never stored in (or published with) the notebook.
# Expected form: mongodb://<user>:<password>@localhost:27017/?authSource=admin
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "MONGODB_URI is not set; export it before starting Jupyter, e.g. "
        "mongodb://<user>:<password>@localhost:27017/?authSource=admin"
    )
client = MongoClient(uri)

Next I create the collection with a JSON Schema validator, so that the structure of every document is validated when it is inserted.

In [169]:
mydb = client['Volcano']
mycol = mydb['Country']

# Start from a clean slate: remove the collection if an earlier run left it behind
existing_collections = mydb.list_collection_names()
if "Country" in existing_collections:
    mycol.drop()

# Every document must carry a string 'Country' field that satisfies the pattern.
# NOTE(review): the pattern has no ^/$ anchors and only lists lowercase letters
# and digits, so it appears to be satisfied by any run of two such characters
# anywhere in the name and would not by itself cap the length at 20 -- confirm
# whether that matches the intent stated in the description.
country_schema = {
    "required": ["Country"],
    "properties": {
        "Country": {
            "bsonType": "string",
            "pattern": "[a-z0-9]{2,20}",
            "description": "Name of country. 2 to 20 char"
        },
    }
}

# Recreate the collection; documents failing validation are rejected with an error
mycol = mydb.create_collection(
    name="Country",
    validator={"$jsonSchema": country_schema},
    validationAction="error",
)

In [170]:
# Fields stored for every volcano embedded in its country's document.
# Built once here instead of on every loop iteration.
eruption_fields = ['Volcano_Number',
                   'Volcano_Name',
                   'Primary_Volcano_Type',
                   'Region',
                   'Subregion',
                   'Activity_Evidence',
                   'Last_Known_Eruption',
                   'Latitude',
                   'Longitude',
                   'Elevation',
                   'Dominant_Rock_Type',
                   'Tectonic_Setting']

# Insert one document per country, embedding that country's volcano records
# in the 'Eruptions' array.
for row in volcano_df.itertuples():
    eruptions = df.loc[df.Country == row.Country, eruption_fields]

    # The dict is inserted directly: serialising it to a JSON string and
    # parsing it straight back yields the same str/int/float values, so the
    # round trip was redundant work.
    mycol.insert_one({"Country": row.Country,
                      "Eruptions": eruptions.to_dict('records')})



# Queries

1. All documents in JSON format. For nicer output, the object ID is excluded from the result.

In [None]:
# Fetch every country document, leaving out the ObjectId so json.dumps can
# serialise the result, and pretty-print each one.
all_documents = mycol.find({}, {"_id": 0})
for document in all_documents:
    print(json.dumps(document, indent=2))

2. Embedded Array Data: All eruptions found in the Western Europe subregion.

In [None]:
# Return only the embedded entries that lie in the target subregion.
# A plain find() on "Eruptions.Subregion" selects the right countries but
# returns each document whole, i.e. including that country's volcanoes in
# other subregions, so the embedded array is trimmed with $filter.
subregion = "Western Europe"
pipeline = [
    # Keep only countries that have at least one volcano in the subregion
    {"$match": {"Eruptions.Subregion": subregion}},
    # Exclude the ObjectId for nicer output and keep only the matching entries
    {"$project": {
        "_id": 0,
        "Country": 1,
        "Eruptions": {
            "$filter": {
                "input": "$Eruptions",
                "as": "eruption",
                "cond": {"$eq": ["$$eruption.Subregion", subregion]},
            }
        },
    }},
]
for row in mycol.aggregate(pipeline):
    print(json.dumps(row, indent=2))

3. Selection showing Projection: