# Volcano Eruptions Dataset for MongoDB Assignment

### By Callum O'Brien, C21306503

In [50]:
import pandas as pd
from pymongo import MongoClient
import json

df = pd.read_csv('eruptions.csv', sep=',', delimiter=None, encoding='UTF-8')


The column names in this dataset contain spaces and as such need to be replaced. 
I replace them here with underscores.

In [51]:
# Replace column name spaces with underscores
df.columns = list(map(lambda x: x.replace(" ", "_"), df.columns))
print(df.dtypes)

Volcano_Number            int64
Volcano_Name             object
Country                  object
Primary_Volcano_Type     object
Activity_Evidence        object
Last_Known_Eruption      object
Region                   object
Subregion                object
Latitude                float64
Longitude               float64
Elevation_(m)             int64
Dominant_Rock_Type       object
Tectonic_Setting         object
dtype: object


One column is called Elevation_(m). I will rename this to just 'Elevation'.

In [52]:
df = df.rename(columns={"Elevation_(m)": "Elevation"})
print(df.columns)

Index(['Volcano_Number', 'Volcano_Name', 'Country', 'Primary_Volcano_Type',
       'Activity_Evidence', 'Last_Known_Eruption', 'Region', 'Subregion',
       'Latitude', 'Longitude', 'Elevation', 'Dominant_Rock_Type',
       'Tectonic_Setting'],
      dtype='object')


Next I clean the data, looking for null/N/A values.

In [53]:
print("Volcano_Number", df.Volcano_Number.isnull().sum())
print("Volcano_Name", df.Volcano_Name.isnull().sum())
print("Country", df.Country.isnull().sum())
print("Primary_Volcano_Type", df.Primary_Volcano_Type.isnull().sum())
print("Activity_Evidence", df.Activity_Evidence.isnull().sum())
print("Last_Known_Eruption", df.Last_Known_Eruption.isnull().sum())
print("Region", df.Region.isnull().sum())
print("Subregion", df.Subregion.isnull().sum())
print("Latitude", df.Latitude.isnull().sum())
print("Longitude", df.Longitude.isnull().sum())
print("Elevation", df.Elevation.isnull().sum())
print("Dominant_Rock_Type", df.Dominant_Rock_Type.isnull().sum())
print("Tectonic_Setting", df.Tectonic_Setting.isnull().sum())

Volcano_Number 0
Volcano_Name 0
Country 0
Primary_Volcano_Type 0
Activity_Evidence 0
Last_Known_Eruption 0
Region 0
Subregion 0
Latitude 0
Longitude 0
Elevation 0
Dominant_Rock_Type 14
Tectonic_Setting 5


In [54]:
print(df[df.Dominant_Rock_Type.isnull()])
print(df[df.Tectonic_Setting.isnull()])

      Volcano_Number                        Volcano_Name  \
18            211080                             Marsili   
232           242005                      Havre Seamount   
254           243111                              Dugong   
255           243112                             Lobster   
258           243140                            Niuatahi   
557           283069                        Akandanayama   
642           284305  Mariana Back-Arc Segment at 15.5°N   
659           285083                           Tenchozan   
671           290072                Odamoisan [Tebenkov]   
990           331005                 West Valley Segment   
997           331032                     Jackson Segment   
999           331060              East Blanco Depression   
1342          370010                     Snaefellsjokull   
1435          390847                            Melville   

                             Country Primary_Volcano_Type   Activity_Evidence  \
18                

In [55]:
df['Dominant_Rock_Type'].fillna("Not Specified", inplace=True)
df['Tectonic_Setting'].fillna("Not Specified", inplace=True)

print(df[['Volcano_Number',
          'Volcano_Name',
          'Country',
          'Primary_Volcano_Type',
          'Activity_Evidence',
          'Last_Known_Eruption', 
          'Region', 
          'Subregion',
          'Latitude', 
          'Longitude',
          'Elevation',
          'Dominant_Rock_Type',
          'Tectonic_Setting']].isnull().values.any())

False


I'll now check for values that are 1:1 to set up a data structure.

In [56]:
for i in sorted(df.Volcano_Name.unique()):
    if len(df[df.Volcano_Name==i].Volcano_Number.unique())!=1:
        print(i)

Azul, Cerro
Flores
Plosky
San Cristobal
Santa Isabel
Sumbing
Unnamed


Volcano Name and Number are not 1:1, meaning some volcanoes erupted more than once. As such these values together cannot be used for the main structure of the DB as the Volcano Name will not be unique.

Instead, I'll base the data structure around the country in which the Volcano erupted.

In [57]:
volcano_df = df[['Country']].drop_duplicates()
print(f"\n{volcano_df.Country}\n")
print(f"\n{volcano_df.describe(include='all')}\n")
print(f"\n{volcano_df.isnull().values.any()}\n")


0          Germany
1           France
2            Spain
4            Italy
19          Greece
           ...    
1342       Iceland
1372        Norway
1375      Portugal
1395    Cape Verde
1403    Antarctica
Name: Country, Length: 95, dtype: object


        Country
count        95
unique       95
top     Germany
freq          1


False



The Country list contains 95 unique values and no null values, and so is suitable for use.

In [58]:
uri = uri = 'mongodb://admin:Sp00ky!@localhost:27017/?AuthSource=admin'
client = MongoClient(uri)

Validating the data structure about to be inserted.

In [59]:
mydb = client['Volcano']
mycol = mydb['Country']
if "Country" in mydb.list_collection_names():
    mycol.drop()

mycol = mydb.create_collection(
    name="Country",
    validator={"$jsonSchema": {
        "required": ["Country"],
        "properties": {
            "Country": {
                "bsonType": "string",
                "pattern": "[a-z0-9]{2,20}",
                "description": "Name of country. 2 to 20 char"
            },
        }
    }},
    validationAction="error",
)

In [60]:
for row in volcano_df.itertuples():
    eruptions = df[df.Country == row.Country][['Volcano_Number',
                                               'Volcano_Name',
                                               'Primary_Volcano_Type',
                                               'Region',
                                               'Subregion',
                                               'Activity_Evidence',
                                               'Last_Known_Eruption',
                                               'Latitude',
                                               'Longitude',
                                               'Elevation',
                                               'Dominant_Rock_Type',
                                               'Tectonic_Setting']]
    
    entries = json.dumps({"Country": row.Country,
                          "Eruptions": eruptions.to_dict('records')})

    x = mycol.insert_one(json.loads(entries))



# Queries

1. All documents in JSON format

In [61]:
cursor = mycol.find({})

# Convert documents to a DataFrame
docs = list(cursor)

print(json.dumps(cursor, default=str, indent=2))

"<pymongo.cursor.Cursor object at 0x1297943d0>"


In [62]:
client.close()