In [17]:
######################################################
#Name: Shweta Ydaav
#Student ID : 21209251
#Purpose : Acquire input JSON/CSV files from the web through APIs and upload them to a MongoDB NoSQL database using Python
#######################################################
import json
import os
import subprocess
import sys
from zipfile import ZipFile #importing the zipfile module

import pandas as pd
import pymongo
import requests

#Kaggle credentials used to read data from the Kaggle API.
#Values already exported in the environment take precedence; the literals below
#are only fallbacks so that existing runs keep working unchanged.
#NOTE(review): an API key committed to source should be treated as leaked -
#rotate it and supply KAGGLE_USERNAME / KAGGLE_KEY from the environment instead.
kaggleUsername = os.environ.get('KAGGLE_USERNAME', "panda05")
kaggleKey = os.environ.get('KAGGLE_KEY', "c25dc978d7e3721eccfd1f4814846223")

#Setting the environment credential variables
os.environ['KAGGLE_USERNAME'] = kaggleUsername
os.environ['KAGGLE_KEY'] = kaggleKey


def mongoDBConnection():
    """Connect to the local MongoDB server and return the client and collections.

    Returns:
        tuple: (mongoClient, changeInSeaLevel, surfaceTemperature, naturalDisaster),
        i.e. the client followed by the three collection handles of the
        'dapDatabase' database.

    On any failure the error is printed and the process exits with status 1.
    """
    try:
        #Setting port number for localhost
        dbPortNumber = '27017'
        #Create a client to connect to server DB
        mongoClient = pymongo.MongoClient(f'mongodb://localhost:{dbPortNumber}')
        #MongoClient connects lazily, so constructing it succeeds even when no
        #server is running. Ping the server so an unreachable database is
        #reported here instead of at the first insert.
        mongoClient.admin.command('ping')
        print("Successfully connected to mongoDB")
        #Connect to database if exists, create db if it doesn't exist.
        mongoDB = mongoClient['dapDatabase']
        print("Successfully connected/created database")
        #Get handles for the collections (tables) in the database.
        changeInSeaLevel = mongoDB['changeInSeaLevel']
        print("Successfully created table changeInSeaLevel")
        surfaceTemperature = mongoDB['surfaceTemperature']
        print("Successfully created table surfaceTemperature")
        naturalDisaster = mongoDB['naturalDisaster']
        print("Successfully created table naturalDisaster")
        return mongoClient, changeInSeaLevel, surfaceTemperature, naturalDisaster
    except Exception as e:
        print("Connection to mongoDB failed with error: "+str(e))
        sys.exit(1)

#Kaggle API call to download the data folder
def kaggleAPICall(dataset, folder):
    """Download a Kaggle dataset archive, extract it and delete the archive.

    Args:
        dataset: Dataset name; the downloaded archive is expected to be
            named dataset + ".zip" in the current working directory. A name
            containing 'natural' selects the natural disasters dataset, any
            other name selects the surface temperature dataset.
        folder: Directory the archive contents are extracted into.

    On any failure the error is printed and the process exits with status 1.
    """
    print("Pulling data from Kaggle for "+ dataset)
    #Owner-qualified dataset identifier passed to `kaggle datasets download -d`
    if 'natural' in dataset:
        datasetSlug = 'brsdincer/all-natural-disasters-19002021-eosdis'
    else:
        datasetSlug = 'berkeleyearth/climate-change-earth-surface-temperature-data'
    zipName = dataset+".zip"
    try:
        #Run the kaggle command line tool as a normal subprocess so the code is
        #valid Python outside IPython; check=True raises when the download fails
        #instead of continuing to the unzip step.
        subprocess.run(['kaggle', 'datasets', 'download', '-d', datasetSlug], check=True)
        #Extracting the data file from downloaded zip
        with ZipFile(zipName, 'r') as zObject:
            zObject.extractall(path = folder)
        #Deleting the zipped folder
        os.remove(zipName)
        print("File successfully downloaded for "+dataset)
    except Exception as e:
        print("Data download failed for "+ dataset+ " with error :" + str(e))
        sys.exit(1)

#Function to load json data from IMF
def imfJsonDataLoad(changeInSeaLevel):
    """Pull the change-in-sea-level records from the IMF ArcGIS service into mongoDB.

    The service returns the data in pages. Every page is converted and
    inserted, and the next page is requested for as long as the response
    reports 'exceededTransferLimit'.

    Args:
        changeInSeaLevel: Collection the records are inserted into.

    On any failure the error is printed and the process exits with status 1.
    """
    baseUrl = 'https://services9.arcgis.com/weJ1QsnbMYJlCHdG/arcgis/rest/services/Indicator_3_3_melted_new/FeatureServer/0/query?where=1%3D1'
    querySuffix = '&outFields=*&outSR=4326&f=json'
    #Attributes that are stored as strings in mongoDB
    stringFields = ('ObjectId', 'Country', 'ISO2', 'ISO3', 'Indicator', 'Unit',
                    'Source', 'CTS_Code', 'CTS_Name', 'CTS_Full_Descriptor',
                    'Measure', 'Date', 'Value')
    count = 0
    offset = 0
    url = baseUrl + '&resultRecordCount=35000' + querySuffix
    try:
        while True:
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            page = response.json()

            records = []
            for item in page['features']:
                attr = item['attributes']
                for field in stringFields:
                    attr[field] = str(attr[field])
                records.append(attr)

            #insert_many rejects an empty list, so only insert non-empty pages
            if records:
                changeInSeaLevel.insert_many(records)
                count = count + len(records)

            #The flag is absent or false on the last page. Also stop on an empty
            #page so a misbehaving response cannot cause an endless loop.
            if not page.get('exceededTransferLimit') or not records:
                break

            #Advance by what was actually received rather than an assumed page size
            offset = offset + len(records)
            url = baseUrl + '&resultOffset=' + str(offset) + querySuffix
    except Exception as e:
        print("Data load failed with error: "+str(e))
        sys.exit(1)
    #total number of records loaded in data
    print("Data for change in sea level successfully loaded to mongoDB. Records loaded: "+ str(count))

#Helper to load one csv file into one mongoDB collection
def _loadCsvToCollection(csvPath, collection, label):
    """Read csvPath and insert one document per row into collection.

    label is the human readable dataset name used in the progress messages.
    On any failure the error is printed and the process exits with status 1.
    """
    print("Loading data for " + label)
    try:
        data = pd.read_csv(csvPath)
        #orient='records' yields one JSON object per row directly, avoiding a
        #transpose of the whole frame; missing values become null/None.
        docs = json.loads(data.to_json(orient='records'))
        collection.insert_many(docs)
        print("Data for " + label + " successfully loaded to mongoDB.")
    except Exception as e:
        print('Data load to mongodb failed with error '+ str(e))
        sys.exit(1)


#Function to load Kaggle csv data from local
def kaggleCsvDataLoad(surfaceTemperature, naturalDisaster):
    """Load the two extracted Kaggle csv files into their mongoDB collections.

    Args:
        surfaceTemperature: Collection receiving the rows of
            'surface-temperature/GlobalLandTemperaturesByMajorCity.csv'.
        naturalDisaster: Collection receiving the rows of
            'natural-disaster/DISASTERS/1970-2021_DISASTERS.xlsx - emdat data.csv'.

    Both paths are relative to the current working directory. On any
    failure the error is printed and the process exits with status 1.
    """
    #loading data for surface temperature
    _loadCsvToCollection('surface-temperature/GlobalLandTemperaturesByMajorCity.csv',
                         surfaceTemperature, "surface temperature")

    #loading data for natural disaster
    _loadCsvToCollection('natural-disaster/DISASTERS/1970-2021_DISASTERS.xlsx - emdat data.csv',
                         naturalDisaster, "natural disaster")
    
if __name__ == "__main__":
    #Script entry point: connect to mongoDB, download the Kaggle datasets, then
    #load the IMF json data and the Kaggle csv data into their collections.
    mongoClient = None
    try:
        print("Main function")
        print("Function call to connect with mongoDB server")
        mongoClient, table1, table2, table3 = mongoDBConnection()
        print("Function call to get csv data from Kaggle")
        kaggleAPICall('all-natural-disasters-19002021-eosdis', 'natural-disaster')
        kaggleAPICall('climate-change-earth-surface-temperature-data', 'surface-temperature')
        print("Function call to get json data from IMF and load to mongoDB")
        imfJsonDataLoad(table1)
        print("Function call to load csv data to mongoDB")
        kaggleCsvDataLoad(table2, table3)
    except Exception as e:
        print('Exception raised '+ str(e))
        print("Error in main function")
        sys.exit(1)
    finally:
        #Runs on success, on an exception, and when a step calls sys.exit, so
        #the connection is released on every path once it has been opened.
        if mongoClient is not None:
            mongoClient.close()
            print("MongoDB connection closed")

Main function
Function call to connect with mongoDB server
Successfully connected to mongoDB
Successfully connected/created database
Successfully created table changeInSeaLevel
Successfully created table surfaceTemperature
Successfully created table naturalDisaster
Function call to get csv data from Kaggle
Pulling data from Kaggle for all-natural-disasters-19002021-eosdis
Downloading all-natural-disasters-19002021-eosdis.zip to C:\NCI Content\DAP Lab\Project

File successfully downloaded for all-natural-disasters-19002021-eosdis
Pulling data from Kaggle for climate-change-earth-surface-temperature-data



  0%|          | 0.00/2.31M [00:00<?, ?B/s]
 43%|####3     | 1.00M/2.31M [00:00<00:00, 9.11MB/s]
 86%|########6 | 2.00M/2.31M [00:00<00:00, 9.73MB/s]
100%|##########| 2.31M/2.31M [00:00<00:00, 10.2MB/s]


Downloading climate-change-earth-surface-temperature-data.zip to C:\NCI Content\DAP Lab\Project




  0%|          | 0.00/84.7M [00:00<?, ?B/s]
  1%|1         | 1.00M/84.7M [00:00<00:10, 8.20MB/s]
  4%|3         | 3.00M/84.7M [00:00<00:08, 10.4MB/s]
  6%|5         | 5.00M/84.7M [00:00<00:07, 11.4MB/s]
  8%|8         | 7.00M/84.7M [00:00<00:06, 12.1MB/s]
 11%|#         | 9.00M/84.7M [00:00<00:06, 12.2MB/s]
 13%|#2        | 11.0M/84.7M [00:00<00:06, 12.3MB/s]
 15%|#5        | 13.0M/84.7M [00:01<00:06, 12.4MB/s]
 18%|#7        | 15.0M/84.7M [00:01<00:05, 12.2MB/s]
 20%|##        | 17.0M/84.7M [00:01<00:05, 12.3MB/s]
 22%|##2       | 19.0M/84.7M [00:01<00:05, 12.1MB/s]
 25%|##4       | 21.0M/84.7M [00:01<00:05, 12.2MB/s]
 27%|##7       | 23.0M/84.7M [00:02<00:05, 12.2MB/s]
 30%|##9       | 25.0M/84.7M [00:02<00:05, 12.4MB/s]
 32%|###1      | 27.0M/84.7M [00:02<00:04, 12.2MB/s]
 34%|###4      | 29.0M/84.7M [00:02<00:04, 12.6MB/s]
 37%|###6      | 31.0M/84.7M [00:02<00:04, 12.5MB/s]
 39%|###8      | 33.0M/84.7M [00:02<00:04, 12.2MB/s]
 41%|####1     | 35.0M/84.7M [00:03<00:04, 12.3MB/s]
 

File successfully downloaded for climate-change-earth-surface-temperature-data
Function call to get json data from IMF and load to mongoDB
Data for change in sea level successfully loaded to mongoDB. Records loaded: 35604
Function call to load csv data to mongoDB
Loading data for surface temperature
Data for surface temperature successfully loaded to mongoDB.
Loading data for natural disaster
Data for natural disaster successfully loaded to mongoDB.
MongoDB connection closed
