## DSCI 551 Project

Create an emulation-based system for distributed file storage and parallel computation. <br>
1. Building an emulated distributed file system (EDFS) <br>
- EDFS should support the following commands, similar to that in HDFS:
    - mkdir: create a directory in the file system, e.g., *mkdir /user/john*
    - ls: list the content of a given directory, e.g., *ls /user*
    - cat: display the content of a file, e.g., *cat /user/john/hello.txt*
    - rm: remove a file from the file system, e.g., *rm /user/john/hello.txt*
    - put: upload a file to the file system, e.g., *put (cars.csv, /user/john, k = # partitions)* will upload a file cars.csv to the directory /user/john in EDFS. **But note that the file should be stored in k partitions, and the file system should remember where the partitions are stored.** You should design a method to partition the data. You may also have the user indicate the method, e.g., hashing on a certain car attribute, in the put method.
    - getPartitionLocations(file): this method will return the location of partitions of the file.
    - readPartition(file, partition #): this method will return the content of partition # of the specified file. The partitioned data will be needed in the second task for parallel processing.
- **Note that EDFS should store the metadata about the file system** (similar to that in NameNode of HDFS, but much simplified). **Metadata include file system structure, attributes of files, and location of partitions storing the contents of files.** You can limit the type of files stored in the file system to certain format, e.g., .csv or JSON. 
<br><br>

#### Google Firebase address : https://dsci551-project-52d43-default-rtdb.firebaseio.com/
### Statistical Capacity Indicators 
###### Statistical Capacity Indicators provides information on various aspects of national statistical systems of developing countries, including an overall country-level statistical capacity indicator. Last Updated:02/03/2021
#### Data from : https://databank.worldbank.org/source/statistical-capacity-indicators# 



In [1]:
import pandas as pd
import numpy as np
import datetime
import requests
import json
import os
import re

# Base URL of the Firebase database backing EDFS; every request URL in this
# notebook is built by appending a node path (plus '.json') to it.
firebase_url = 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/'

def seek(path):
    """Fetch the Firebase node stored at *path*.

    The REST endpoint for a node is its path followed by '.json'; the suffix
    is appended only when *path* does not already end with it, so both
    'NameNode/root' and 'NameNode/root.json' resolve to the same URL.

    Parameters
    ----------
    path : str
        Node path relative to ``firebase_url``.

    Returns
    -------
    requests.Response or None
        The response of the GET request, or None (after printing 'ERROR')
        when the request itself could not be performed.
    """
    # Always build the URL; previously it was left undefined whenever the
    # path already contained '.json'.
    url = firebase_url + path
    if not path.endswith('.json'):
        url += '.json'

    try:
        return requests.get(url)
    except requests.RequestException:
        print('ERROR')
        return None

### MKDIR


In [None]:
def mkdir(path):
    """Create a directory node at *path* unless one is already stored there.

    A node is considered missing when ``seek(path)`` yields a JSON ``null``.
    In that case a small placeholder object is written with a PUT request and
    the request URL is printed; otherwise a notice is printed.
    """
    if seek(path).json() is not None:
        print ('Directory ', path, ' already exists')
        return

    # The placeholder payload gives the new node some content to hold.
    placeholder = '{"test2":"2"}'
    target = firebase_url + path + '.json'
    response = requests.put(target, placeholder)
    print (response.url)

# requests.put('https://dsci551-project-52d43-default-rtdb.firebaseio.com/mk.json', '{"test":1}')

In [None]:
# Create the 'DataNode' node directly under the database root.
# TODO: take the path from user input instead of hard-coding it.
mkdir('DataNode')

### LS

In [None]:
def ls(path):
    """Print the names of the entries stored directly under *path*.

    'NameNode/root/' is prepended to *path* unless the path already mentions
    'NameNode/root'. Each child key of the node is printed on its own line.
    A single blank (" ") is printed when there is nothing to list: the node
    does not exist, the request failed, or the node is not a JSON object
    and therefore has no named children.

    Parameters
    ----------
    path : str
        Directory path, either relative to 'NameNode/root/' or including it.
    """
    # Listings are always resolved inside the NameNode tree.
    if 'NameNode/root' not in path:
        path = 'NameNode/root/' + path

    # Query once and reuse the result: a second request costs another round
    # trip and could observe different data than the first.
    response = seek(path)
    entries = response.json() if response is not None else None

    if isinstance(entries, dict):
        for key in entries:
            print(key)
    else:
        print (" ")


In [None]:
# List the entries under 'data2/China'; ls() prepends 'NameNode/root/' itself.
# TODO: take the path from user input instead of hard-coding it.
ls('data2/China')

### RM

In [None]:
def rm(path):
    """Delete the node stored at *path*.

    Prints 'Directory not found' when ``seek(path)`` yields a JSON ``null``.
    Otherwise a DELETE request is sent and a confirmation is printed if the
    server answers with status 200.
    """
    if seek(path).json() is not None:
        response = requests.delete(firebase_url + path + '.json')
        if response.status_code == 200:
            print(path, 'was succefully deleted')
        return

    print ('Directory not found')

In [None]:
# Clean-up: delete the nodes at 'NameNode/root/user' and at 'DataNode'.
rm('NameNode/root/user')
rm('DataNode')

### PUT

In [2]:
# function to get year columns
def is_year (c):
    """Return True when the column name *c* contains at least one digit.

    put() uses this to recognise the year columns of the data set.
    """
    for symbol in c:
        if symbol.isdigit():
            return True
    return False

def varlist (var):
    """Turn every name in *var* into a key that is safe to use in Firebase.

    For each name, all characters other than ASCII letters, digits and
    spaces are removed and the remaining spaces become underscores. A name
    that ends up empty is replaced by "invalid_key".

    Returns a new list with one key per input name, in the same order.
    """
    cleaned = (re.sub(r'[^A-Za-z0-9 ]+', '', item).replace(" ", "_") for item in var)
    return [key if key != "" else "invalid_key" for key in cleaned]

def varname (var):
    """Turn a single name into a key that is safe to use in Firebase.

    All characters other than ASCII letters, digits and spaces are removed
    and the remaining spaces become underscores. "invalid_key" is returned
    when nothing is left.
    """
    stripped = re.sub(r'[^A-Za-z0-9 ]+', '', var)
    key = stripped.replace(" ", "_")
    if key == "":
        return "invalid_key"
    return key

def mtime():
    """Return the current time as milliseconds since the Unix epoch (float)."""
    # To convert a stored value back into a readable string:
    # datetime.datetime.utcfromtimestamp(int(mtime)/1000).strftime('%Y-%-m-%-d %I:%M:%S')
    now = datetime.datetime.now()
    seconds = now.timestamp()
    return seconds * 1000

def filesize(file):
    """Return the size of the local file *file* in bytes."""
    info = os.stat(file)
    return info.st_size

In [12]:
def record_partition(path, country, filename, url):
    """Record where one partition of a file is stored.

    Adds the entry ``{country: url}`` to ``<path>/<filename>/partitions``
    with a PATCH request, so existing entries of other partitions are kept.
    This is best effort: on any failure the partition key is printed and the
    function returns normally so that the caller can continue with the
    remaining partitions.

    Parameters
    ----------
    path : str
        Firebase path of the directory that holds the file's metadata.
    country : str
        Partition key (the sanitised country name).
    filename : str
        Name of the file the partition belongs to.
    url : str
        URL under which the partition's data was stored.
    """
    try:
        npath = firebase_url + path + "/" + filename + "/partitions.json"
        putMeta = requests.patch(npath, json.dumps({country: url}))
        # Report every rejected request, not only HTTP 400, so that no
        # unrecorded partition goes unnoticed.
        if not putMeta.ok:
            print(country)
    except Exception:
        # Deliberately broad to stay best effort, but unlike a bare
        # ``except`` it lets KeyboardInterrupt / SystemExit through.
        print (country)

def file_mdata(path, file, filename):
    """Store the attributes of an uploaded file at ``<path>/<filename>``.

    The attributes are the creation time from mtime(), the local file name,
    the entry type 'FILE' and the size of the local file in bytes. They are
    merged into the node with a PATCH request; the response is not checked.
    """
    attributes = dict(ctime=mtime(), name=file, type='FILE', filesize=filesize(file))
    target = firebase_url + path + "/" + filename + ".json"
    requests.patch(target, json.dumps(attributes))
    

# partition by Country (Original plan)
def put(file, path):
    """Upload a CSV file to EDFS, storing one partition per country.

    The CSV is read with pandas and rows containing missing values are
    dropped. Column names containing a digit are cut to their first four
    characters (the year); in all other column names spaces become
    underscores. The data is then reshaped into
    ``{country: {year: {series: value}}}`` with country and series names
    sanitised by varname()/varlist().

    If ``<path>/<filename>`` does not exist yet, each country's dictionary is
    written to ``DataNode/<country>/<filename>.json``, the URL of every
    successfully stored partition is recorded via record_partition(), and
    the file attributes are stored via file_mdata(). Otherwise nothing is
    uploaded and a notice is printed.

    Parameters
    ----------
    file : str
        Path of the local CSV file. After renaming, the columns must include
        ``Country_Name`` and ``Series_Name``.
    path : str
        Firebase path of the destination directory,
        e.g. 'NameNode/root/user'.

    Returns
    -------
    dict
        The nested ``{country: {year: {series: value}}}`` dictionary, whether
        or not it was uploaded.
    """
    filename = file.replace(".csv","")
    data = pd.read_csv(file).dropna()

    # Normalise the column names: year columns keep only the four-digit year,
    # the remaining ones get underscores instead of spaces.
    data.columns = [c[:4] if is_year(c) else c.replace(" ","_")
                    for c in data.columns]
    years = [n for n in data.columns if n.isnumeric()]

    # Build one partition per country. The series names are taken from the
    # country's own rows so that every value stays paired with its series
    # even when some rows of that country were dropped above.
    dc = dict()
    for country in data.Country_Name.unique().tolist():
        rows = data[data['Country_Name'] == country]
        series = varlist(rows['Series_Name'].tolist())
        dc[varname(country)] = {year: dict(zip(series, rows[year]))
                                for year in years}

    if seek(path + '/' +filename).json() is None:
        failed = []
        for k, partition in dc.items():
            url = firebase_url + 'DataNode/' + k + '/' + filename + '.json'
            putResponse = requests.put(url, json.dumps(partition))
            if putResponse.status_code == 200:
                record_partition (path, k, filename, putResponse.url)
            else:
                failed.append(k)
                print (file, 'failed to uploaded at partition', k)

        # Only claim success when every partition was actually stored.
        if not failed:
            print (file, 'was succesfully uploaded to', path)
        else:
            print (file, 'was uploaded to', path, 'with',
                   len(failed), 'failed partition(s)')

        # Add the file's metadata next to its partition locations.
        file_mdata(path, file, filename)
    else:
        print (file, "already exists in", path)

    return dc
    

In [None]:
# Upload the local CSV into the 'NameNode/root/user' directory; dc keeps the
# per-country dictionary that put() returns.
filename = 'Stats_Cap_Ind.csv'
path = 'NameNode/root/user'
dc = put(filename, path)


In [13]:
# Upload a second data extract into the 'NameNode/root/user2' directory.
# NOTE(review): put() uses the file path minus '.csv' as the Firebase key, so
# the '/' characters in this path presumably create nested nodes - confirm
# that this is intended.
filename = "datasets/Data_Extract_From_Statistical_Capacity_Indicators/42377300-c075-4554-a55f-41cd64c79126_Data.csv"
path = 'NameNode/root/user2'
dc = put(filename, path)

datasets/Data_Extract_From_Statistical_Capacity_Indicators/42377300-c075-4554-a55f-41cd64c79126_Data.csv was succesfully uploaded to NameNode/root/user2


### getPartition

In [3]:
def getPartitionLocation(file):
    """Return the locations of all partitions of *file*.

    Parameters
    ----------
    file : str
        File path relative to 'NameNode/root/', without the '.csv'
        extension, e.g. 'user/Stats_Cap_Ind'.

    Returns
    -------
    dict or None
        Mapping of partition key to the URL where that partition is stored,
        as found under ``NameNode/root/<file>/partitions``. None when no
        partitions are recorded or the request could not be performed.
    """
    path = "NameNode/root/" + file + "/partitions"

    # seek() already performs the GET request; decode its response directly
    # instead of fetching the same URL a second time.
    response = seek(path)
    if response is None:
        return None
    return response.json()

In [5]:
# Look up the partition locations of the file uploaded to 'user';
# the path is relative to 'NameNode/root/' and has no '.csv' extension.
file = "user/Stats_Cap_Ind"
getPartitionLocation(file)

{'Afghanistan': 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Afghanistan/Stats_Cap_Ind.json',
 'Albania': 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Albania/Stats_Cap_Ind.json',
 'Algeria': 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Algeria/Stats_Cap_Ind.json',
 'Angola': 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Angola/Stats_Cap_Ind.json',
 'Antigua_and_Barbuda': 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Antigua_and_Barbuda/Stats_Cap_Ind.json',
 'Argentina': 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Argentina/Stats_Cap_Ind.json',
 'Armenia': 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Armenia/Stats_Cap_Ind.json',
 'Azerbaijan': 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Azerbaijan/Stats_Cap_Ind.json',
 'Bangladesh': 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Bangl

### readPartition

In [6]:
def readPartition(file, partition):
    """Return the content of one partition of *file*.

    The partition's URL is looked up with getPartitionLocation(file) under
    the key *partition*, fetched with a GET request, and the decoded JSON
    body is returned.
    """
    locations = getPartitionLocation(file)
    response = requests.get(locations[partition])
    return response.json()

In [7]:
# Read the 'Argentina' partition of the file uploaded to 'user'.
readPartition('user/Stats_Cap_Ind', 'Argentina')

{'2004': {'Access_to_water': '1',
  'Agricultural_census': '1',
  'Balance_of_payments_manual_in_use': '1',
  'Child_malnutrition': '0.66667',
  'Child_mortality': '1',
  'Consumer_price_index_base_year': '1',
  'External_debt_reporting_status': '1',
  'Gender_equality': '1',
  'Government_finance_accounting': '1',
  'HIVAIDS': '1',
  'Health_survey': '0',
  'Immunization': '1',
  'Import_and_export_price_indexes': '1',
  'Income_poverty': '1',
  'Industrial_production_index': '1',
  'Maternal_health': '1',
  'Methodology_assessment_of_statistical_capacity_scale_0__100': '100',
  'National_accounts_base_year': '1',
  'National_immunization_coverage': '1',
  'Overall_Average': '92.2222222222222',
  'Per_capita_GDP_growth': '1',
  'Periodicity_and_timeliness_assessment_of_statistical_capacity_scale_0__100': '96.66667',
  'Population_census': '1',
  'Poverty_survey': '1',
  'Primary_completion': '1',
  'Source_data_assessment_of_statistical_capacity_scale_0__100': '80',
  'Special_Data_Di

### CAT

In [8]:
def cat(path):
    """Return the full content of a file, assembled from all its partitions.

    '.csv' is removed from *path*, the partition locations are looked up with
    getPartitionLocation(), and every partition is fetched with a GET
    request.

    Returns a dict mapping each partition key to that partition's decoded
    JSON content.
    """
    locations = getPartitionLocation(path.replace('.csv',''))
    return {name: requests.get(link).json() for name, link in locations.items()}
    
    
# df = pd.DataFrame.from_dict(r.json())
# df.insert(0, 'Country', 'Argentina')


In [9]:
# Fetch every partition of the file and show the result as a DataFrame with
# one column per partition key.
file = "user/Stats_Cap_Ind.csv"
data = cat(file)
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Afghanistan,Albania,Algeria,Angola,Antigua_and_Barbuda,Argentina,Armenia,Azerbaijan,Bangladesh,Belarus,...,Ukraine,Uruguay,Uzbekistan,Vanuatu,Venezuela_RB,Vietnam,West_Bank_and_Gaza,Yemen_Rep,Zambia,Zimbabwe
2004,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '..', 'Agricultural_census...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...",...,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '..', 'Agricultural_census...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '..', 'Agricultural_census...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'..."
2005,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...",...,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '..', 'Agricultural_census...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'..."
2006,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...",...,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '..', 'Agricultural_census...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'..."
2007,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...",...,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '..', 'Agricultural_census...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'..."
2008,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...",...,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '..', 'Agricultural_census...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'..."
2009,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...",...,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'..."
2010,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...",...,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'..."
2011,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...",...,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'..."
2012,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...",...,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'..."
2013,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...",...,"{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'...","{'Access_to_water': '1', 'Agricultural_census'..."
