## DSCI 551 Project

Create an emulation-based system for distributed file storage and parallel computation. <br>
1. Building an emulated distributed file system (EDFS) <br>
- EDFS should support the following commands, similar to that in HDFS:
    - mkdir: create a directory in the file system, e.g., *mkdir /user/john*
    - ls: list the content of a given directory, e.g., *ls /user*
    - cat: display the content of a file, e.g., *cat /user/john/hello.txt*
    - rm: remove a file from the file system, e.g., *rm /user/john/hello.txt*
    - put: upload a file to the file system, e.g., *put (cars.csv, /user/john, k = # partitions)* will upload a file cars.csv to the directory /user/john in EDFS. **But note that the file should be stored in k partitions, and the file system should remember where the partitions are stored.** You should design a method to partition data. You may also have the user indicate the method, e.g., hashing on a certain car attribute, in the put method. 
    - getPartitionLocations(file): this method will return the location of partitions of the file.
    - readPartition(file, partition #): this method will return the content of partition # of the specified file. The partitioned data will be needed in the second task for parallel processing. 
- **Note that EDFS should store the metadata about the file system** (similar to that in NameNode of HDFS, but much simplified). **Metadata include file system structure, attributes of files, and location of partitions storing the contents of files.** You can limit the type of files stored in the file system to certain format, e.g., .csv or JSON. 
<br><br>

#### Google Firebase address : https://dsci551-project-52d43-default-rtdb.firebaseio.com/
### Statistical Capacity Indicators 
###### Statistical Capacity Indicators provides information on various aspects of national statistical systems of developing countries, including an overall country-level statistical capacity indicator. Last Updated:02/03/2021
#### Data from : https://databank.worldbank.org/source/statistical-capacity-indicators# 



In [2]:
import pandas as pd
import numpy as np
import datetime
import requests
import csv
import json
import os
import re
from collections import OrderedDict

firebase_url = 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/'

def seek(path):
    """Send a GET request for ``path`` to the Firebase REST endpoint.

    Parameters:
        path: node path relative to ``firebase_url``; the ``.json`` suffix
            required by the REST API is appended when it is missing.

    Returns:
        The ``requests.Response`` object, or ``None`` (after printing
        ``ERROR``) when the request itself could not be performed.
    """
    # A literal suffix test is used on purpose: a regex such as '.json'
    # also matches names like 'myjson', which used to leave the URL unset.
    if path.endswith('.json'):
        url = firebase_url + path
    else:
        url = firebase_url + path + '.json'

    try:
        return requests.get(url)
    except requests.RequestException:
        print('ERROR')
        return None

### MKDIR


In [2]:
def mkdir(path):
    """Create a directory node at ``path`` unless a node already exists there.

    Parameters:
        path: node path relative to ``firebase_url`` (no ``.json`` suffix).

    Prints the URL that was written, or a notice when the node already
    exists. Nothing is written when the existence check could not be made.
    """
    response = seek(path)
    if response is None:
        # seek() has already reported the failed request.
        return

    if response.json() is not None:
        print ('Directory ', path, ' already exists')
        return

    url = firebase_url + path + '.json'
    # The database does not keep empty nodes, so a directory is represented
    # by a small metadata record (same 'type' key that file metadata uses).
    payload = json.dumps({'type': 'DIRECTORY'})
    r = requests.put(url, payload)
    print (r.url)

# requests.put('https://dsci551-project-52d43-default-rtdb.firebaseio.com/mk.json', '{"test":1}')

In [None]:
# Example call: create the node NameNode/root/user.
mkdir('NameNode/root/user') # TODO: take the path from user input

### LS

In [3]:
def ls(path):
    """Print the names of the entries stored directly under ``path``.

    Parameters:
        path: directory path; ``NameNode/root/`` is prepended when the path
            does not already mention ``NameNode/root``.

    Prints one key per line, or a single blank (" ") when the node is
    missing, is not a mapping, or could not be fetched.
    """
    # ADDING "NameNode/root/" to Firebase request path
    if not re.search('NameNode/root', path):
        path = 'NameNode/root/' + path

    # Fetch once and reuse the result: a second request doubles the network
    # traffic and could see different data than the first one.
    response = seek(path)
    listing = response.json() if response is not None else None

    if isinstance(listing, dict):
        for key in listing:
            print(key)
    else:
        print (" ")


In [None]:
# Example call: list the entries under data2/China.
ls('data2/China') # TODO: take the path from user input

### RM

In [4]:
def rm(path):
    """Delete the node stored at ``path``.

    Parameters:
        path: node path relative to ``firebase_url``; any ``.csv`` in the
            path is dropped before the lookup.

    Prints the outcome: not found, deleted, or the HTTP status of a failed
    delete request.
    """
    path = path.replace('.csv','')
    response = seek(path)
    if response is None:
        # seek() has already reported the failed request.
        return

    if response.json() is None:
        print ('Directory not found')
        return

    # NOTE(review): only the node at `path` is deleted; partition data that
    # put() wrote under DataNode/ for a file is not removed here.
    url = firebase_url + path + '.json'
    d = requests.delete(url)
    if d.status_code == 200:
        print(path, 'was succefully deleted')
    else:
        print(path, 'could not be deleted, status', d.status_code)

In [6]:
# Example calls: delete the nodes at NameNode/root/user and DataNode.
rm('NameNode/root/user')
rm('DataNode')


NameNode/root/user was succefully deleted
DataNode was succefully deleted


### PUT

In [11]:
# cleans column names for firebase json object key
def varname (var):
    """Turn ``var`` into a string usable as a Firebase JSON key.

    Every character other than ASCII letters, digits and spaces is dropped,
    then spaces become underscores. When nothing is left the placeholder
    ``invalid_key`` is returned.
    """
    stripped = re.sub(r'[^A-Za-z0-9 ]+', '', var)
    cleaned = stripped.replace(" ", "_")
    # An empty string is falsy, so `or` supplies the placeholder.
    return cleaned or "invalid_key"

def mtime():
    """Return the current time as milliseconds since the epoch (a float).

    To convert a stored value back into a readable date, divide by 1000 and
    feed the result to ``datetime.datetime.utcfromtimestamp``.
    """
    seconds = datetime.datetime.now().timestamp()
    return seconds * 1000

def filesize(file):
    """Return the size of the file at path ``file`` in bytes."""
    info = os.stat(file)
    return info.st_size

def indexing(dicts):
    """Re-key a row mapping from ``'p<N>'`` strings to the integer ``N``.

    Every ``'p'`` character is removed from each key and the remainder is
    converted with ``int``; the values are carried over unchanged.
    """
    return {int(label.replace('p','')): row for label, row in dicts.items()}

In [15]:

def record_partition(path, country, filename, url):
    """Record where one partition of a file is stored.

    Adds ``{country: url}`` to ``<path>/<filename>/partitions`` with a PATCH
    request, so earlier entries under that node are kept.

    Parameters:
        path: directory node that holds the file's metadata.
        country: partition key.
        filename: file name without the ``.csv`` extension.
        url: location of the partition's data.

    Prints ``country`` when the entry could not be recorded.
    """
    npath = firebase_url + path + "/" + filename + "/partitions.json"
    mdata = {country : url}
    try:
        putMeta = requests.patch(npath, json.dumps(mdata))
    except requests.RequestException:
        # Only network/request failures are handled here; programming
        # errors are left to surface instead of being swallowed.
        print (country)
        return

    # Report every rejected request (4xx/5xx), not only status 400.
    if not putMeta.ok:
        print (country)

def file_mdata(path, file, filename):
    """Write the metadata record of an uploaded file.

    PATCHes ``ctime``, ``name``, ``type`` and ``filesize`` into the node
    ``<path>/<filename>``.

    Parameters:
        path: directory node that holds the file entry.
        file: local path of the uploaded file (stored as ``name`` and used
            to measure the size).
        filename: name of the file entry (the node key).
    """
    target = ''.join([firebase_url, path, "/", filename, ".json"])

    attributes = dict()
    attributes['ctime'] = mtime()
    attributes['name'] = file
    attributes['type'] = 'FILE'
    attributes['filesize'] = filesize(file)

    requests.patch(target, json.dumps(attributes))
    

# partition by Country (Original plan)
def put(file, path):
    """Upload a CSV file, partitioned by the value of its first column.

    Each CSV row is stored as a ';'-joined string under the key ``p<row
    index>`` inside the partition named after ``varname(row[0])``. Partition
    data is written to ``DataNode/<partition>/<filename>``; the location of
    every partition and the file metadata are recorded under
    ``<path>/<filename>``.

    Parameters:
        file: local path of the CSV file.
        path: directory node that receives the file entry.

    Returns:
        The partition dictionary ``{partition: {'p<i>': row_string}}`` built
        from the file, whether or not anything was uploaded.
    """
    filename = file.replace(".csv","")

    # Group the rows by partition key. newline='' lets the csv module handle
    # line endings inside quoted fields itself.
    dc = dict()
    with open(file, encoding = 'utf-8', newline = '') as csvfile:
        for index, row in enumerate(csv.reader(csvfile)):
            if not row:
                # Blank lines are parsed as empty rows and have no
                # partition key.
                continue
            cname = varname(row[0])
            dc.setdefault(cname, {})['p' + str(index)] = ';'.join(row)

    existing = seek(path + '/' + filename)
    if existing is None:
        # seek() has already reported the failed request.
        return dc

    if existing.json() is not None:
        print (file, "already exists in", path)
        return dc

    failed = list()
    for key, val in dc.items():
        url = firebase_url + 'DataNode/' + key + '/' + filename + '.json'
        putResponse = requests.put(url, json.dumps(val))
        if putResponse.status_code == 200:
            record_partition (path, key, filename, putResponse.url)
        else:
            print (file, 'failed to uploaded at partition', key)
            failed.append(key)

    # Announce success only when every partition was stored.
    if not failed:
        print (file, 'was succesfully uploaded to', path)

    file_mdata(path, file, filename)
    return dc
    

In [16]:
# Upload one data set; swap the commented line in to upload the other file.
# filename = 'Stats_Cap_Ind.csv'
filename = 'Human_Capital_Index.csv'
path = 'NameNode/root/user'
# put() returns the partition dictionary it built from the CSV file.
dc = put(filename, path)


Human_Capital_Index.csv was succesfully uploaded to NameNode/root/user


### getPartition

In [3]:
def getPartitionLocation(file):
    """Return the recorded partition locations of ``file``.

    Parameters:
        file: file path below ``NameNode/root/``, with or without ``.csv``.

    Returns:
        The JSON content of ``NameNode/root/<file>/partitions`` (a mapping
        of partition key to location as written by ``record_partition``),
        or ``None`` when nothing is recorded or the request failed.
    """
    file = file.replace(".csv","")
    path = "NameNode/root/" + file + "/partitions"

    # seek() already performs the GET; its response is used directly
    # instead of requesting the same URL a second time.
    response = seek(path)
    if response is None:
        return None
    return response.json()

In [None]:
# Example: show the recorded partition locations of user/Stats_Cap_Ind.
file = "user/Stats_Cap_Ind"
getPartitionLocation(file)


### readPartition

In [10]:
def readPartition(file, partition):
    """Return the rows of one partition of ``file``, header row(s) first.

    Parameters:
        file: file path below ``NameNode/root/``, with or without ``.csv``.
        partition: partition key as recorded for the file.

    Returns:
        A list of ';'-joined row strings: the content of the header
        partition followed by the partition's rows in ascending row index.

    Raises:
        KeyError: if ``partition`` is not recorded for ``file``.
    """
    pdict = getPartitionLocation(file)
    url = pdict[partition]

    # The header row is taken from this file's own 'Country_Name' partition
    # (the key under which a "Country Name" header row is stored). The fixed
    # Stats_Cap_Ind location is kept only as a fallback for files that have
    # no such partition recorded.
    legacy_columns = 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Country_Name/Stats_Cap_Ind.json'
    columns = pdict.get('Country_Name', legacy_columns)

    rlist = list(requests.get(columns).json().values())
    getRead = indexing(requests.get(url).json())
    rlist.extend(getRead[key] for key in sorted(getRead))
    return rlist
    
#     return requests.get(url).json()

In [13]:
# Read the 'China' partition of user/Stats_Cap_Ind.
a = readPartition('user/Stats_Cap_Ind', 'China') # returns a list of ';'-joined row strings
# print(a)
# The first element supplies the column names, the remaining ones the data rows.
df = pd.DataFrame(columns = a[0].split(';'), data=[row.split(';') for row in a[1:]])
df

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 [YR2008],2009 [YR2009],...,2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
0,China,CHN,Access to water,5.51.01.09.water,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1,1,1,1,1,1.0,1,1,1
1,China,CHN,Agricultural census,3.01.04.01.agcen,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1,1,1,1,1,1.0,1,1,1
2,China,CHN,Balance of payments manual in use,2.04.01.01.excncpt,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1,1,1,1,1,1.0,1,1,1
3,China,CHN,Child malnutrition,5.51.01.02.malnut,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1,1,1,1,1,1.0,1,1,1
4,China,CHN,Child mortality,5.51.01.03.mortal,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1,1,1,1,1,1.0,1,1,1
5,China,CHN,Consumer price index base year,2.01.03.01.prcpbase,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1,1,1,1,1,1.0,1,1,1
6,China,CHN,External debt reporting status,5.04.01.01.exdebt,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1,1,1,1,1,1.0,1,1,1
7,China,CHN,Gender equality,5.51.01.07.gender,0.33333,0.33333,0.33333,0.33333,0.33333,0.66667,...,0.66667,1,1,1,1,1,1.0,1,1,1
8,China,CHN,Government finance accounting,3.02.01.02.fscov,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1,1,1,1,1,1.0,1,1,1
9,China,CHN,Health survey,5.13.01.01.hlthsurv,0.5,0.5,0.5,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0.0,0,0,0


### CAT

In [16]:
def cat(path):
    """Return the content of a stored file as a list of CSV lines.

    All partitions of the file are fetched and their rows are merged back
    into ascending row-index order.

    Parameters:
        path: file path below ``NameNode/root/``, with or without ``.csv``.

    Returns:
        A list of comma-separated lines; an empty list when no partitions
        are recorded for the file.
    """
    import io  # local import: io is not among the file's top-level imports

    def to_csv_line(stored_row):
        # Rows are stored as ';'-joined fields. Writing them through the csv
        # module quotes any field that itself contains a comma, which a
        # plain ';' -> ',' replacement would silently split in two.
        buffer = io.StringIO()
        csv.writer(buffer, lineterminator='').writerow(stored_row.split(';'))
        return buffer.getvalue()

    file = path.replace('.csv','')
    pdict = getPartitionLocation(file)
    if not pdict:
        return []

    data = dict()
    for location in pdict.values():
        rows = requests.get(location).json()
        if not rows:
            # A recorded partition whose data is gone contributes no rows.
            continue
        for key, val in rows.items():
            data[int(key.replace('p',''))] = to_csv_line(val)

    # Sort by row index and return the lines as a list.
    return [data[key] for key in sorted(data)]

# Option2: sort and return in a dictionary
#         data[k] = requests.get(v).json()
#     return (OrderedDict(sorted(data.items())))


def sprint(dct):
    """Print the values of ``dct``, one per line, in ascending key order."""
    ordered = sorted(dct.items(), key=lambda pair: pair[0])
    for _, value in ordered:
        print(value)
#         with open('testcsv.csv','w') as csvOut:
#             csvOut.write(dct[key])
    
# df = pd.DataFrame.from_dict(r.json())



In [17]:
# Example: fetch the whole file through cat() and print the returned list.
file = "user/Stats_Cap_Ind.csv"
data = cat(file)
print(data)
# sprint(data)
# df = pd.DataFrame.from_dict(data)

['Country Name,Country Code,Series Name,Series Code,2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]', 'Afghanistan,AFG,Access to water,5.51.01.09.water,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1', 'Afghanistan,AFG,Agricultural census,3.01.04.01.agcen,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0', 'Afghanistan,AFG,Balance of payments manual in use,2.04.01.01.excncpt,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1', 'Afghanistan,AFG,Child malnutrition,5.51.01.02.malnut,0.33333,0.33333,0.33333,0.33333,0.66667,0.66667,0.66667,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.66667', 'Afghanistan,AFG,Child mortality,5.51.01.03.mortal,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1', 'Afghanistan,AFG,Consumer price index base year,2.01.03.01.prcpbase,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0', 'Afghanistan,AFG,External debt reporting st

In [None]:
# Print the list returned by cat() one element per line.
for d in data:
    print (d)

### mapPartition( )

In [4]:
def mapPartition(p, columns='https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Country_Name/Stats_Cap_Ind.json'):
    """Return the rows stored at partition location ``p``, header first.

    Parameters:
        p: URL of the partition data.
        columns: URL of the node holding the header row(s). The default is
            the Stats_Cap_Ind header location; pass the matching location
            when reading partitions of another file.

    Returns:
        A list of ';'-joined row strings: the content of ``columns``
        followed by the partition's rows in ascending row index.
    """
    rlist = list(requests.get(columns).json().values())
    readMap = indexing(requests.get(p).json())
    rlist.extend(readMap[key] for key in sorted(readMap))
    return rlist
    
# function to get year columns
def is_year (c):
    """Return True when the string ``c`` contains at least one digit."""
    for char in c:
        if char.isdigit():
            return True
    return False


def new_col(cols):
    """Return the column names with digit-bearing names cut to 4 characters.

    Names for which ``is_year`` is true are replaced by their first four
    characters; all other names are kept as they are.
    """
    return [c[:4] if is_year(c) else c for c in cols]
    
def to_df(data):
    """Build a long-format DataFrame from a list of ';'-joined rows.

    The first element of ``data`` provides the column names and the rest the
    records. Column names are passed through ``new_col``; the first four
    columns are then kept as identifiers while all remaining columns are
    melted into ``Year`` / ``Value`` pairs.
    """
    header = data[0].split(';')
    records = [line.split(';') for line in data[1:]]
    frame = pd.DataFrame(records, columns=header)

    renamed = new_col(frame.columns.values)
    frame.columns = renamed

    id_columns = renamed[:4]
    return frame.melt(id_vars=id_columns, var_name='Year', value_name='Value')

In [7]:
file = "user/Stats_Cap_Ind"
partitions = getPartitionLocation(file)

# One melted DataFrame per recorded partition, in the order the partitions
# are listed. Only the locations are needed, and the loop variable avoids
# the names `dir` and `map`, which would hide the builtins for later cells.
df_list = [to_df(mapPartition(location)) for location in partitions.values()]

#maybe a dictionary???

In [17]:
# Display the DataFrame built from the second partition in the list.
df_list[1]

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,Year,Value
0,Albania,ALB,Access to water,5.51.01.09.water,2004,1
1,Albania,ALB,Agricultural census,3.01.04.01.agcen,2004,1
2,Albania,ALB,Balance of payments manual in use,2.04.01.01.excncpt,2004,1
3,Albania,ALB,Child malnutrition,5.51.01.02.malnut,2004,0.66667
4,Albania,ALB,Child mortality,5.51.01.03.mortal,2004,1
...,...,...,...,...,...,...
488,Albania,ALB,Primary completion,5.51.01.08.primcomp,2020,1
489,Albania,ALB,Source data assessment of statistical capacity...,IQ.SCI.SRCE,2020,90
490,Albania,ALB,Special Data Dissemination Standard,5.21.01.01.sdds,2020,0
491,Albania,ALB,UNESCO reporting,5.12.01.01.unesco,2020,1
