## DSCI 551 Project

Create an emulation-based system for distributed file storage and parallel computation. <br>
1. Building an emulated distributed file system (EDFS) <br>
- EDFS should support the following commands, similar to that in HDFS:
    - mkdir: create a directory in the file system, e.g., *mkdir /user/john*
    - ls: list the content of a given directory, e.g., *ls /user*
    - cat: display the content of a file, e.g., *cat /user/john/hello.txt*
    - rm: remove a file from the file system, e.g., *rm /user/john/hello.txt*
    - put: upload a file to the file system, e.g., *put (cars.csv, /user/john, k = # partitions)* will upload a file cars.csv to the directory /user/john in EDFS. **But note that the file should be stored in k partitions, and the file system should remember where the partitions are stored.** You should design a method to partition data. You may also have the user indicate the method, e.g., hashing on a certain car attribute, in the put method. 
    - getPartitionLocations(file): this method will return the location of partitions of the file.
    - readPartition(file, partition #): this method will return the content of partition # of the specified file. The partitioned data will be needed in the second task for parallel processing. 
- **Note that EDFS should store the metadata about the file system** (similar to that in NameNode of HDFS, but much simplified). **Metadata include file system structure, attributes of files, and location of partitions storing the contents of files.** You can limit the type of files stored in the file system to certain format, e.g., .csv or JSON. 
<br><br>

#### Google Firebase address : https://dsci551-project-52d43-default-rtdb.firebaseio.com/
### Statistical Capacity Indicators 
###### Statistical Capacity Indicators provides information on various aspects of national statistical systems of developing countries, including an overall country-level statistical capacity indicator. Last Updated:02/03/2021
#### Data from : https://databank.worldbank.org/source/statistical-capacity-indicators# 




In [1]:
import json

import numpy as np
import pandas as pd
import requests

# Base URL of the Firebase Realtime Database used by the functions below;
# they append a node path plus a '.json' suffix to it to build request URLs.
firebase_url = 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/'

In [18]:
def seek(path):
    """Fetch the Firebase node stored at ``path``.

    Args:
        path: Node path relative to ``firebase_url`` (without the ``.json``
            suffix, which is appended here).

    Returns:
        The ``requests.Response`` of the GET request, or ``None`` when the
        request could not be performed (``'ERROR'`` is printed in that case).
    """
    url = firebase_url + path + '.json'
    try:
        return requests.get(url)
    except requests.exceptions.RequestException:
        # Only request-level failures are swallowed; a bare ``except`` would
        # also hide KeyboardInterrupt and genuine programming errors.
        print('ERROR')
        return None
        
        
        
def ls(path):
    """List the names of the entries directly under ``path``.

    Args:
        path: Node path relative to ``firebase_url``.

    Returns:
        A list with the keys of the JSON object stored at ``path``. An empty
        list is returned when the request failed, when nothing is stored at
        ``path`` (the decoded body is ``None``), or when the node is not a
        JSON object and therefore has no child keys.
    """
    response = seek(path)
    if response is None:
        # seek() already reported the request failure.
        return []

    node = response.json()
    if not isinstance(node, dict):
        # Missing node or a leaf value: there are no children to list.
        return []

    return list(node)


        
def mkdir(path):
    """Create a directory node at ``path`` unless something is stored there.

    When the node is empty, a placeholder JSON object is written with a PUT
    request and the request URL is printed; otherwise a message saying the
    directory already exists is printed.

    Args:
        path: Node path relative to ``firebase_url``.
    """
    existing = seek(path).json()
    if existing is not None:
        print ('Directory ', path, ' already exists')
        return

    # The placeholder payload is what makes the node exist after the PUT.
    placeholder = '{"test2":"2"}'
    target = firebase_url + path + '.json'
    response = requests.put(target, placeholder)
    print (response.url)

        
def rm(path):
    """Delete the node stored at ``path``.

    Prints 'Directory not found' when nothing is stored at ``path``;
    otherwise issues a DELETE request and prints a confirmation when the
    response status code is 200.

    Args:
        path: Node path relative to ``firebase_url``.
    """
    if seek(path).json() is not None:
        response = requests.delete(firebase_url + path + '.json')
        if response.status_code == 200:
            print(path, 'was succefully deleted')
    else:
        print ('Directory not found')

In [None]:
# function to get year columns
def is_year(c):
    """Return True when the column label ``c`` contains at least one digit."""
    for char in c:
        if char.isdigit():
            return True
    return False

def record_partition(data, path):
    """Record, under ``path``, the URL of each partition stored in DataNode2.

    For every key of ``data`` a GET request is made for ``DataNode2/<key>``
    and, if nothing is stored yet at ``<path>/<key>``, a ``{key: url}``
    mapping holding that request's URL is written there.

    Args:
        data: Mapping whose keys name the partitions.
        path: Node path (relative to ``firebase_url``) receiving the metadata.
    """
    for key in data:
        # Only the URL of this response is used below.
        data_response = seek('DataNode2/' + key)

        meta_path = path + '/' + key
        if seek(meta_path).json() is not None:
            # Metadata for this partition is already present.
            continue

        meta_url = firebase_url + meta_path + '.json'
        requests.put(meta_url, json.dumps({key: data_response.url}))

# partition by Country (Original plan)
def put(file, path):
    """Upload a CSV file to the ``DataNode2`` node, one partition per country.

    The CSV is read with pandas and rows containing missing values are
    dropped. Column labels containing a digit are shortened to their first
    four characters (the year); in all other labels spaces become
    underscores. The data is then arranged as::

        {country: {'Statistical_Capacity_Indicators': {year: {series: value}}}}

    If nothing is stored at ``path`` yet, the structure is written to
    ``DataNode2`` and, on a 200 response, ``record_partition`` stores the
    partition locations under ``path``.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.
        path: Node path (relative to ``firebase_url``) for the file metadata.

    Returns:
        The nested dictionary built from the file, whether or not it was
        uploaded.
    """
    data = pd.read_csv(file).dropna()

    # Year columns keep only the leading year; other labels get underscores.
    data.columns = [
        c[:4] if is_year(c) else c.replace(" ", "_") for c in data.columns
    ]
    years = [n for n in data.columns if n.isnumeric()]

    dc = dict()
    for raw_country in data.Country_Name.unique().tolist():
        # Filter with the ORIGINAL name: the sanitized one no longer matches
        # the column for countries whose name contains '.'.
        rows = data[data['Country_Name'] == raw_country]

        # '.' and '/' are replaced because they are invalid in Firebase keys.
        country = raw_country.replace('.', '')
        # Series names are taken from this country's own rows so that they
        # stay aligned with the values even after dropna() removed rows.
        series = [name.replace('/', '-') for name in rows['Series_Name']]

        dc[country] = {
            'Statistical_Capacity_Indicators': {
                # tolist() yields plain Python scalars, which json can encode.
                year: dict(zip(series, rows[year].tolist())) for year in years
            }
        }

    if seek(path).json() is not None:
        print (file, "already exists in", path)
        return dc

    url = firebase_url + 'DataNode2' + '.json'
    putResponse = requests.put(url, json.dumps(dc))
    if putResponse.status_code == 200:
        record_partition(dc, path)
        print(file, 'was succefully uploaded')
    else:
        print ('Error')
    return dc
    

### partition by k = # partitions

In [None]:

# function to get year columns
def is_year(c):
    """Tell whether the column label ``c`` has any digit character in it."""
    digits = (char for char in c if char.isdigit())
    return next(digits, None) is not None


# partition the countries into k groups (defaults to one partition per country)
def put(file, path, k=None):
    """Upload a CSV file to the ``K-partition`` node, split into ``k`` groups.

    The CSV is read with pandas and rows containing missing values are
    dropped. Column labels containing a digit are shortened to their first
    four characters (the year); in all other labels spaces become
    underscores. The distinct countries are split into ``k`` consecutive
    groups and the data is arranged as::

        {"c1,c2,...": {country: {'Statistical_Capacity_Indicators':
                                 {year: {series: value}}}}}

    where each partition key is the comma-joined list of its countries.
    If nothing is stored at ``path``, the structure is written to
    ``K-partition``.

    NOTE: this function writes nothing under ``path`` itself, so the
    ``seek(path)`` check cannot detect an earlier upload made by it.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.
        path: Node path (relative to ``firebase_url``) checked for existence.
        k: Number of partitions; defaults to one partition per country.

    Returns:
        The nested dictionary built from the file, whether or not it was
        uploaded.

    Raises:
        ValueError: If the number of partitions is smaller than 1.
    """
    data = pd.read_csv(file).dropna()

    # Year columns keep only the leading year; other labels get underscores.
    data.columns = [
        c[:4] if is_year(c) else c.replace(" ", "_") for c in data.columns
    ]
    years = [n for n in data.columns if n.isnumeric()]

    countries = data.Country_Name.unique().tolist()
    if k is None:
        k = len(countries)
    if k < 1:
        raise ValueError('number of partitions must be at least 1')

    dc = dict()
    for group in np.array_split(countries, k):
        raw_names = group.tolist()
        if not raw_names:
            # k exceeds the number of countries: skip the empty partitions
            # instead of emitting an empty-string key.
            continue

        # '.' is replaced because it is invalid in Firebase keys.
        clean_names = [name.replace('.', '') for name in raw_names]

        partition = dict()
        for raw_country, country in zip(raw_names, clean_names):
            # Filter with the ORIGINAL name: the sanitized one no longer
            # matches the column for countries whose name contains '.'.
            rows = data[data['Country_Name'] == raw_country]
            # Series names come from this country's own rows so that they
            # stay aligned with the values even after dropna() removed rows.
            series = [name.replace('/', '-') for name in rows['Series_Name']]
            partition[country] = {
                'Statistical_Capacity_Indicators': {
                    # tolist() yields plain Python scalars for json.dumps.
                    year: dict(zip(series, rows[year].tolist()))
                    for year in years
                }
            }
        dc[','.join(clean_names)] = partition

    if seek(path).json() is not None:
        print (file, "already exists in", path)
        return dc

    url = firebase_url + 'K-partition' + '.json'
    putResponse = requests.put(url, json.dumps(dc))
    if putResponse.status_code == 200:
        print(file, 'was succefully uploaded')
    else:
        print ('Error')
    return dc


In [None]:
'''
1. need to get the partition locations
2. read partitions
3. cat... waiting for Data format
'''