In [12]:
from edfs.firebase import ls, \
    mkdir, rm


KeyboardInterrupt: 

In [None]:
import pandas as pd
import requests
import json

# Base URL of the Firebase database; helpers append a node path plus '.json' to it.
firebase_url = 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/'

def seek(path):
    '''Fetch the JSON document stored at ``path`` in the Firebase database.

    Args:
        path (str): location relative to ``firebase_url``, without the
            ``.json`` suffix.
    Returns:
        The ``requests`` response object, or ``None`` when the request
        could not be performed (``ERROR`` is printed in that case).
    '''
    url = firebase_url + path + '.json'
    try:
        return requests.get(url)
    except requests.RequestException:
        # Only transport-level failures are treated as "no answer"; anything
        # else (e.g. KeyboardInterrupt) must propagate to the caller.
        print('ERROR')
        return None
        

def ls(path: str) -> str:
    '''List the entries stored under NameNode/<path>.

    Args:
        path (str): path starting from NameNode/
    Returns:
        (str) Comma-separated entry names (folders carry a trailing '/'),
        'empty' for a directory without entries, or an error message.
    '''
    response = seek(f"NameNode/{path}")
    if response is None:
        # seek() could not reach the database and returned no response.
        return f'Path {path} could not be read'
    node = response.json()

    if node is None:
        # A null body means nothing is stored at this location.
        return f'Path {path} not found'
    if not isinstance(node, dict):
        # Any non-dict value, including falsy ones such as 0 or "", is file content.
        return f'{path} is not a folder'

    entries = [
        # Nested dicts are folders and are shown with a trailing slash.
        f'{name}/' if isinstance(child, dict) else name
        for name, child in node.items()
        if name != "_"  # "_" is the placeholder key marking an empty directory
    ]
    return ', '.join(entries) if entries else 'empty'

def mkdir(path: str) -> str:
    '''Create directory if not exists

    Args:
        path: relative path to NameNode/
    Returns:
        (str) Success or Error message
    '''
    full_path = f'NameNode/{path}'
    existing = seek(full_path)
    if existing is None:
        # The existence check itself failed, so nothing can be created safely.
        return f'Directory {path} could not be created'
    if existing.json() is not None:
        return 'Directory ' + path + ' already exists'

    url = firebase_url + full_path + '.json'
    placeholder = '{"_" : "_"}'  # marker content of an empty directory
    response = requests.put(url, placeholder)
    if response.status_code != 200:
        # Only report success when the database accepted the write.
        return f'Directory {path} could not be created'
    return f'Directory {path} created'

In [None]:
def rm(path: str) -> str:
    '''Delete directory if exists

    Args:
        path: relative path to NameNode/
    Returns:
        (str) Success or Error message
    '''
    full_path = f'NameNode/{path}'
    existing = seek(full_path)
    if existing is None:
        # The lookup itself failed; do not attempt the delete.
        return f'{path} could not be deleted'
    if existing.json() is None:
        return 'Directory not found'

    url = firebase_url + full_path + '.json'
    response = requests.delete(url)
    if response.status_code == 200:
        return path + ' was succefully deleted'
    # Every outcome must yield a message, including a rejected delete.
    return f'{path} could not be deleted (HTTP {response.status_code})'

In [None]:
def getPartitionLocation(file: str) -> str:
    '''Return the locations of partitions of the
        file
    Args:
        file (str): relative path to NameNode/
    Returns:
        (str) The content of NameNode/<file>/partitions as pretty-printed
        JSON (4-space indent, sorted keys), or an error message when
        nothing could be read there.
    '''
    path = "NameNode/" + file + "/partitions"
    # seek() already performs the GET; its response is used directly
    # instead of requesting the same URL a second time.
    response = seek(path)
    if response is None:
        return f'Partitions for {file} not found'

    pdict = response.json()
    if pdict is None:
        return f'Partitions for {file} not found'
    return json.dumps(pdict, indent=4, sort_keys=True)  # Organizing the data

In [None]:
# Dispatch table mapping a command name (as typed by the user) to the
# function implementing it.
# NOTE(review): readPartition is defined in a later cell of this notebook, so
# this cell appears to fail with NameError on a fresh top-to-bottom run — confirm
# and move it below that definition.
functions = {
    'ls': ls, 'mkdir': mkdir, 'rm': rm,
    'readPartition': readPartition,
    'getPartitionLocation': getPartitionLocation
}

In [None]:
input_text = 'readPartition root/user/Stats_Cap_Ind Argentina'
# The first space-separated token selects the command, the remaining tokens
# are its positional arguments. Unpacking directly avoids rebinding the
# built-in input().
function_name, *params = input_text.split(' ')

In [None]:
# functions[function_name](*params)

In [None]:
getPartitionLocation("Stats_Cap_Ind")

'Partitions for Stats_Cap_Ind not found'

In [None]:
def readPartition(file, partition) -> str:
    '''Return the content of partition # of
    the specified file
    Args:
        file (str): relative path to NameNode/
        partition (str):  name of the partition
    Returns:
        (str) The partition content as pretty-printed JSON (4-space indent,
        sorted keys), or 'Partition not found' when any lookup step fails.
    '''
    try:
        # getPartitionLocation returns JSON text on success and a plain
        # message otherwise; the latter fails to parse and lands in except.
        locations = json.loads(getPartitionLocation(file))
        url = locations[partition]
        content = requests.get(url).json()
        output = json.dumps(content, indent=4, sort_keys=True)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        output = 'Partition not found'
    return output

In [None]:
def readUrl(url) -> object:
    '''Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url (str): full URL to request.
    Returns:
        Whatever value the response's JSON body decodes to.
    '''
    # NOTE(review): a later cell redefines readUrl with optional filtering;
    # that definition replaces this one once it has been executed.
    response = requests.get(url)
    return response.json()

In [None]:
a = readUrl("https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Afghanistan/Stats_Cap_Ind.json")

In [None]:
# Probe the partitions node of a sample file path and keep the raw response.
file = "root/user/dasdasd"
path = f"NameNode/{file}/partitions"
rpath = seek(path)

In [None]:
rpath.json()

In [None]:
# Render every key/value pair of `result` as one "key<TAB>value" line.
# NOTE(review): `result` is only assigned in a later cell, where it holds the
# string returned by getPartitionLocation — confirm a dict is available here.
a = "\n".join("{}\t{}".format(k, v) for k, v in result.items())

In [None]:
def search_partitions(country_name: list, file_name: str, series_name: str):
    '''Look up the partition location of each requested country.

    Args:
        country_name (list): partition keys to look up.
        file_name (str): file name under root/user/.
        series_name (str): accepted but not used by this function.
    Returns:
        (list) One entry per item of ``country_name``, in the same order;
        ``None`` where the partition map has no such key.
    '''
    locations = json.loads(getPartitionLocation(f"root/user/{file_name}"))
    return [locations.get(country) for country in country_name]

In [None]:
from functools import reduce

In [None]:
def _sum(a, b):
    return a + b

def _sum2(a, b, c):
    return a + b

def _count(a, b):
    return a + 1

In [None]:
def readUrl(url, filter_year: list = None, index: str = 'year') -> object:
    '''Fetch ``url`` and return its decoded JSON body, optionally range-filtered.

    Args:
        url (str): full URL to request.
        filter_year (list): optional ``[start, end]`` pair; when given (and
            non-empty) an ``orderBy``/``startAt``/``endAt`` query string is
            appended to ``url``.
        index (str): name of the child key used for ``orderBy``.
    Returns:
        Whatever value the response's JSON body decodes to.
    '''
    filter_str = ''
    if filter_year:
        start, end = filter_year[0], filter_year[1]
        # The double quotes around the index name are part of the query value.
        filter_str = f'?orderBy="{index}"&startAt={start}&endAt={end}'
    return requests.get(f"{url}{filter_str}").json()

In [None]:
readUrl('https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Afghanistan/Stats_Cap_Ind.json', [2005, 2007])

{'error': 'Index not defined, add ".indexOn": "year", for path "/DataNode/Afghanistan/Stats_Cap_Ind", to the rules'}

In [None]:
def map_partitions(partitions: list):
    '''Collapse every partition into a single value by folding it with ``_sum``.

    Args:
        partitions (list): iterable of non-empty iterables.
    Returns:
        (list) One folded value per partition, in input order.
    '''
    return [reduce(_sum, partition) for partition in partitions]


In [None]:
map_partitions([[1,2,3,1], [1,1,1,1]])

[7, 4]

In [None]:
# reduce(_sum2, [1,2,3])

In [None]:
reduce(_count, [[1,2,3,1], [1,1,1,1]], 0)

2

In [None]:
search_partitions(["Afghanistan", "Azerbaijan"], 'Stats_Cap_Ind', None)

['https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Afghanistan/Stats_Cap_Ind.json',
 'https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Azerbaijan/Stats_Cap_Ind.json']

In [None]:
import json
a = json.dumps(result, indent=4, sort_keys=True)

In [None]:
file = "root/user/Stats_Cap_Ind"
result = getPartitionLocation(file)

In [None]:
result

'{\n    "Afghanistan": "https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Afghanistan/Stats_Cap_Ind.json",\n    "Albania": "https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Albania/Stats_Cap_Ind.json",\n    "Algeria": "https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Algeria/Stats_Cap_Ind.json",\n    "Angola": "https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Angola/Stats_Cap_Ind.json",\n    "Antigua_and_Barbuda": "https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Antigua_and_Barbuda/Stats_Cap_Ind.json",\n    "Argentina": "https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Argentina/Stats_Cap_Ind.json",\n    "Armenia": "https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Armenia/Stats_Cap_Ind.json",\n    "Azerbaijan": "https://dsci551-project-52d43-default-rtdb.firebaseio.com/DataNode/Azerbaijan/Stats_Cap_Ind.json",\n    "Bangladesh": "https://dsci551-project-52d43-def

In [None]:
dict_1 =json.loads(result)

In [None]:
rm("root/daniel")

'root/daniel was succefully deleted'

In [None]:
mkdir("data2/China")

'Directory data2/China already exists'

In [None]:
input_text = "ls dsdadsada dasd"
input = input_text.split(' ')

In [None]:
input_text = "ls root/daniel/test2"

In [None]:
functions = {'ls': ls, 'mkdir': mkdir, 'rm': rm}

# The first space-separated token selects the command, the remaining tokens
# are its positional arguments (unpacked directly so the built-in input()
# is not rebound).
function_name, *params = input_text.split(' ')

# Resolve the command before calling it, so that a KeyError raised inside a
# command is not mistaken for an unknown command name.
command = functions.get(function_name)
if command is None:
    # List exactly the commands registered above.
    output = "Command not found. Valid commands: " + ", ".join(functions)
else:
    try:
        output = command(*params)
    except TypeError:
        # Raised when the number of arguments does not match the command.
        output = "Please verify function required arguments"
print(output)

empty


In [None]:
functions[function_name](*params)

['user']

In [None]:
args = (2,3)

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("datasets/Data_Extract_From_Statistical_Capacity_Indicators/42377300-c075-4554-a55f-41cd64c79126_Data.csv")

In [None]:
# function to get year columns
def is_year(c):
    '''Return True when the label ``c`` contains at least one digit character.'''
    for char in c:
        if char.isdigit():
            return True
    return False

# Build the normalised labels: columns flagged by is_year keep only their
# first four characters, every other label has spaces replaced by underscores.
columns = df.columns
new_columns = [
    c[:4] if is_year(c) else c.replace(" ", "_")
    for c in columns
]

# Apply the new labels to the dataframe.
df.columns = new_columns

In [None]:
df = df.drop(columns=['Country_Code', 'Series_Code'], axis=1)

In [None]:
df

Unnamed: 0,Country_Name,Series_Name,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004
0,Afghanistan,Access to water,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,Afghanistan,Agricultural census,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Afghanistan,Balance of payments manual in use,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0
3,Afghanistan,Child malnutrition,0.66667,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.66667,0.66667,0.66667,0.33333,0.33333,0.33333,0.33333
4,Afghanistan,Child mortality,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4461,Zimbabwe,Primary completion,0,0.33333,0.33333,0.33333,0.33333,0.33333,0,0,0,0,0,0,0,0,0,0,0
4462,Zimbabwe,Source data assessment of statistical capacity...,60,60,60,60,60,60,60,60,60,60,60,50,40,40,50,50,50
4463,Zimbabwe,Special Data Dissemination Standard,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4464,Zimbabwe,UNESCO reporting,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0


In [None]:
df.loc[
    (df['Country_Name'] == 'Afghanistan') & (df['Series_Name'] == 'Child malnutrition')]

Unnamed: 0,Country_Name,Series_Name,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004
3,Afghanistan,Child malnutrition,0.66667,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.33333,0.66667,0.66667,0.66667,0.33333,0.33333,0.33333,0.33333


In [None]:
# Reshape from wide to long: every non-id column becomes one
# (Year, Value) row per Country_Name / Series_Name pair.
df_melted = pd.melt(
    df,
    id_vars=["Country_Name", "Series_Name"],
    var_name="Year",
    value_name="Value",
)

In [None]:
df_melted['Year'] = df_melted['Year'].astype(int)

In [None]:
import numpy as np

In [None]:
df_melted = df_melted.loc[df_melted['Value'] != '..'].copy()

In [None]:
df_melted.loc[df_melted['Value'] != '..']['Value'].astype(float)

0         1.00000
1         0.00000
2         1.00000
3         0.66667
4         1.00000
           ...   
75917     0.00000
75918    50.00000
75919     0.00000
75920     0.00000
75921     0.00000
Name: Value, Length: 69959, dtype: float64

In [None]:
df_melted['Value'] = df_melted['Value'].astype(float)

In [None]:
df_melted.loc[
    (df_melted['Country_Name'] == 'Afghanistan') & (df_melted['Series_Name'] == 'Child malnutrition')]

Unnamed: 0,Country_Name,Series_Name,Year,Value
3,Afghanistan,Child malnutrition,2020,0.66667
4469,Afghanistan,Child malnutrition,2019,0.33333
8935,Afghanistan,Child malnutrition,2018,0.33333
13401,Afghanistan,Child malnutrition,2017,0.33333
17867,Afghanistan,Child malnutrition,2016,0.33333
22333,Afghanistan,Child malnutrition,2015,0.33333
26799,Afghanistan,Child malnutrition,2014,0.33333
31265,Afghanistan,Child malnutrition,2013,0.33333
35731,Afghanistan,Child malnutrition,2012,0.33333
40197,Afghanistan,Child malnutrition,2011,0.33333


In [None]:
df_melted.loc[
    (df_melted['Country_Name'] == 'Afghanistan')]# & (df_melted['Series_Name'] == 'Child malnutrition')]

Unnamed: 0,Country_Name,Series_Name,Year,Value
0,Afghanistan,Access to water,2020,1
1,Afghanistan,Agricultural census,2020,0
2,Afghanistan,Balance of payments manual in use,2020,1
3,Afghanistan,Child malnutrition,2020,0.66667
4,Afghanistan,Child mortality,2020,1
...,...,...,...,...
71480,Afghanistan,Primary completion,2004,0
71481,Afghanistan,Source data assessment of statistical capacity...,2004,20
71482,Afghanistan,Special Data Dissemination Standard,2004,0
71483,Afghanistan,UNESCO reporting,2004,0


In [None]:
# Row filter: selected countries, one series, years 2000 through 2020 inclusive.
mask = (
    df_melted["Country_Name"].isin(["Albania"])
    & df_melted["Series_Name"].eq("Access to water")
    & df_melted["Year"].between(2000, 2020)
)

In [None]:
data_list = []
filtered_dataset = df_melted.loc[mask].copy()
# One dict per country, in order of first appearance in the filtered rows.
for country_name in filtered_dataset["Country_Name"].unique():
    df_temp = filtered_dataset.loc[filtered_dataset["Country_Name"] == country_name]
    dict_append = dict(
        x=df_temp["Year"],
        y=df_temp["Value"],
        type="lines",
        hovertemplate="%{y:.2f}<extra></extra>",
        name=country_name,
    )
    data_list.append(dict_append)

In [None]:
df_melted.loc[df_melted['Series_Name'] == 'Access to water'].groupby('Year').agg({'Value': 'median'})

Unnamed: 0_level_0,Value
Year,Unnamed: 1_level_1
2004,1.0
2005,1.0
2006,1.0
2007,1.0
2008,1.0
2009,1.0
2010,1.0
2011,1.0
2012,1.0
2013,1.0


In [None]:
df_melted.loc[(df_melted.Country_Name == 'Afghanistan')
        & (df_melted.Series_Name == 'Access to water')
        & (df_melted.Year >= 2000)
        & (df_melted.Year <= 2020)]

Unnamed: 0,Country_Name,Series_Name,Year,Value
0,Afghanistan,Access to water,2020,1
4466,Afghanistan,Access to water,2019,1
8932,Afghanistan,Access to water,2018,1
13398,Afghanistan,Access to water,2017,1
17864,Afghanistan,Access to water,2016,1
22330,Afghanistan,Access to water,2015,1
26796,Afghanistan,Access to water,2014,1
31262,Afghanistan,Access to water,2013,1
35728,Afghanistan,Access to water,2012,1
40194,Afghanistan,Access to water,2011,1


# Spark

In [1]:
from pyspark.sql import SparkSession

In [2]:
data_list

NameError: name 'data_list' is not defined

In [3]:
# spark = SparkSession.builder.appName('DataLake_Ingestion').getOrCreate()

In [4]:
# Create (or reuse) a Spark session against a master at a hard-coded address.
# NOTE(review): the count() job submitted below was interrupted after repeated
# "Initial job has not accepted any resources" warnings — confirm the cluster
# has registered workers before relying on this session.
spark = SparkSession.builder.master("spark://3.82.248.62:7077").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/15 21:28:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
rdd = spark.sparkContext.parallelize([1,2,1,1,11])

In [6]:
rdd.count()

[Stage 0:>                                                          (0 + 0) / 2]

22/11/15 21:29:35 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:29:50 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:30:05 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:30:20 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 0:>                                                          (0 + 0) / 2]

22/11/15 21:30:35 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:30:50 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:31:05 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:31:20 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 0:>                                                          (0 + 0) / 2]

22/11/15 21:31:35 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:31:50 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:32:05 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:32:20 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 0:>                                                          (0 + 0) / 2]

22/11/15 21:32:35 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:32:50 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:33:05 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:33:20 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 0:>                                                          (0 + 0) / 2]

22/11/15 21:33:35 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:33:50 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:34:05 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:34:20 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 0:>                                                          (0 + 0) / 2]

22/11/15 21:34:35 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:34:50 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:35:05 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:35:20 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 0:>                                                          (0 + 0) / 2]

22/11/15 21:35:35 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:35:50 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:36:05 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:36:20 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 0:>                                                          (0 + 0) / 2]

22/11/15 21:36:35 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:36:50 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:37:05 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/11/15 21:37:20 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


ERROR:root:KeyboardInterrupt while sending command.                 (0 + 0) / 2]
Traceback (most recent call last):
  File "/Users/danieldacosta/miniconda3/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/danieldacosta/miniconda3/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/danieldacosta/miniconda3/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 