# File format

In [None]:
# Nested dictionary used throughout the notebook: one entry per project,
# each holding that project's language, city and start year.
proj_lang = dict(
    proj_a=dict(lang='Python', city='Stockholm', start=2022),
    proj_b=dict(lang='C/C++', city='Uppsala', start=2023),
    proj_c=dict(lang='Julia', city='Linkoping', start=2024),
)

In [None]:
# Confirm that the in-memory object is a built-in dict before it is written to disk
type(proj_lang)

In [None]:
# Save the dictionary as plain text. Only its string representation is stored,
# so the structure is lost: reading the file back yields a str, not a dict.
# The `with` block closes the file even if the write raises.
with open('proj_lang_output.txt', 'w') as fileout:
    fileout.write(str(proj_lang))

In [None]:
# Read the text file back one line at a time and echo each line.
# `content` keeps the last line read after the loop finishes.
with open("proj_lang_output.txt", mode='r') as txt_in:
    for content in txt_in:
        print(content)

In [None]:
# `content` was read back from the text file, so it is a plain str, not a dict.
# Treating it like a dictionary fails, because a str has no .items() method:
#
#for k, v in content.items():
#    print(k, v)

type(content) # str: writing str(proj_lang) to the file kept only the dictionary's text representation

# Pickle

In [None]:
import pickle # pickle converts Python objects to and from byte strings

# `dumps` returns the serialized object as a bytes value instead of writing it to a file
pickled_bytes = pickle.dumps(proj_lang)
print(pickled_bytes)

In [None]:
# Show the original dictionary and its type, separated by a blank line
print(proj_lang, type(proj_lang), sep='\n\n')

In [None]:
# Serialize the dictionary to disk. The file is opened in binary mode ('wb')
# because pickle produces bytes, not text.
pkl_path = 'proj_lang_pickle_output.pkl'  # extension does not have to be .pkl
with open(pkl_path, 'wb') as fileout:
    pickle.dump(proj_lang, fileout)

In [None]:
# Deserialize the dictionary from the pickle file written by this notebook.
# The `with` block closes the file whether or not loading succeeds, and a
# failed open() surfaces as its own error instead of a NameError on `filein`.
# NOTE: pickle.load can execute arbitrary code; only unpickle files you trust.
with open('proj_lang_pickle_output.pkl', 'rb') as filein:
    temp = pickle.load(filein)

# Unlike the text file, the pickle round trip restores a real dict,
# so its keys and values can be iterated again.
print(temp, end='\n\n')
print(type(temp), end='\n\n')
for k, v in temp.items():
    print(k, v, sep=' ||| ')

In [None]:
# Compare the unpickled object with the original dictionary (prints True when the round trip is lossless)
print(proj_lang == temp)

# Timing for writing and reading CSV and Pickle files

In [None]:
# Round-trip a NumPy array through a pickle file

import numpy as np

numpy_array = np.ones((10,10)) # a 10x10 array filled with ones

# Write the array to disk in binary mode ...
with open('numpy_array.pkl','wb') as fileout:
    pickle.dump(numpy_array, fileout)

# ... and load it back into a new variable
with open('numpy_array.pkl','rb') as filein:
    unpickled_array = pickle.load(filein)

# The restored object reports the array's shape and type after the round trip
print(f'Array shape: {unpickled_array.shape}')
print(f'Data type: {type(unpickled_array)}')

In [None]:
# Build a reproducible Pandas dataframe to serialize and deserialize

import pandas as pd
import numpy as np

np.random.seed(123) # seed the global NumPy RNG so the random columns are reproducible

n_rows = 100000 # number of rows in every column

# The columns are drawn in this order; changing the order would change the values
data = {
    'Column1': np.random.randint(0, 10, size=n_rows),
    'Column2': np.random.choice(['A', 'B', 'C'], size=n_rows),
    'Column3': np.random.rand(n_rows),
}

df = pd.DataFrame(data=data) # create Pandas dataframe

In [None]:
# Compare how long it takes to save the dataframe as a csv file and as a pickle file

import time

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals. time.time() follows the wall clock, which can be coarse or
# be adjusted while the measurement is running.
start = time.perf_counter()
df.to_csv('pandas_dataframe.csv')
end = time.perf_counter()

print("Elapsed time for saving a csv file :", end - start)

start = time.perf_counter()
df.to_pickle("pandas_dataframe.pkl")
end = time.perf_counter()

print("Elapsed time for pickle            :", end - start)

In [None]:
# Compare how long it takes to read the csv file and the pickle file back into Pandas.
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# meant for measuring short intervals (time.time() follows the wall clock).

# Reading csv file into Pandas:
# NOTE: the csv was written with to_csv's default index=True, so df_csv gets the
# saved index as an extra leading column that df_pkl does not have.

start = time.perf_counter()
df_csv = pd.read_csv("pandas_dataframe.csv")
end = time.perf_counter()
print("Time taken to read the csv file:   ", end-start)

# Reading pickle file into Pandas:

start = time.perf_counter()
df_pkl = pd.read_pickle("pandas_dataframe.pkl")
end = time.perf_counter()
print("Time taken to read the Pickle file: ", end-start)