# File format

In [1]:
# Sample nested dictionary: one entry per project, each holding the project's
# programming language, city, and start year.
proj_lang = dict(
    proj_a=dict(lang="Python", city="Stockholm", start=2022),
    proj_b=dict(lang="C/C++", city="Uppsala", start=2023),
    proj_c=dict(lang="Julia", city="Linkoping", start=2024),
)

In [2]:
# Confirm that proj_lang is a built-in dictionary before it is written to disk.
type(proj_lang)

dict

In [3]:
# Write the dictionary's string representation to a plain-text file.
# The `with` block guarantees the file is flushed and closed, even if
# `write` raises.
with open('proj_lang_output.txt', 'w') as fileout:
    fileout.write(str(proj_lang))

In [4]:
# Read the text file back one line at a time. `content` keeps the last line
# read, so it is still available after the loop finishes.
with open("proj_lang_output.txt") as filein:
    for content in filein:
        print(content)

{'proj_a': {'lang': 'Python', 'city': 'Stockholm', 'start': 2022}, 'proj_b': {'lang': 'C/C++', 'city': 'Uppsala', 'start': 2023}, 'proj_c': {'lang': 'Julia', 'city': 'Linkoping', 'start': 2024}}


In [5]:
# The text file stores only the string representation of the dictionary, so
# each line read back is a plain `str` and dictionary methods are unavailable.
# Uncommenting the loop below raises
# AttributeError: 'str' object has no attribute 'items':
#
#     for k, v in content.items():
#         print(k, v)

type(content) # str, not dict: the nested dictionary was flattened to text by str() before writing

str

# Pickle

In [6]:
import pickle  # pickle serializes Python objects into byte strings

# `dumps` returns the serialized object as a `bytes` value.
pickled_bytes = pickle.dumps(proj_lang)
print(pickled_bytes)

b'\x80\x04\x95\x92\x00\x00\x00\x00\x00\x00\x00}\x94(\x8c\x06proj_a\x94}\x94(\x8c\x04lang\x94\x8c\x06Python\x94\x8c\x04city\x94\x8c\tStockholm\x94\x8c\x05start\x94M\xe6\x07u\x8c\x06proj_b\x94}\x94(h\x03\x8c\x05C/C++\x94h\x05\x8c\x07Uppsala\x94h\x07M\xe7\x07u\x8c\x06proj_c\x94}\x94(h\x03\x8c\x05Julia\x94h\x05\x8c\tLinkoping\x94h\x07M\xe8\x07uu.'


In [7]:
# Show the original dictionary and its type, separated by a blank line.
print(proj_lang, type(proj_lang), sep='\n\n')

{'proj_a': {'lang': 'Python', 'city': 'Stockholm', 'start': 2022}, 'proj_b': {'lang': 'C/C++', 'city': 'Uppsala', 'start': 2023}, 'proj_c': {'lang': 'Julia', 'city': 'Linkoping', 'start': 2024}}

<class 'dict'>


In [8]:
# Serialize the dictionary to disk. Pickle data is binary, so the file is
# opened in 'wb' mode; the .pkl extension is only a convention.
pickle_path = 'proj_lang_pickle_output.pkl'
with open(pickle_path, 'wb') as pkl_out:
    pickle.dump(proj_lang, pkl_out)

In [9]:
# Load the pickled dictionary back from disk.
# The `with` block closes the file even if unpickling fails, and a missing
# file surfaces as the original FileNotFoundError instead of a NameError on
# an unassigned `filein`.
# NOTE: only unpickle files from a trusted source; `pickle.load` can execute
# arbitrary code embedded in the data.
with open('proj_lang_pickle_output.pkl', 'rb') as filein:
    temp = pickle.load(filein)

# The round trip restores a real dict, so its items can be iterated again.
print(temp, end='\n\n')
print(type(temp), end='\n\n')
for k, v in temp.items():
    print(k, v, sep=' ||| ')

{'proj_a': {'lang': 'Python', 'city': 'Stockholm', 'start': 2022}, 'proj_b': {'lang': 'C/C++', 'city': 'Uppsala', 'start': 2023}, 'proj_c': {'lang': 'Julia', 'city': 'Linkoping', 'start': 2024}}

<class 'dict'>

proj_a ||| {'lang': 'Python', 'city': 'Stockholm', 'start': 2022}
proj_b ||| {'lang': 'C/C++', 'city': 'Uppsala', 'start': 2023}
proj_c ||| {'lang': 'Julia', 'city': 'Linkoping', 'start': 2024}


In [10]:
# The unpickled object should compare equal to the original dictionary.
round_trip_equal = proj_lang == temp
print(round_trip_equal)

True


# Timing for writing and reading CSV and Pickle files

In [11]:
# Round-trip a NumPy array through pickle: serialize it to a file, then load it back.

import numpy as np

numpy_array = np.ones(shape=(10, 10))  # 10x10 array filled with ones

array_path = 'numpy_array.pkl'

# Serialize the array to a binary file ...
with open(array_path, 'wb') as pkl_out:
    pickle.dump(numpy_array, pkl_out)

# ... and deserialize it again.
with open(array_path, 'rb') as pkl_in:
    unpickled_array = pickle.load(pkl_in)

# Report the shape and type of the array that came back.
print(f'Array shape: {unpickled_array.shape}')
print(f'Data type: {type(unpickled_array)}')

Array shape: (10, 10)
Data type: <class 'numpy.ndarray'>


In [12]:
# Build a reproducible 100,000-row Pandas dataframe to serialize and deserialize.

import pandas as pd
import numpy as np

# Fixed seed so the random columns are the same on every run.
np.random.seed(123)

n_rows = 100000

# The three columns are drawn in this order from the seeded global generator.
data = {
    'Column1': np.random.randint(0, 10, size=n_rows),           # integers 0..9
    'Column2': np.random.choice(['A', 'B', 'C'], size=n_rows),  # one of three labels
    'Column3': np.random.rand(n_rows),                          # floats in [0, 1)
}

df = pd.DataFrame(data)  # create Pandas dataframe

In [13]:
# Time how long it takes to save the dataframe as a CSV file and as a pickle file.

import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the system clock and can be too coarse
# for millisecond-scale timings like these.
start = time.perf_counter()
df.to_csv('pandas_dataframe.csv')
end = time.perf_counter()

print("Elapsed time for saving a csv file :", end - start)

start = time.perf_counter()
df.to_pickle("pandas_dataframe.pkl")
end = time.perf_counter()

print("Elapsed time for pickle            :", end - start)

Elapsed time for saving a csv file : 0.1548299789428711
Elapsed time for pickle            : 0.002086162567138672


In [14]:
# Time how long it takes to read the CSV and pickle files back into Pandas.
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# meant for short intervals; time.time() can be too coarse at this scale.

# Reading the csv file into Pandas:
# NOTE: the CSV was written with to_csv's default index=True, so df_csv
# carries an extra unnamed index column that df_pkl does not have.

start = time.perf_counter()
df_csv = pd.read_csv("pandas_dataframe.csv")
end = time.perf_counter()
print("Time taken to read the csv file:   ", end-start)

# Reading the Pickle file into Pandas:
# NOTE: only read pickle files from a trusted source; unpickling can execute
# arbitrary code embedded in the file.

start = time.perf_counter()
df_pkl = pd.read_pickle("pandas_dataframe.pkl")
end = time.perf_counter()
print("Time taken to read the Pickle file: ", end-start)

Time taken to read the csv file:    0.016301870346069336
Time taken to read the Pickle file:  0.0019116401672363281
