# Data Analysis - Introduction to Pandas

**Author**: [Gabriele Pompa](https://www.linkedin.com/in/gabrielepompa/): gabriele.pompa@unisi.com

# Table of contents

[Executive Summary](#summary)

**TODO**

### **Resources**: 

**TODO**

# Executive Summary <a name="summary"></a>

**TODO**

These are the basic imports that we need to work with NumPy, Pandas and to plot data using Matplotlib functionalities

In [1]:
# for NumPy arrays
import numpy as np

# for Pandas Series and DataFrame
import pandas as pd

# for Matplotlib plotting
import matplotlib.pyplot as plt

# to do inline plots in the Notebook
%matplotlib inline

[OS - Operating System Interfaces](https://docs.python.org/3/library/os.html)

In [2]:
# to create delete directories
import os

In [3]:
# ".." means one directory above in the directory tree
# therefore, since we are in the directory "IT_For_Business_And_Finance_2019_20/Notebooks",
# "../Data/" is equivalent to "IT_For_Business_And_Finance_2019_20/Data"

dataFolderPath = "../Data/"

if not os.path.exists(dataFolderPath):
    os.makedirs(dataFolderPath)

---

## Pickle [https://docs.python.org/3/library/pickle.html](https://docs.python.org/3/library/pickle.html)

In [4]:
rows = int(1e6)

In [5]:
mat = np.array([[i*k for i in range(1,rows+1)] for k in range(1,6)]).T

In [6]:
mat

array([[      1,       2,       3,       4,       5],
       [      2,       4,       6,       8,      10],
       [      3,       6,       9,      12,      15],
       ...,
       [ 999998, 1999996, 2999994, 3999992, 4999990],
       [ 999999, 1999998, 2999997, 3999996, 4999995],
       [1000000, 2000000, 3000000, 4000000, 5000000]])

In [7]:
mat.shape

(1000000, 5)

In [8]:
mat.dtype

dtype('int32')

[open() function](https://docs.python.org/3/tutorial/inputoutput.html#reading-and-writing-files)

In [9]:
import pickle

[with statement](https://www.geeksforgeeks.org/with-statement-in-python/)

In [10]:
with open(dataFolderPath + "mat.pkl", 'wb') as file:
    %time pickle.dump(mat, file)

Wall time: 29.1 ms


In [11]:
type(file)

_io.BufferedWriter

In [12]:
file.closed

True

In [13]:
with open(dataFolderPath + "mat.pkl", 'rb') as file:
    %time mat_reloaded = pickle.load(file)

Wall time: 22.9 ms


In [14]:
file.closed

True

In [15]:
mat_reloaded

array([[      1,       2,       3,       4,       5],
       [      2,       4,       6,       8,      10],
       [      3,       6,       9,      12,      15],
       ...,
       [ 999998, 1999996, 2999994, 3999992, 4999990],
       [ 999999, 1999998, 2999997, 3999996, 4999995],
       [1000000, 2000000, 3000000, 4000000, 5000000]])

remove file [os.remove()](https://docs.python.org/3/library/os.html#os.remove)

In [16]:
if os.path.isfile(dataFolderPath + "mat.pkl"):
    os.remove(dataFolderPath + "mat.pkl")

# double-check if file still exists
os.path.isfile(dataFolderPath + "mat.pkl")

False

multiple objects

In [17]:
mat_dict = {'mat': mat,
            'mat_squared': mat**2}

In [18]:
mat_dict['mat']

array([[      1,       2,       3,       4,       5],
       [      2,       4,       6,       8,      10],
       [      3,       6,       9,      12,      15],
       ...,
       [ 999998, 1999996, 2999994, 3999992, 4999990],
       [ 999999, 1999998, 2999997, 3999996, 4999995],
       [1000000, 2000000, 3000000, 4000000, 5000000]])

In [19]:
mat_dict['mat_squared']

array([[          1,           4,           9,          16,          25],
       [          4,          16,          36,          64,         100],
       [          9,          36,          81,         144,         225],
       ...,
       [ -731379964,  1369447440,  2007514916,  1182822464, -1104629916],
       [ -729379967,  1377447428,  2025514889,  1214822416, -1054629991],
       [ -727379968,  1385447424,  2043514880,  1246822400, -1004630016]],
      dtype=int32)

In [20]:
with open(dataFolderPath + "mat_dict.pkl", 'wb') as file:
    %time pickle.dump(mat_dict, file)

Wall time: 58.3 ms


In [21]:
with open(dataFolderPath + "mat_dict.pkl", 'rb') as file:
    %time mat_dict_reloaded = pickle.load(file)

Wall time: 43.9 ms


In [22]:
mat_dict_reloaded['mat']

array([[      1,       2,       3,       4,       5],
       [      2,       4,       6,       8,      10],
       [      3,       6,       9,      12,      15],
       ...,
       [ 999998, 1999996, 2999994, 3999992, 4999990],
       [ 999999, 1999998, 2999997, 3999996, 4999995],
       [1000000, 2000000, 3000000, 4000000, 5000000]])

In [23]:
mat_dict_reloaded['mat_squared']

array([[          1,           4,           9,          16,          25],
       [          4,          16,          36,          64,         100],
       [          9,          36,          81,         144,         225],
       ...,
       [ -731379964,  1369447440,  2007514916,  1182822464, -1104629916],
       [ -729379967,  1377447428,  2025514889,  1214822416, -1054629991],
       [ -727379968,  1385447424,  2043514880,  1246822400, -1004630016]])

In [24]:
if os.path.isfile(dataFolderPath + "mat_dict.pkl"):
    os.remove(dataFolderPath + "mat_dict.pkl")

# double-check if file still exists
os.path.isfile(dataFolderPath + "mat_dict.pkl")

False

---

---

## JSON [https://docs.python.org/3/tutorial/inputoutput.html#saving-structured-data-with-json)

In [25]:
refData = {
    'S&P Rating': ['A', 'BB', 'AA', 'CCC'],
    'Spread': [100, 300, 70, 700],
    'Country': ['USA', 'ITA', 'UK', 'ITA']
}

In [26]:
import json

In [27]:
with open(dataFolderPath + "refData.json", 'w') as file:
    %time json.dump(refData, file, indent="\t")

Wall time: 0 ns


In [28]:
type(file)

_io.TextIOWrapper

In [29]:
file.closed

True

In [30]:
with open(dataFolderPath + "refData.json", 'r') as file:
    %time refData = json.load(file)

Wall time: 998 µs


In [31]:
file.closed

True

In [32]:
refData

{'S&P Rating': ['A', 'BB', 'AA', 'CCC'],
 'Spread': [100, 300, 70, 700],
 'Country': ['USA', 'ITA', 'UK', 'ITA']}

In [33]:
if os.path.isfile(dataFolderPath + "refData.json"):
    os.remove(dataFolderPath + "refData.json")

# double-check if file still exists
os.path.isfile(dataFolderPath + "refData.json")

False

---

---

---

In [None]:
# for Yahoo Finance API
import yfinance as yf

In [None]:
data = yf.download("^GSPC", period="max")

In [None]:
data.loc['2010-01-01':, 'High'].plot()

In [None]:
data.head()

In [None]:
spx = yf.Ticker("^GSPC")
spx_hist = spx.history(period="max")

In [None]:
spx_hist.tail()

In [None]:
data2 = yf.download("SPY AAPL", start="2017-01-01", end="2017-04-30", group_by = 'ticker')

In [None]:
data2.head()

In [None]:
data2['SPY']