In [2]:
#----------------------------------------------------------------------------------------------------------------------
#
#                                           INTRODUCTION TO PYTHON
# 
# 02. - PANDAS (PART I)
#----------------------------------------------------------------------------------------------------------------------
import numpy  as np
import pandas as pd

pandas is a Python package providing fast, flexible, and expressive data structures designed to make working
with “relational” or “labeled” data both easy and intuitive. It aims to be the fundamental high-level building block
for doing practical, real world data analysis in Python.

https://pandas.pydata.org/pandas-docs/stable/

In [3]:
# Data Structures
#-----------------

# 01 - SERIES (1-Dimensional labeled homogeneously-typed array)
# Series data structures are value-mutable, but the length of a Series cannot be changed.

# 02 - DATAFRAME (General 2-Dimensional labeled, size-mutable tabular structure with potentially heterogeneously-typed columns)
# index (the rows)
# columns

# A DataFrame is similar to a sheet with rows and columns, while a Series is
# similar to a single column of data.

# 1 - SERIES (1-Dimensional labeled homogeneously-typed array)

In [4]:
# Values of mixed types (integers, strings and a missing value) used to build
# the example Series; the default index is 0..5.
a = [
    1, 2, 'AA',
    np.nan, '2019-07-01', 3,
]
s = pd.Series(data = a)

In [5]:
# print() writes the plain-text rendering of the Series; the trailing
# expression then displays the class of the object.
print(s)
type(s)

0             1
1             2
2            AA
3           NaN
4    2019-07-01
5             3
dtype: object


pandas.core.series.Series

In [6]:
# Look up the element stored under label 2 of the default integer index.
s[2]

'AA'

##### > Custom index names

In [7]:
# Rebuild the Series from the same values, this time labelling each element
# with a custom name instead of the default 0..5 positions.
index = ['A', 'B', 'C', 'D', 'date', 'E']
s = pd.Series(data = a, index = index)
s.head()

A                1
B                2
C               AA
D              NaN
date    2019-07-01
dtype: object

In [8]:
# Look up a single element by its custom label.
s['date']

'2019-07-01'

# 2 - DATAFRAME (General 2-Dimensional labeled, size-mutable tabular structure with potentially heterogeneously-typed columns)
index (the rows)<br>
columns

## 2.1 Creating a DataFrame with a date time index, labeled columns and random values

##### > 2.1.1 Creating a range of dates

In [9]:
# Two consecutive calendar days starting on 1 July 2019, returned as a
# DatetimeIndex (freq 'D' = one entry per day).
datelist = pd.date_range(start = '20190701', periods = 2, freq = 'D')
print(datelist)
type(datelist)

DatetimeIndex(['2019-07-01', '2019-07-02'], dtype='datetime64[ns]', freq='D')


pandas.core.indexes.datetimes.DatetimeIndex

In [10]:
# Two daily timestamps starting at the current local date and time (the time of
# day is kept). pd.Timestamp.today() replaces pd.datetime.today(): pd.datetime
# was a deprecated alias of the standard-library datetime class and is no
# longer available in current pandas releases.
datelist = pd.date_range(pd.Timestamp.today(), periods = 2)
print(datelist)

DatetimeIndex(['2019-06-20 11:12:00.480442', '2019-06-21 11:12:00.480442'], dtype='datetime64[ns]', freq='D')


In [11]:
# Ten consecutive days starting today at midnight. normalize() drops the time
# of day directly on the Timestamp. This replaces the former round trip through
# a '%d-%m-%y' string, which is ambiguous: on days 1-12 of a month the text is
# read back month-first (5 June would become 6 May). It also avoids
# pd.datetime, which is no longer available in current pandas releases.
datelist = pd.date_range(pd.Timestamp.today().normalize(), periods = 10, freq = 'D')
print(datelist)

DatetimeIndex(['2019-06-20', '2019-06-21', '2019-06-22', '2019-06-23',
               '2019-06-24', '2019-06-25', '2019-06-26', '2019-06-27',
               '2019-06-28', '2019-06-29'],
              dtype='datetime64[ns]', freq='D')


##### > DataFrame

In [12]:
# Seed NumPy's global random generator so the values are the same on every run.
np.random.seed(42)

# 10 x 5 frame of standard-normal values. The index is a fixed ten-day range
# (2019-06-20 .. 2019-06-29) rather than one derived from today's date, so the
# hard-coded date labels used by the selection examples in this notebook exist
# no matter when the notebook is executed.
df = pd.DataFrame(
    np.random.randn(10, 5),
    index = pd.date_range('2019-06-20', periods = 10, freq = 'D'),
    columns = ['A', 'B', 'C', 'D', 'E'],
)
df

Unnamed: 0,A,B,C,D,E
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591
2019-06-22,1.785864,-0.778382,0.683948,-0.428721,0.21679
2019-06-23,0.760946,-1.199143,2.001259,-0.126637,1.177833
2019-06-24,0.101409,-0.001838,-1.30463,-1.543924,0.19972
2019-06-25,-0.551135,0.280843,0.072867,-0.291706,-0.154336
2019-06-26,-0.582221,-0.310365,-0.024177,0.686229,0.603986
2019-06-27,0.544433,0.180265,-1.499154,-0.425769,0.273486
2019-06-28,-0.016795,1.158821,0.363828,-0.342749,1.219931
2019-06-29,1.047124,0.394251,0.691259,0.075901,-0.141932


##### > Exploring the data

In [13]:
# Data type of each column.
df.dtypes

A    float64
B    float64
C    float64
D    float64
E    float64
dtype: object

In [14]:
# Printed summary: index range, non-null count and dtype per column, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10 entries, 2019-06-20 to 2019-06-29
Freq: D
Data columns (total 5 columns):
A    10 non-null float64
B    10 non-null float64
C    10 non-null float64
D    10 non-null float64
E    10 non-null float64
dtypes: float64(5)
memory usage: 480.0 bytes


In [15]:
# (number of rows, number of columns)
df.shape

(10, 5)

## 2.2 Viewing Data

In [16]:
# First five rows (head() defaults to five).
df.head()

Unnamed: 0,A,B,C,D,E
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591
2019-06-22,1.785864,-0.778382,0.683948,-0.428721,0.21679
2019-06-23,0.760946,-1.199143,2.001259,-0.126637,1.177833
2019-06-24,0.101409,-0.001838,-1.30463,-1.543924,0.19972


In [17]:
# Pass a number to choose how many leading rows are returned.
df.head(10)

Unnamed: 0,A,B,C,D,E
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591
2019-06-22,1.785864,-0.778382,0.683948,-0.428721,0.21679
2019-06-23,0.760946,-1.199143,2.001259,-0.126637,1.177833
2019-06-24,0.101409,-0.001838,-1.30463,-1.543924,0.19972
2019-06-25,-0.551135,0.280843,0.072867,-0.291706,-0.154336
2019-06-26,-0.582221,-0.310365,-0.024177,0.686229,0.603986
2019-06-27,0.544433,0.180265,-1.499154,-0.425769,0.273486
2019-06-28,-0.016795,1.158821,0.363828,-0.342749,1.219931
2019-06-29,1.047124,0.394251,0.691259,0.075901,-0.141932


In [29]:
# Last five rows (tail() defaults to five).
df.tail()

Unnamed: 0,A,B,C,D,E
2019-06-25,-0.551135,0.280843,0.072867,-0.291706,-0.154336
2019-06-26,-0.582221,-0.310365,-0.024177,0.686229,0.603986
2019-06-27,0.544433,0.180265,-1.499154,-0.425769,0.273486
2019-06-28,-0.016795,1.158821,0.363828,-0.342749,1.219931
2019-06-29,1.047124,0.394251,0.691259,0.075901,-0.141932


##### > Display index (rows) and columns

In [18]:
# Row labels of the frame (here a DatetimeIndex).
df.index

DatetimeIndex(['2019-06-20', '2019-06-21', '2019-06-22', '2019-06-23',
               '2019-06-24', '2019-06-25', '2019-06-26', '2019-06-27',
               '2019-06-28', '2019-06-29'],
              dtype='datetime64[ns]', freq='D')

In [19]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

## 2.3 Selecting Data
  iloc: Purely integer-location based indexing for selection by position <br>
  loc : Purely label-location based indexer for selection by label <br>
  iat : Fast integer location scalar accessor <br>
   at : Access a single value using a label <br>



### 2.3.1 Selecting Columns

#####  > Single column by label (and getting a Series)

In [20]:
# A single label inside [] returns that column as a Series.
df['A']

2019-06-20   -0.652861
2019-06-21   -0.042220
2019-06-22    1.785864
2019-06-23    0.760946
2019-06-24    0.101409
2019-06-25   -0.551135
2019-06-26   -0.582221
2019-06-27    0.544433
2019-06-28   -0.016795
2019-06-29    1.047124
Freq: D, Name: A, dtype: float64

##### > Single column by label (and getting a DataFrame)

In [31]:
# A one-element list of labels keeps the result two-dimensional (a DataFrame).
df[['A']]

Unnamed: 0,A
2019-06-20,-0.652861
2019-06-21,-0.04222
2019-06-22,1.785864
2019-06-23,0.760946
2019-06-24,0.101409
2019-06-25,-0.551135
2019-06-26,-0.582221
2019-06-27,0.544433
2019-06-28,-0.016795
2019-06-29,1.047124


##### > Selecting more than one column by label (and getting a DataFrame)

In [21]:
# A list of labels selects several columns at once and returns a DataFrame.
df[['A' ,'B']]

Unnamed: 0,A,B
2019-06-20,-0.652861,-1.037147
2019-06-21,-0.04222,0.352408
2019-06-22,1.785864,-0.778382
2019-06-23,0.760946,-1.199143
2019-06-24,0.101409,-0.001838
2019-06-25,-0.551135,0.280843
2019-06-26,-0.582221,-0.310365
2019-06-27,0.544433,0.180265
2019-06-28,-0.016795,1.158821
2019-06-29,1.047124,0.394251


##### > Selecting a single column by attribute

In [22]:
# Attribute-style access; returns the same Series as df['A'].
df.A

2019-06-20   -0.652861
2019-06-21   -0.042220
2019-06-22    1.785864
2019-06-23    0.760946
2019-06-24    0.101409
2019-06-25   -0.551135
2019-06-26   -0.582221
2019-06-27    0.544433
2019-06-28   -0.016795
2019-06-29    1.047124
Freq: D, Name: A, dtype: float64

### 2.3.2 Selecting Rows 

##### > Rows by index

In [23]:
# An integer slice inside [] selects rows by position; the stop position is excluded.
df[0:2]

Unnamed: 0,A,B,C,D,E
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591


##### > Rows by label

In [24]:
# A slice of date strings selects rows by label on the DatetimeIndex;
# both endpoints are included in the result.
df['20190620':'20190625']

Unnamed: 0,A,B,C,D,E
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591
2019-06-22,1.785864,-0.778382,0.683948,-0.428721,0.21679
2019-06-23,0.760946,-1.199143,2.001259,-0.126637,1.177833
2019-06-24,0.101409,-0.001838,-1.30463,-1.543924,0.19972
2019-06-25,-0.551135,0.280843,0.072867,-0.291706,-0.154336


In [25]:
# Positional slice with start=0, stop=5 and step=5: starting at row 0 and
# jumping five rows at a time, only position 0 lies before the stop, so a
# single row is returned.
df[0:5 :5]

Unnamed: 0,A,B,C,D,E
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457


### 2.3.3 ILOC. Selecting by  Position (similar to  numpy/python)

##### > Selecting a row by index (and getting a Series)

In [27]:
# A scalar position returns that row as a Series indexed by the column names.
df.iloc[0]

A   -0.652861
B   -1.037147
C   -0.210493
D    1.202889
E    1.563457
Name: 2019-06-20 00:00:00, dtype: float64

##### > Selecting a row by index (and getting a DataFrame)

In [28]:
# A one-element list of positions returns the row as a one-row DataFrame.
df.iloc[[0]]

Unnamed: 0,A,B,C,D,E
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457


##### > Selecting a column by index (and getting a Series)

In [29]:
# All rows, column at position 2 (column 'C'), returned as a Series.
df.iloc[:,2]

2019-06-20   -0.210493
2019-06-21   -0.911397
2019-06-22    0.683948
2019-06-23    2.001259
2019-06-24   -1.304630
2019-06-25    0.072867
2019-06-26   -0.024177
2019-06-27   -1.499154
2019-06-28    0.363828
2019-06-29    0.691259
Freq: D, Name: C, dtype: float64

##### > Selecting a column by index (and getting a DataFrame)

In [30]:
# Wrapping the column position in a list returns a one-column DataFrame instead.
df.iloc[:,[2]]

Unnamed: 0,C
2019-06-20,-0.210493
2019-06-21,-0.911397
2019-06-22,0.683948
2019-06-23,2.001259
2019-06-24,-1.30463
2019-06-25,0.072867
2019-06-26,-0.024177
2019-06-27,-1.499154
2019-06-28,0.363828
2019-06-29,0.691259


##### > Selecting columns by index

In [31]:
# All rows, columns at positions 0, 2 and 4 (columns 'A', 'C' and 'E').
df.iloc[:, [0, 2, 4]]

Unnamed: 0,A,C,E
2019-06-20,-0.652861,-0.210493,1.563457
2019-06-21,-0.04222,-0.911397,-0.092591
2019-06-22,1.785864,0.683948,0.21679
2019-06-23,0.760946,2.001259,1.177833
2019-06-24,0.101409,-1.30463,0.19972
2019-06-25,-0.551135,0.072867,-0.154336
2019-06-26,-0.582221,-0.024177,0.603986
2019-06-27,0.544433,-1.499154,0.273486
2019-06-28,-0.016795,0.363828,1.219931
2019-06-29,1.047124,0.691259,-0.141932


##### > Selecting rows by index

In [32]:
# Rows at positions 0, 1 and 2 (the stop position is excluded), all columns.
df.iloc[0:3, :]

Unnamed: 0,A,B,C,D,E
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591
2019-06-22,1.785864,-0.778382,0.683948,-0.428721,0.21679


##### > Selecting rows and columns by index (slicing)

In [33]:
# Positional slices on both axes: rows 0-2 and columns 2-3 (stops excluded).
df.iloc[0:3, 2:4]

Unnamed: 0,C,D
2019-06-20,-0.210493,1.202889
2019-06-21,-0.911397,0.363411
2019-06-22,0.683948,-0.428721


In [34]:
# np.r_ expands each slice and concatenates the pieces into one array; the
# result here is the positions 0, 1, -2 and -1 (first two and last two).
range_A = np.r_[0:2, -2:0]
print(range_A)

[ 0  1 -2 -1]


In [35]:
# Passing the array of positions returns the first two and the last two rows.
df.iloc[range_A]

Unnamed: 0,A,B,C,D,E
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591
2019-06-28,-0.016795,1.158821,0.363828,-0.342749,1.219931
2019-06-29,1.047124,0.394251,0.691259,0.075901,-0.141932


##### > Selecting rows and columns by index

In [36]:
# Explicit lists of row positions and column positions (columns 'C' and 'E').
df.iloc[[0, 2, 3, 5, 7], [2, 4]]

Unnamed: 0,C,E
2019-06-20,-0.210493,1.563457
2019-06-22,0.683948,0.21679
2019-06-23,2.001259,1.177833
2019-06-25,0.072867,-0.154336
2019-06-27,-1.499154,0.273486


### 2.3.4 LOC : Purely label-location based indexer for selection by label

##### > Selecting a column by label (and getting a Series)

In [37]:
# All rows, the column labelled 'A', returned as a Series.
df.loc[:, 'A']

2019-06-20   -0.652861
2019-06-21   -0.042220
2019-06-22    1.785864
2019-06-23    0.760946
2019-06-24    0.101409
2019-06-25   -0.551135
2019-06-26   -0.582221
2019-06-27    0.544433
2019-06-28   -0.016795
2019-06-29    1.047124
Freq: D, Name: A, dtype: float64

##### > Selecting a column by label (and getting a DataFrame)

In [38]:
# Wrapping the label in a list returns a one-column DataFrame instead.
df.loc[:, ['A']]

Unnamed: 0,A
2019-06-20,-0.652861
2019-06-21,-0.04222
2019-06-22,1.785864
2019-06-23,0.760946
2019-06-24,0.101409
2019-06-25,-0.551135
2019-06-26,-0.582221
2019-06-27,0.544433
2019-06-28,-0.016795
2019-06-29,1.047124


##### > Selecting columns by label

In [39]:
# All rows, the columns labelled 'A' and 'B'.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2019-06-20,-0.652861,-1.037147
2019-06-21,-0.04222,0.352408
2019-06-22,1.785864,-0.778382
2019-06-23,0.760946,-1.199143
2019-06-24,0.101409,-0.001838
2019-06-25,-0.551135,0.280843
2019-06-26,-0.582221,-0.310365
2019-06-27,0.544433,0.180265
2019-06-28,-0.016795,1.158821
2019-06-29,1.047124,0.394251


##### > Selecting a row by label (and getting a Series)

In [42]:
# Select one row by a date-string label. The date must be present in df.index,
# so adjust it whenever the index covers a different range of dates.
df.loc['20190621']

A   -0.042220
B    0.352408
C   -0.911397
D    0.363411
E   -0.092591
Name: 2019-06-21 00:00:00, dtype: float64

In [43]:
# The same row, addressed with the date written as YYYY-MM-DD.
df.loc['2019-06-21']

A   -0.042220
B    0.352408
C   -0.911397
D    0.363411
E   -0.092591
Name: 2019-06-21 00:00:00, dtype: float64

In [44]:
# The same row, addressed with a Timestamp built by pd.to_datetime.
df.loc[pd.to_datetime('2019-06-21')]

A   -0.042220
B    0.352408
C   -0.911397
D    0.363411
E   -0.092591
Name: 2019-06-21 00:00:00, dtype: float64

##### > Selecting a row by label (and getting a DataFrame)

In [45]:
# A one-element list of labels returns the row as a one-row DataFrame.
df.loc[[pd.to_datetime('2019-06-21')]]

Unnamed: 0,A,B,C,D,E
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591


##### > Selecting rows by label

In [47]:
# A list of labels returns one row per entry, in list order. The same date is
# listed twice here, so that row appears twice in the result.
df.loc[[pd.to_datetime('2019-06-21'), pd.to_datetime('2019-06-21')]]

Unnamed: 0,A,B,C,D,E
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591


##### > Selecting rows by label (slicing)

In [48]:
# Label slice between two Timestamps; unlike positional slices, both endpoints are included.
df.loc[pd.to_datetime('2019-06-20'): pd.to_datetime('2019-06-25')]

Unnamed: 0,A,B,C,D,E
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591
2019-06-22,1.785864,-0.778382,0.683948,-0.428721,0.21679
2019-06-23,0.760946,-1.199143,2.001259,-0.126637,1.177833
2019-06-24,0.101409,-0.001838,-1.30463,-1.543924,0.19972
2019-06-25,-0.551135,0.280843,0.072867,-0.291706,-0.154336


##### > Selecting rows and columns by label

In [49]:
# Explicit lists of row labels and column labels.
df.loc[[pd.to_datetime('2019-06-22'), pd.to_datetime('2019-06-28')], ['A','C']]

Unnamed: 0,A,C
2019-06-22,1.785864,0.683948
2019-06-28,-0.016795,0.363828


##### > Selecting rows and columns by label (slicing)

In [50]:
# Label slices on both axes; the end labels (2019-06-27 and 'C') are included.
df.loc[pd.to_datetime('2019-06-20'): pd.to_datetime('2019-06-27'), 'A':'C']

Unnamed: 0,A,B,C
2019-06-20,-0.652861,-1.037147,-0.210493
2019-06-21,-0.04222,0.352408,-0.911397
2019-06-22,1.785864,-0.778382,0.683948
2019-06-23,0.760946,-1.199143,2.001259
2019-06-24,0.101409,-0.001838,-1.30463
2019-06-25,-0.551135,0.280843,0.072867
2019-06-26,-0.582221,-0.310365,-0.024177
2019-06-27,0.544433,0.180265,-1.499154


### 2.3.5 IAT : Fast integer location scalar accessor
AT : Access a single value using a label

##### > Get value at specified row/column pair (index)

In [51]:
# Single value at row position 1, column position 1.
df.iat[1,1]

0.3524082112448326

##### > Get value at specified row/column pair (label)

In [52]:
# Single value at the row labelled 2019-06-20 and the column labelled 'B'.
df.at[pd.to_datetime('2019-06-20'),'B']

-1.037147275983069

## 2.4 Adding new records to the data frame

### 2.4.1 Additional columns

In [53]:
# Assigning to a label that does not exist yet adds a new column; the single
# value 'new' is repeated on every row. Note that df is modified in place.
df['F'] = 'new'
df

Unnamed: 0,A,B,C,D,E,F
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457,new
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591,new
2019-06-22,1.785864,-0.778382,0.683948,-0.428721,0.21679,new
2019-06-23,0.760946,-1.199143,2.001259,-0.126637,1.177833,new
2019-06-24,0.101409,-0.001838,-1.30463,-1.543924,0.19972,new
2019-06-25,-0.551135,0.280843,0.072867,-0.291706,-0.154336,new
2019-06-26,-0.582221,-0.310365,-0.024177,0.686229,0.603986,new
2019-06-27,0.544433,0.180265,-1.499154,-0.425769,0.273486,new
2019-06-28,-0.016795,1.158821,0.363828,-0.342749,1.219931,new
2019-06-29,1.047124,0.394251,0.691259,0.075901,-0.141932,new


### 2.4.2 Setting values

##### > Entire column

In [54]:
# Overwrite every row of the column at position 5 (column 'F', added above) with 1.
df.iloc[:, 5] = 1
df

Unnamed: 0,A,B,C,D,E,F
2019-06-20,-0.652861,-1.037147,-0.210493,1.202889,1.563457,1
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591,1
2019-06-22,1.785864,-0.778382,0.683948,-0.428721,0.21679,1
2019-06-23,0.760946,-1.199143,2.001259,-0.126637,1.177833,1
2019-06-24,0.101409,-0.001838,-1.30463,-1.543924,0.19972,1
2019-06-25,-0.551135,0.280843,0.072867,-0.291706,-0.154336,1
2019-06-26,-0.582221,-0.310365,-0.024177,0.686229,0.603986,1
2019-06-27,0.544433,0.180265,-1.499154,-0.425769,0.273486,1
2019-06-28,-0.016795,1.158821,0.363828,-0.342749,1.219931,1
2019-06-29,1.047124,0.394251,0.691259,0.075901,-0.141932,1


##### > Entire row

In [55]:
# The one-row slice 0:1 combined with ':' for the columns sets every cell of
# the first row to 0.
df.iloc[0:1, :] = 0
df

Unnamed: 0,A,B,C,D,E,F
2019-06-20,0.0,0.0,0.0,0.0,0.0,0
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591,1
2019-06-22,1.785864,-0.778382,0.683948,-0.428721,0.21679,1
2019-06-23,0.760946,-1.199143,2.001259,-0.126637,1.177833,1
2019-06-24,0.101409,-0.001838,-1.30463,-1.543924,0.19972,1
2019-06-25,-0.551135,0.280843,0.072867,-0.291706,-0.154336,1
2019-06-26,-0.582221,-0.310365,-0.024177,0.686229,0.603986,1
2019-06-27,0.544433,0.180265,-1.499154,-0.425769,0.273486,1
2019-06-28,-0.016795,1.158821,0.363828,-0.342749,1.219931,1
2019-06-29,1.047124,0.394251,0.691259,0.075901,-0.141932,1


##### > Single value

In [56]:
# Assign a single cell: row position 2, column position 1 (column 'B').
df.iloc[2,1] = 13
df

Unnamed: 0,A,B,C,D,E,F
2019-06-20,0.0,0.0,0.0,0.0,0.0,0
2019-06-21,-0.04222,0.352408,-0.911397,0.363411,-0.092591,1
2019-06-22,1.785864,13.0,0.683948,-0.428721,0.21679,1
2019-06-23,0.760946,-1.199143,2.001259,-0.126637,1.177833,1
2019-06-24,0.101409,-0.001838,-1.30463,-1.543924,0.19972,1
2019-06-25,-0.551135,0.280843,0.072867,-0.291706,-0.154336,1
2019-06-26,-0.582221,-0.310365,-0.024177,0.686229,0.603986,1
2019-06-27,0.544433,0.180265,-1.499154,-0.425769,0.273486,1
2019-06-28,-0.016795,1.158821,0.363828,-0.342749,1.219931,1
2019-06-29,1.047124,0.394251,0.691259,0.075901,-0.141932,1
