# Learning the fundamentals of pandas
#### Author: Alphonse Brandon


In [3]:
# Importing libraries and modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<h4>Creating Objects in pandas</h4>


Creating a series and passing a list of values

In [4]:
# Build a Series from a plain Python list; pandas attaches a default integer
# index, and the np.nan entry stands in for a missing value.
series_values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(series_values)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns

In [5]:
# Six consecutive calendar days starting on 2013-01-01 (daily frequency).
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# Draw the 6x4 standard-normal sample from a seeded Generator so that
# Restart & Run All rebuilds exactly the same frame every time. The earlier
# unseeded np.random.randn call produced new numbers on each run, so outputs
# saved from previous runs will not match a fresh execution.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.241879,0.759868,-0.890722,0.620202
2013-01-02,0.145822,0.513743,-0.628044,0.159255
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438
2013-01-04,-0.07047,1.574316,-2.608989,-1.81274
2013-01-05,-1.513,1.700117,1.897296,1.648903
2013-01-06,0.754513,-0.582074,0.681169,0.69561


Creating a DataFrame by passing a dict of objects that can be converted into a series-like structure

In [7]:
# Each dict entry becomes one column. The array-like values all have four
# elements; the scalar values ('A', 'B', 'F') are repeated on every row.
n_rows = 4
df2_columns = {
    'A': 1,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(n_rows)), dtype='float32'),
    'D': np.array([3] * n_rows, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
}
df2 = pd.DataFrame(df2_columns)

df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1.0,3,test,foo
1,1,2013-01-02,1.0,3,train,foo
2,1,2013-01-02,1.0,3,test,foo
3,1,2013-01-02,1.0,3,train,foo


The columns of the resulting DataFrame have different dtypes

In [8]:
# Display the dtype of every column of df2 (one entry per column)

df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### Viewing Data

Seeing the top and bottom of the frame

In [15]:
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,-0.241879,0.759868,-0.890722,0.620202
2013-01-02,0.145822,0.513743,-0.628044,0.159255


In [16]:
df.tail(2)

Unnamed: 0,A,B,C,D
2013-01-05,-1.513,1.700117,1.897296,1.648903
2013-01-06,0.754513,-0.582074,0.681169,0.69561


Displaying the index, columns and underlying NumPy array

In [17]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [19]:
# DataFrame.to_numpy() is the accessor pandas recommends over the older
# .values attribute for getting the frame's data as a NumPy array.
df.to_numpy()

array([[-0.241879  ,  0.75986765, -0.89072198,  0.62020239],
       [ 0.14582237,  0.51374285, -0.62804437,  0.15925495],
       [ 2.16232875, -0.09918239, -0.24950794, -1.26843839],
       [-0.07047014,  1.5743156 , -2.60898932, -1.81273986],
       [-1.51300012,  1.70011715,  1.89729598,  1.64890344],
       [ 0.75451321, -0.58207398,  0.68116917,  0.69560976]])

`describe()` shows a quick statistical summary of the DataFrame

In [21]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.206219,0.644464,-0.2998,0.007132
std,1.213565,0.901869,1.52193,1.30441
min,-1.513,-0.582074,-2.608989,-1.81274
25%,-0.199027,0.054049,-0.825053,-0.911515
50%,0.037676,0.636805,-0.438776,0.389729
75%,0.602341,1.370704,0.4485,0.676758
max,2.162329,1.700117,1.897296,1.648903


Transposing the data

In [22]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.241879,0.145822,2.162329,-0.07047,-1.513,0.754513
B,0.759868,0.513743,-0.099182,1.574316,1.700117,-0.582074
C,-0.890722,-0.628044,-0.249508,-2.608989,1.897296,0.681169
D,0.620202,0.159255,-1.268438,-1.81274,1.648903,0.69561


Sorting the data by the values of a column

In [23]:
df.sort_values(by = 'B')

Unnamed: 0,A,B,C,D
2013-01-06,0.754513,-0.582074,0.681169,0.69561
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438
2013-01-02,0.145822,0.513743,-0.628044,0.159255
2013-01-01,-0.241879,0.759868,-0.890722,0.620202
2013-01-04,-0.07047,1.574316,-2.608989,-1.81274
2013-01-05,-1.513,1.700117,1.897296,1.648903


Selecting and Getting data

In [24]:
df['A']

2013-01-01   -0.241879
2013-01-02    0.145822
2013-01-03    2.162329
2013-01-04   -0.070470
2013-01-05   -1.513000
2013-01-06    0.754513
Freq: D, Name: A, dtype: float64

Selecting a range of rows

In [25]:
# Positional slice via []: selects rows 0, 1 and 2 -- the stop position (3) is excluded.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.241879,0.759868,-0.890722,0.620202
2013-01-02,0.145822,0.513743,-0.628044,0.159255
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438


In [26]:
# Label slice via [] on the DatetimeIndex: unlike a positional slice, both endpoints are included.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.145822,0.513743,-0.628044,0.159255
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438
2013-01-04,-0.07047,1.574316,-2.608989,-1.81274


Selection by label

In [28]:
# Passing a one-element list of labels makes .loc return a one-row DataFrame rather than a Series.
df.loc[[dates[0]]]

Unnamed: 0,A,B,C,D
2013-01-01,-0.241879,0.759868,-0.890722,0.620202


In [29]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.241879,0.759868
2013-01-02,0.145822,0.513743
2013-01-03,2.162329,-0.099182
2013-01-04,-0.07047,1.574316
2013-01-05,-1.513,1.700117
2013-01-06,0.754513,-0.582074


Label slicing

In [30]:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,0.145822,0.513743
2013-01-03,2.162329,-0.099182
2013-01-04,-0.07047,1.574316


Reduction in the dimension of returned objects

In [33]:
df.loc['20130102', ['A', 'B']]

A    0.145822
B    0.513743
Name: 2013-01-02 00:00:00, dtype: float64

Getting a specific value

In [35]:
df.loc[dates[0], 'A']

-0.2418790034500852

A faster way to get a single scalar value (equivalent to the previous method)

In [36]:
df.at[dates[0], 'A']

-0.2418790034500852