# Introduction to Pandas package

In [1]:
# Import libraries needed for this tutorial
import numpy as np
import pandas as pd

In [2]:
# pandas.Series is a one-dimensional labelled array: like a list, but every value has an index label.
# np.nan marks a missing value, which is why the integers are stored as floats.
s = pd.Series([1, 2, 4, 8, np.nan, 32])

In [3]:
s

0     1.0
1     2.0
2     4.0
3     8.0
4     NaN
5    32.0
dtype: float64

In [4]:
# pandas.DataFrame is a two-dimensional tabular data structure, i.e. rows and columns.
# A seeded random generator is used so that "Restart Kernel -> Run All" always produces
# the same 6x4 table of standard-normal values (change SEED to get different numbers).
SEED = 42
rng = np.random.default_rng(SEED)
df = pd.DataFrame(rng.standard_normal((6, 4)), columns=list('ABCD'))

In [5]:
df

Unnamed: 0,A,B,C,D
0,-1.943659,0.589405,-0.433678,0.81322
1,-1.176791,-1.066639,0.417268,-1.044854
2,0.058122,-2.746821,0.457432,-0.111426
3,-0.763137,-0.75501,-1.131524,-0.099653
4,-0.534479,-1.498721,-2.029865,-1.028716
5,-0.157701,0.072999,1.63604,-0.240512


## Viewing data

In [6]:
# With a dataframe, you can view the first n rows or last n rows using "head" and "tail"
df.head(3)

Unnamed: 0,A,B,C,D
0,-1.943659,0.589405,-0.433678,0.81322
1,-1.176791,-1.066639,0.417268,-1.044854
2,0.058122,-2.746821,0.457432,-0.111426


In [7]:
df.tail(2)

Unnamed: 0,A,B,C,D
4,-0.534479,-1.498721,-2.029865,-1.028716
5,-0.157701,0.072999,1.63604,-0.240512


In [8]:
# You can view the index column
df.index

RangeIndex(start=0, stop=6, step=1)

In [9]:
# You can view the column names
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

### Note: A fundamental difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column.

In [10]:
# You can print out a summary of your data using the "describe" function in Pandas
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.752941,-0.900798,-0.180721,-0.285323
std,0.72903,1.181783,1.300215,0.693259
min,-1.943659,-2.746821,-2.029865,-1.044854
25%,-1.073377,-1.390701,-0.957063,-0.831665
50%,-0.648808,-0.910825,-0.008205,-0.175969
75%,-0.251895,-0.134004,0.447391,-0.102596
max,0.058122,0.589405,1.63604,0.81322


In [11]:
# You can transpose your data (i.e. switch rows with columns) by using the ".T" attribute
df.T

Unnamed: 0,0,1,2,3,4,5
A,-1.943659,-1.176791,0.058122,-0.763137,-0.534479,-0.157701
B,0.589405,-1.066639,-2.746821,-0.75501,-1.498721,0.072999
C,-0.433678,0.417268,0.457432,-1.131524,-2.029865,1.63604
D,0.81322,-1.044854,-0.111426,-0.099653,-1.028716,-0.240512


In [12]:
# You can sort a dataframe by its index.
# Below, axis=0 means sort the rows (by row label), axis=1 means sort the columns (by column name).
# ascending=True means arrange in ascending order, ascending=False means arrange in descending order
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
5,-0.157701,0.072999,1.63604,-0.240512
4,-0.534479,-1.498721,-2.029865,-1.028716
3,-0.763137,-0.75501,-1.131524,-0.099653
2,0.058122,-2.746821,0.457432,-0.111426
1,-1.176791,-1.066639,0.417268,-1.044854
0,-1.943659,0.589405,-0.433678,0.81322


In [13]:
# As well as sorting by index, you can sort a dataframe by value
df.sort_values(by="C", ascending=False)

Unnamed: 0,A,B,C,D
5,-0.157701,0.072999,1.63604,-0.240512
2,0.058122,-2.746821,0.457432,-0.111426
1,-1.176791,-1.066639,0.417268,-1.044854
0,-1.943659,0.589405,-0.433678,0.81322
3,-0.763137,-0.75501,-1.131524,-0.099653
4,-0.534479,-1.498721,-2.029865,-1.028716


## Selecting data

In [14]:
# You can select a single column
df['B']

0    0.589405
1   -1.066639
2   -2.746821
3   -0.755010
4   -1.498721
5    0.072999
Name: B, dtype: float64

In [15]:
# You can select certain rows
df[0:2]
# Note how 0:2 refers to rows 0 and 1. ie 2 means up to, but not including 2 (3rd row)

Unnamed: 0,A,B,C,D
0,-1.943659,0.589405,-0.433678,0.81322
1,-1.176791,-1.066639,0.417268,-1.044854


### Selecting by label - ".loc" function

In [16]:
# You can use the ".loc" indexer to select a specific row by its index label
df.loc[3]
# Label 3 is the 4th row here only because the default index starts at 0 and counts up by 1

A   -0.763137
B   -0.755010
C   -1.131524
D   -0.099653
Name: 3, dtype: float64

In [17]:
# You can use the ".loc" indexer to select a specific column.
# Passing the column name inside a list (['A']) returns a one-column DataFrame
df.loc[:, ['A']]

Unnamed: 0,A
0,-1.943659
1,-1.176791
2,0.058122
3,-0.763137
4,-0.534479
5,-0.157701


In [18]:
# You can use the ".loc" indexer to select any number of rows or columns.
# Note: a label slice includes both endpoints, so 2:4 returns the rows labelled 2, 3 and 4
df.loc[2:4, ['A','C']]

Unnamed: 0,A,C
2,0.058122,0.457432
3,-0.763137,-1.131524
4,-0.534479,-2.029865


### Selecting by position - ".iloc" function

In [19]:
# You can also select on the basis of position in the dataframe
# Note: this yields the same as df.loc[3] above, because with the default index label 3 is also position 3
df.iloc[3]

A   -0.763137
B   -0.755010
C   -1.131524
D   -0.099653
Name: 3, dtype: float64

In [20]:
# You can select a certain column by its position
# Note: This doesn't yield exactly the same result as df.loc[:, ['A']] above. A single position (0) returns a
# Series (no column heading), whereas the list ['A'] returned a one-column DataFrame
df.iloc[ : , 0]

0   -1.943659
1   -1.176791
2    0.058122
3   -0.763137
4   -0.534479
5   -0.157701
Name: A, dtype: float64

In [21]:
# You can also select specific parts of the dataframe
# This code does not yield the same as "df.loc[2:4, ['A','C']]" above: position slices exclude the end
# position, so 2:4 gives rows 2 and 3 only, and 0:2 gives the first two columns (A and B)
df.iloc[2:4, 0:2]

Unnamed: 0,A,B
2,0.058122,-2.746821
3,-0.763137,-0.75501


In [22]:
# But the below code DOES yield the same result as "df.loc[2:4, ['A','C']]" above
df.iloc[[2,3,4],[0,2]]

Unnamed: 0,A,C
2,0.058122,0.457432
3,-0.763137,-1.131524
4,-0.534479,-2.029865


In [23]:
# For slicing rows explicitly
df.iloc[1:3,:]
# Note how the rows are up to, but not including the row number stated.

Unnamed: 0,A,B,C,D
1,-1.176791,-1.066639,0.417268,-1.044854
2,0.058122,-2.746821,0.457432,-0.111426


In [24]:
# For slicing columns explicitly
df.iloc[:,1:3]

Unnamed: 0,B,C
0,0.589405,-0.433678
1,-1.066639,0.417268
2,-2.746821,0.457432
3,-0.75501,-1.131524
4,-1.498721,-2.029865
5,0.072999,1.63604


In [25]:
# For getting a single value explicitly (row position 0, column position 0)
df.iloc[0,0]

-1.9436589402557858

### Note: Above, it is difficult to see the difference between ".loc" and ".iloc". This is because the default index is 0, 1, 2, 3, ..., so each row's label is the same as its position. The difference becomes visible when the index is different (e.g. hours of a day, where the same number may reappear later in the dataframe index).
### .loc is used when referencing the index label in the dataframe
### .iloc is used when referencing the row number (position) in the dataframe

In [26]:
# You can select certain rows from a dataframe with a condition. For instance, below selects the rows where the
# value in column A is greater than zero
df[df.A > 0]

Unnamed: 0,A,B,C,D
2,0.058122,-2.746821,0.457432,-0.111426


In [27]:
# If you wish to select only data that meets a certain criteria. For instance, below displays only values that are greater
# than zero
df[df > 0]

Unnamed: 0,A,B,C,D
0,,0.589405,,0.81322
1,,,0.417268,
2,0.058122,,0.457432,
3,,,,
4,,,,
5,,0.072999,1.63604,


In [28]:
# You can copy a DataFrame
df2 = df.copy()

In [29]:
df2

Unnamed: 0,A,B,C,D
0,-1.943659,0.589405,-0.433678,0.81322
1,-1.176791,-1.066639,0.417268,-1.044854
2,0.058122,-2.746821,0.457432,-0.111426
3,-0.763137,-0.75501,-1.131524,-0.099653
4,-0.534479,-1.498721,-2.029865,-1.028716
5,-0.157701,0.072999,1.63604,-0.240512


In [30]:
# You can add a new column to a dataframe (here one value is supplied for each of the 6 rows)
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

In [31]:
df2

Unnamed: 0,A,B,C,D,E
0,-1.943659,0.589405,-0.433678,0.81322,one
1,-1.176791,-1.066639,0.417268,-1.044854,one
2,0.058122,-2.746821,0.457432,-0.111426,two
3,-0.763137,-0.75501,-1.131524,-0.099653,three
4,-0.534479,-1.498721,-2.029865,-1.028716,four
5,-0.157701,0.072999,1.63604,-0.240512,three


In [32]:
# Using the "isin" function, you can select certain data
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2,0.058122,-2.746821,0.457432,-0.111426,two
4,-0.534479,-1.498721,-2.029865,-1.028716,four


### Setting

In [33]:
# Just to remind ourselves of the dataframe we are using
df

Unnamed: 0,A,B,C,D
0,-1.943659,0.589405,-0.433678,0.81322
1,-1.176791,-1.066639,0.417268,-1.044854
2,0.058122,-2.746821,0.457432,-0.111426
3,-0.763137,-0.75501,-1.131524,-0.099653
4,-0.534479,-1.498721,-2.029865,-1.028716
5,-0.157701,0.072999,1.63604,-0.240512


In [35]:
# You can set a value at a particular location of a dataframe by label (row label 0, column 'A').
# Note: this changes df itself, so the tables displayed earlier no longer match its contents
df.at[0,'A'] = 0

In [36]:
df

Unnamed: 0,A,B,C,D
0,0.0,0.589405,-0.433678,0.81322
1,-1.176791,-1.066639,0.417268,-1.044854
2,0.058122,-2.746821,0.457432,-0.111426
3,-0.763137,-0.75501,-1.131524,-0.099653
4,-0.534479,-1.498721,-2.029865,-1.028716
5,-0.157701,0.072999,1.63604,-0.240512


In [37]:
# You can also set a value by integer position using "iat" (row position 1, column position 1, i.e. column B)
df.iat[1,1] = 0

In [38]:
df

Unnamed: 0,A,B,C,D
0,0.0,0.589405,-0.433678,0.81322
1,-1.176791,0.0,0.417268,-1.044854
2,0.058122,-2.746821,0.457432,-0.111426
3,-0.763137,-0.75501,-1.131524,-0.099653
4,-0.534479,-1.498721,-2.029865,-1.028716
5,-0.157701,0.072999,1.63604,-0.240512


In [39]:
# Finally, you can set an entire column to a certain value.
# Assigning with df['D'] replaces the whole column, so it takes the integer dtype of the new value and
# displays as 5. (df.loc[:, 'D'] = ... writes into the existing float column on pandas >= 2.0, giving 5.0.)
df['D'] = 5

In [40]:
df

Unnamed: 0,A,B,C,D
0,0.0,0.589405,-0.433678,5
1,-1.176791,0.0,0.417268,5
2,0.058122,-2.746821,0.457432,5
3,-0.763137,-0.75501,-1.131524,5
4,-0.534479,-1.498721,-2.029865,5
5,-0.157701,0.072999,1.63604,5


### Missing data

#### pandas primarily uses the value 'np.nan' to represent missing data. It is by default not included in computations. Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data.

In [49]:
# A new column can be inserted by reindexing: 'E' does not exist in df, so it is created and filled with NaN.
# The row labels are taken from df.index rather than typed out, so this keeps working if df changes size.
df1 = df.reindex(index=df.index, columns=list(df.columns) + ['E'])

In [50]:
df1

Unnamed: 0,A,B,C,D,E
0,0.0,0.589405,-0.433678,5,
1,-1.176791,0.0,0.417268,5,
2,0.058122,-2.746821,0.457432,5,
3,-0.763137,-0.75501,-1.131524,5,
4,-0.534479,-1.498721,-2.029865,5,
5,-0.157701,0.072999,1.63604,5,


In [53]:
# You can set any value(s) in the dataframe; here column E for the rows labelled 2 to 4 (inclusive)
df1.loc[2:4, 'E'] = 1

In [54]:
df1

Unnamed: 0,A,B,C,D,E
0,0.0,0.589405,-0.433678,5,
1,-1.176791,0.0,0.417268,5,
2,0.058122,-2.746821,0.457432,5,1.0
3,-0.763137,-0.75501,-1.131524,5,1.0
4,-0.534479,-1.498721,-2.029865,5,1.0
5,-0.157701,0.072999,1.63604,5,


In [56]:
# You can drop every row that contains at least one missing value (NaN).
# This returns a new dataframe; df1 itself is left unchanged
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2,0.058122,-2.746821,0.457432,5,1.0
3,-0.763137,-0.75501,-1.131524,5,1.0
4,-0.534479,-1.498721,-2.029865,5,1.0


In [57]:
# Or you can fill in missing data in a dataframe (here every NaN is replaced by 10)
df1.fillna(value=10)

Unnamed: 0,A,B,C,D,E
0,0.0,0.589405,-0.433678,5,10.0
1,-1.176791,0.0,0.417268,5,10.0
2,0.058122,-2.746821,0.457432,5,1.0
3,-0.763137,-0.75501,-1.131524,5,1.0
4,-0.534479,-1.498721,-2.029865,5,1.0
5,-0.157701,0.072999,1.63604,5,10.0


In [58]:
# The 'isna' function gives a boolean check on whether each value is NaN or not
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
0,False,False,False,False,True
1,False,False,False,False,True
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,True


### Operations

#### Stats

In [59]:
df

Unnamed: 0,A,B,C,D
0,0.0,0.589405,-0.433678,5
1,-1.176791,0.0,0.417268,5
2,0.058122,-2.746821,0.457432,5
3,-0.763137,-0.75501,-1.131524,5
4,-0.534479,-1.498721,-2.029865,5
5,-0.157701,0.072999,1.63604,5


In [62]:
# You can get the average on each column
df.mean()

A   -0.428998
B   -0.723025
C   -0.180721
D    5.000000
dtype: float64

In [63]:
# You can get the average of each row by averaging across the columns (axis=1)
df.mean(axis=1)

0    1.288932
1    1.060119
2    0.692183
3    0.587582
4    0.234234
5    1.637834
dtype: float64

In [65]:
# cumsum gives the running total of each column as you move down through the rows.
# The built-in DataFrame method is used directly; wrapping np.cumsum in apply is not needed
df.cumsum()

Unnamed: 0,A,B,C,D
0,0.0,0.589405,-0.433678,5
1,-1.176791,0.589405,-0.01641,10
2,-1.118669,-2.157416,0.441022,15
3,-1.881806,-2.912426,-0.690502,20
4,-2.416285,-4.411147,-2.720367,25
5,-2.573986,-4.338149,-1.084327,30


In [66]:
# You can apply a function to each column of the dataset. For instance, below computes the max value in each
# column minus the min value in that column
df.apply(lambda x: x.max() - x.min())

A    1.234913
B    3.336227
C    3.665905
D    0.000000
dtype: float64