# Introduction to Pandas package

In [1]:
# Import libraries needed for this turorial
import numpy as np
import pandas as pd

In [2]:
# pandas.Series() is a a one-dimensional ndarray. ie a list
s = pd.Series([1, 2 , 4 ,8, np.nan, 32])

In [3]:
s

0     1.0
1     2.0
2     4.0
3     8.0
4     NaN
5    32.0
dtype: float64

In [4]:
# pandas.DataFrame is a two-dimensional tabular data structure ie Rows and Columns
df = pd.DataFrame(np.random.randn(6, 4), columns=list('ABCD'))

In [5]:
df

Unnamed: 0,A,B,C,D
0,0.103623,1.077296,-0.590727,-3.059069
1,0.126595,0.062638,-1.628926,-0.540306
2,-0.558176,0.588168,0.67859,-0.511168
3,0.486656,-0.724928,-1.360152,0.969441
4,1.716141,-0.833268,-1.747677,-0.665409
5,0.206174,0.651322,-0.138699,-0.476352


## Viewing data

In [6]:
# With a dataframe, you can veiw the first x rows or last x rows using "head" and "tail"
df.head(3)

Unnamed: 0,A,B,C,D
0,0.103623,1.077296,-0.590727,-3.059069
1,0.126595,0.062638,-1.628926,-0.540306
2,-0.558176,0.588168,0.67859,-0.511168


In [7]:
df.tail(2)

Unnamed: 0,A,B,C,D
4,1.716141,-0.833268,-1.747677,-0.665409
5,0.206174,0.651322,-0.138699,-0.476352


In [8]:
# You can view the index column
df.index

RangeIndex(start=0, stop=6, step=1)

In [9]:
# You can view the column names
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

### Note: A fundamental difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column.

In [10]:
# You can print out a summary of your data using the "describe" function in Pandas
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.346835,0.136871,-0.797932,-0.713811
std,0.753795,0.780014,0.955646,1.301035
min,-0.558176,-0.833268,-1.747677,-3.059069
25%,0.109366,-0.528037,-1.561733,-0.634134
50%,0.166384,0.325403,-0.97544,-0.525737
75%,0.416535,0.635533,-0.251706,-0.485056
max,1.716141,1.077296,0.67859,0.969441


In [11]:
# You can Transpose your data (ie switch rows with columns) by using the "Transpose" function
df.T

Unnamed: 0,0,1,2,3,4,5
A,0.103623,0.126595,-0.558176,0.486656,1.716141,0.206174
B,1.077296,0.062638,0.588168,-0.724928,-0.833268,0.651322
C,-0.590727,-1.628926,0.67859,-1.360152,-1.747677,-0.138699
D,-3.059069,-0.540306,-0.511168,0.969441,-0.665409,-0.476352


In [12]:
# You can sort an dataframe by index. 
# Below, axis=0 means sort the rows, axis=1 means sort the columns.
# Below, ascending=True means arrange in ascending order, ascending=Fase means arrange in descending order
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
5,0.206174,0.651322,-0.138699,-0.476352
4,1.716141,-0.833268,-1.747677,-0.665409
3,0.486656,-0.724928,-1.360152,0.969441
2,-0.558176,0.588168,0.67859,-0.511168
1,0.126595,0.062638,-1.628926,-0.540306
0,0.103623,1.077296,-0.590727,-3.059069


In [13]:
# As well as sorting by index, you can sort a dataframe by value
df.sort_values(by="C", ascending=False)

Unnamed: 0,A,B,C,D
2,-0.558176,0.588168,0.67859,-0.511168
5,0.206174,0.651322,-0.138699,-0.476352
0,0.103623,1.077296,-0.590727,-3.059069
3,0.486656,-0.724928,-1.360152,0.969441
1,0.126595,0.062638,-1.628926,-0.540306
4,1.716141,-0.833268,-1.747677,-0.665409


## Selecting data

In [14]:
# You can select a single column
df['B']

0    1.077296
1    0.062638
2    0.588168
3   -0.724928
4   -0.833268
5    0.651322
Name: B, dtype: float64

In [15]:
# You can select certain rows
df[0:2]
# Note how 0:2 refers to rows 0 and 1. ie 2 means up to, but not including 2 (3rd row)

Unnamed: 0,A,B,C,D
0,0.103623,1.077296,-0.590727,-3.059069
1,0.126595,0.062638,-1.628926,-0.540306


### Selecting by label - ".loc" function

In [16]:
# You can use the ".loc" function to locate a specific row
df.loc[3]
# Remember this is the 4th row, because the 1st row is row 0

A    0.486656
B   -0.724928
C   -1.360152
D    0.969441
Name: 3, dtype: float64

In [17]:
# You can use the ".loc" function to locate a specific column
df.loc[:, ['A']]

Unnamed: 0,A
0,0.103623
1,0.126595
2,-0.558176
3,0.486656
4,1.716141
5,0.206174


In [18]:
# You can use the ".loc" function to locate any number of rows or colums
df.loc[2:4, ['A','C']]

Unnamed: 0,A,C
2,-0.558176,0.67859
3,0.486656,-1.360152
4,1.716141,-1.747677


### Selecting by position - ".iloc" function

In [19]:
# You can also select on the basis on position in the dataframe
# Note: this yelds the same as df.loc[3] above
df.iloc[3]

A    0.486656
B   -0.724928
C   -1.360152
D    0.969441
Name: 3, dtype: float64

In [20]:
# You can select a certain column
# Note: This doesn't exactly yield the same result as df.loc[:, ['A']] above. The Heading is not included
df.iloc[ : , 0]

0    0.103623
1    0.126595
2   -0.558176
3    0.486656
4    1.716141
5    0.206174
Name: A, dtype: float64

In [21]:
# You can also select specific parts of the dataframe
# This code does not yiled ths same as "df.loc[2:4, ['A','C']]" above
df.iloc[2:4, 0:2]

Unnamed: 0,A,B
2,-0.558176,0.588168
3,0.486656,-0.724928


In [22]:
# But the below code DOES yield the same result as "df.loc[2:4, ['A','C']]" above
df.iloc[[2,3,4],[0,2]]

Unnamed: 0,A,C
2,-0.558176,0.67859
3,0.486656,-1.360152
4,1.716141,-1.747677


In [23]:
# For slicing rows explicitly
df.iloc[1:3,:]
# Note how the rows are up to, but not including the row number stated.

Unnamed: 0,A,B,C,D
1,0.126595,0.062638,-1.628926,-0.540306
2,-0.558176,0.588168,0.67859,-0.511168


In [24]:
# For slicing columns explicitly
df.iloc[:,1:3]

Unnamed: 0,B,C
0,1.077296,-0.590727
1,0.062638,-1.628926
2,0.588168,0.67859
3,-0.724928,-1.360152
4,-0.833268,-1.747677
5,0.651322,-0.138699


In [25]:
# For getting a value explicity
df.iloc[0,0]

0.10362258732962361

### Note: Above, it id difficult to see the difference between ".loc" and ".iloc" functions. This is because the default index is 0, 1, 2,3,...... If the index was different (eg hours of a day, where the same nuber may be reappearing later in the dataframe index.
### .loc is used when referencing the index in the dataframe
### .iloc is used when referencing the row number in the dataframe

In [26]:
# If you wish to select certain values from a database. For instance, below will select rows from the database where the
# value in Column A is greter then zero
df[df.A > 0]

Unnamed: 0,A,B,C,D
0,0.103623,1.077296,-0.590727,-3.059069
1,0.126595,0.062638,-1.628926,-0.540306
3,0.486656,-0.724928,-1.360152,0.969441
4,1.716141,-0.833268,-1.747677,-0.665409
5,0.206174,0.651322,-0.138699,-0.476352


In [27]:
# If you wish to select only data that meets a certain criteria. For instance, below displays only values that are greater
# than zero
df[df > 0]

Unnamed: 0,A,B,C,D
0,0.103623,1.077296,,
1,0.126595,0.062638,,
2,,0.588168,0.67859,
3,0.486656,,,0.969441
4,1.716141,,,
5,0.206174,0.651322,,


In [28]:
# You can copy a DataFrame
df2 = df.copy()

In [29]:
df2

Unnamed: 0,A,B,C,D
0,0.103623,1.077296,-0.590727,-3.059069
1,0.126595,0.062638,-1.628926,-0.540306
2,-0.558176,0.588168,0.67859,-0.511168
3,0.486656,-0.724928,-1.360152,0.969441
4,1.716141,-0.833268,-1.747677,-0.665409
5,0.206174,0.651322,-0.138699,-0.476352


In [30]:
# You can enter a new colum into a dataframe
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

In [31]:
df2

Unnamed: 0,A,B,C,D,E
0,0.103623,1.077296,-0.590727,-3.059069,one
1,0.126595,0.062638,-1.628926,-0.540306,one
2,-0.558176,0.588168,0.67859,-0.511168,two
3,0.486656,-0.724928,-1.360152,0.969441,three
4,1.716141,-0.833268,-1.747677,-0.665409,four
5,0.206174,0.651322,-0.138699,-0.476352,three


In [32]:
# Using the "isin" function, you can select certain data
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2,-0.558176,0.588168,0.67859,-0.511168,two
4,1.716141,-0.833268,-1.747677,-0.665409,four


### Setting

In [33]:
# Just to remind ourselves of the dataframe we are using
df

Unnamed: 0,A,B,C,D
0,0.103623,1.077296,-0.590727,-3.059069
1,0.126595,0.062638,-1.628926,-0.540306
2,-0.558176,0.588168,0.67859,-0.511168
3,0.486656,-0.724928,-1.360152,0.969441
4,1.716141,-0.833268,-1.747677,-0.665409
5,0.206174,0.651322,-0.138699,-0.476352


In [34]:
# You can set a value in a particular location of a database.
df.at[0,'A'] = 0

In [35]:
df

Unnamed: 0,A,B,C,D
0,0.0,1.077296,-0.590727,-3.059069
1,0.126595,0.062638,-1.628926,-0.540306
2,-0.558176,0.588168,0.67859,-0.511168
3,0.486656,-0.724928,-1.360152,0.969441
4,1.716141,-0.833268,-1.747677,-0.665409
5,0.206174,0.651322,-0.138699,-0.476352


In [36]:
# You can alos set a value using the "iat" command
df.iat[1,1] = 0

In [37]:
df

Unnamed: 0,A,B,C,D
0,0.0,1.077296,-0.590727,-3.059069
1,0.126595,0.0,-1.628926,-0.540306
2,-0.558176,0.588168,0.67859,-0.511168
3,0.486656,-0.724928,-1.360152,0.969441
4,1.716141,-0.833268,-1.747677,-0.665409
5,0.206174,0.651322,-0.138699,-0.476352


In [38]:
# Finally, you can set an entire column to be a ceratin value
df.loc[:, 'D'] = np.array([5] * len(df))

In [39]:
df

Unnamed: 0,A,B,C,D
0,0.0,1.077296,-0.590727,5
1,0.126595,0.0,-1.628926,5
2,-0.558176,0.588168,0.67859,5
3,0.486656,-0.724928,-1.360152,5
4,1.716141,-0.833268,-1.747677,5
5,0.206174,0.651322,-0.138699,5


### Missing data

#### pandas primarily uses the value 'np.nan' to represent missing data. It is by default not included in computations. Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data.

In [40]:
# A new column can be inserted
df1 = df.reindex([0,1,2,3,4,5], columns=list(df.columns) + ['E'])

In [41]:
df1

Unnamed: 0,A,B,C,D,E
0,0.0,1.077296,-0.590727,5,
1,0.126595,0.0,-1.628926,5,
2,-0.558176,0.588168,0.67859,5,
3,0.486656,-0.724928,-1.360152,5,
4,1.716141,-0.833268,-1.747677,5,
5,0.206174,0.651322,-0.138699,5,


In [42]:
# You can set any value(s)  in the dataframe
df1.loc[2:4, 'E'] = 1

In [43]:
df1

Unnamed: 0,A,B,C,D,E
0,0.0,1.077296,-0.590727,5,
1,0.126595,0.0,-1.628926,5,
2,-0.558176,0.588168,0.67859,5,1.0
3,0.486656,-0.724928,-1.360152,5,1.0
4,1.716141,-0.833268,-1.747677,5,1.0
5,0.206174,0.651322,-0.138699,5,


In [44]:
# You can omit any none-values from the dataframe
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2,-0.558176,0.588168,0.67859,5,1.0
3,0.486656,-0.724928,-1.360152,5,1.0
4,1.716141,-0.833268,-1.747677,5,1.0


In [45]:
# Or you can fill in missing data in a datframe
df1.fillna(value=10)

Unnamed: 0,A,B,C,D,E
0,0.0,1.077296,-0.590727,5,10.0
1,0.126595,0.0,-1.628926,5,10.0
2,-0.558176,0.588168,0.67859,5,1.0
3,0.486656,-0.724928,-1.360152,5,1.0
4,1.716141,-0.833268,-1.747677,5,1.0
5,0.206174,0.651322,-0.138699,5,10.0


In [46]:
# The 'isna' function gives a boolean check on whether a valu is NaN or not
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
0,False,False,False,False,True
1,False,False,False,False,True
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,True


### Operations

#### Stats

In [47]:
df

Unnamed: 0,A,B,C,D
0,0.0,1.077296,-0.590727,5
1,0.126595,0.0,-1.628926,5
2,-0.558176,0.588168,0.67859,5
3,0.486656,-0.724928,-1.360152,5
4,1.716141,-0.833268,-1.747677,5
5,0.206174,0.651322,-0.138699,5


In [48]:
# You can get the average on each column
df.mean()

A    0.329565
B    0.126432
C   -0.797932
D    5.000000
dtype: float64

In [49]:
# You can get the average of each row
df.mean(1)

0    1.371642
1    0.874417
2    1.427145
3    0.850394
4    1.033799
5    1.429699
dtype: float64

In [50]:
# cumsum allows you to add the values in a column as you move down through it
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
0,0.0,1.077296,-0.590727,5
1,0.126595,1.077296,-2.219653,10
2,-0.431581,1.665464,-1.541064,15
3,0.055075,0.940536,-2.901216,20
4,1.771216,0.107268,-4.648893,25
5,1.97739,0.75859,-4.787592,30


In [51]:
# You can apply an equation to the dataset. For instance, below prints out the max value in a column minus the min value
df.apply(lambda x: x.max() - x.min())

A    2.274317
B    1.910564
C    2.426267
D    0.000000
dtype: float64

### Merge

#### concat()

In [54]:
# Create a dataframe from numpy.random
df = pd.DataFrame(np.random.randn(10,4))

In [55]:
df

Unnamed: 0,0,1,2,3
0,-0.49282,1.436635,-2.401764,-1.614846
1,0.109048,0.698383,0.058691,1.694592
2,-1.911995,1.050428,0.151916,0.201511
3,0.008523,0.797071,0.892956,0.617316
4,0.348118,-0.528351,1.098926,0.116794
5,0.363109,0.019268,-0.94279,-0.482386
6,-0.614077,-1.409872,-0.651879,1.767626
7,1.08653,0.103645,-0.173701,-1.431629
8,-0.255574,0.215049,1.819264,-1.116394
9,-0.009336,1.203405,0.728296,-1.576823


In [60]:
# You can pick specific pats of this DataFrame
pieces = [df[:3], df[3:7], df[7:]]

In [59]:
pieces

[          0         1         2         3
 0 -0.492820  1.436635 -2.401764 -1.614846
 1  0.109048  0.698383  0.058691  1.694592
 2 -1.911995  1.050428  0.151916  0.201511,
           0         1         2         3
 3  0.008523  0.797071  0.892956  0.617316
 4  0.348118 -0.528351  1.098926  0.116794
 5  0.363109  0.019268 -0.942790 -0.482386
 6 -0.614077 -1.409872 -0.651879  1.767626,
           0         1         2         3
 7  1.086530  0.103645 -0.173701 -1.431629
 8 -0.255574  0.215049  1.819264 -1.116394
 9 -0.009336  1.203405  0.728296 -1.576823]

In [62]:
# the "concat" function in pandas allows you to join/merge a dataFrame
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.49282,1.436635,-2.401764,-1.614846
1,0.109048,0.698383,0.058691,1.694592
2,-1.911995,1.050428,0.151916,0.201511
3,0.008523,0.797071,0.892956,0.617316
4,0.348118,-0.528351,1.098926,0.116794
5,0.363109,0.019268,-0.94279,-0.482386
6,-0.614077,-1.409872,-0.651879,1.767626
7,1.08653,0.103645,-0.173701,-1.431629
8,-0.255574,0.215049,1.819264,-1.116394
9,-0.009336,1.203405,0.728296,-1.576823


In [68]:
# You can concatenate two or more DataFrames
df2 = pd.DataFrame(np.random.randn(20,8))

In [69]:
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.912343,-0.058825,-1.256894,-1.676373,-0.392774,-1.132346,1.129554,-0.208444
1,1.005921,0.56623,1.140029,-0.532098,0.352598,-0.567983,1.382321,-0.191577
2,-0.42009,1.094139,-0.317719,0.039238,0.764773,-0.6904,-1.019346,-0.427917
3,0.450233,0.242288,0.629266,0.337409,0.996211,0.645888,-0.540282,0.20383
4,-1.439075,-0.071255,0.283483,-1.032067,-0.420794,0.546382,-0.069244,0.40741
5,-0.694635,-0.592713,2.494028,-0.538829,0.004508,-0.227637,-0.520794,1.926202
6,-0.442951,-0.042737,0.751142,-1.338956,-0.268114,1.252285,1.040157,0.315471
7,-0.096644,-2.005204,1.596848,0.838173,-0.49849,1.320358,0.309861,-0.978589
8,-1.481187,-0.005791,0.225199,1.961027,1.020848,-0.532107,-0.004731,-0.885057
9,-0.320642,1.729426,-0.017843,-2.090861,-0.904757,-0.759657,0.158135,-0.597621


In [70]:
# Create an entity called Frames which will contain both df and df1
Frames = [df, df2]

In [71]:
# Now, concatenate Frames
df3 = pd.concat(Frames)

In [72]:
df3

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.49282,1.436635,-2.401764,-1.614846,,,,
1,0.109048,0.698383,0.058691,1.694592,,,,
2,-1.911995,1.050428,0.151916,0.201511,,,,
3,0.008523,0.797071,0.892956,0.617316,,,,
4,0.348118,-0.528351,1.098926,0.116794,,,,
5,0.363109,0.019268,-0.94279,-0.482386,,,,
6,-0.614077,-1.409872,-0.651879,1.767626,,,,
7,1.08653,0.103645,-0.173701,-1.431629,,,,
8,-0.255574,0.215049,1.819264,-1.116394,,,,
9,-0.009336,1.203405,0.728296,-1.576823,,,,


#### Join

In [82]:
pd.merge(df, df2, on=3)

Unnamed: 0,0_x,1_x,2_x,3,0_y,1_y,2_y,4,5,6,7


#### Append

In [83]:
# Rows can be appended to a DataFrame
df

Unnamed: 0,0,1,2,3
0,-0.49282,1.436635,-2.401764,-1.614846
1,0.109048,0.698383,0.058691,1.694592
2,-1.911995,1.050428,0.151916,0.201511
3,0.008523,0.797071,0.892956,0.617316
4,0.348118,-0.528351,1.098926,0.116794
5,0.363109,0.019268,-0.94279,-0.482386
6,-0.614077,-1.409872,-0.651879,1.767626
7,1.08653,0.103645,-0.173701,-1.431629
8,-0.255574,0.215049,1.819264,-1.116394
9,-0.009336,1.203405,0.728296,-1.576823


In [84]:
s = df.iloc[3]

In [85]:
s

0    0.008523
1    0.797071
2    0.892956
3    0.617316
Name: 3, dtype: float64

In [87]:
df.append(s, ignore_index=True)

Unnamed: 0,0,1,2,3
0,-0.49282,1.436635,-2.401764,-1.614846
1,0.109048,0.698383,0.058691,1.694592
2,-1.911995,1.050428,0.151916,0.201511
3,0.008523,0.797071,0.892956,0.617316
4,0.348118,-0.528351,1.098926,0.116794
5,0.363109,0.019268,-0.94279,-0.482386
6,-0.614077,-1.409872,-0.651879,1.767626
7,1.08653,0.103645,-0.173701,-1.431629
8,-0.255574,0.215049,1.819264,-1.116394
9,-0.009336,1.203405,0.728296,-1.576823
