# Introduction to the pandas package

In [1]:
# Import libraries needed for this tutorial
import numpy as np
import pandas as pd

In [4]:
# pandas.Series is a one-dimensional labeled array (values plus an index), i.e. similar to a list with labels.
# np.nan marks a missing value; because NaN is a float, the whole Series is stored as float64.
s = pd.Series([1, 2 , 4 ,8, np.nan, 32])

In [5]:
# Display the Series: the left column is the index (0-5), the right column holds the values
s

0     1.0
1     2.0
2     4.0
3     8.0
4     NaN
5    32.0
dtype: float64

In [7]:
# pandas.DataFrame is a two-dimensional tabular data structure, i.e. rows and columns.
# A seeded random generator is used so the table contains the same values on every run
# (the notebook is reproducible under "Restart Kernel -> Run All").
# NOTE: the outputs saved in this notebook were produced before the seed was added;
# re-run the notebook to refresh them.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(6, 4), columns=list('ABCD'))

In [8]:
# Display the DataFrame: 6 rows (default integer index 0-5) and 4 columns (A-D)
df

Unnamed: 0,A,B,C,D
0,-0.151773,-1.457541,-1.68259,-0.371826
1,2.020783,-0.744749,-0.494077,-0.0973
2,-1.507947,-0.349995,-0.03951,2.110094
3,-0.965005,-0.562729,-0.836013,-0.470055
4,0.122866,-0.357009,-0.109226,-0.964451
5,1.376951,-0.636536,0.528513,0.427173


## Viewing data

In [9]:
# With a DataFrame, you can view the first n rows or last n rows using "head" and "tail"
# Here head(3) returns the first 3 rows
df.head(3)

Unnamed: 0,A,B,C,D
0,-0.151773,-1.457541,-1.68259,-0.371826
1,2.020783,-0.744749,-0.494077,-0.0973
2,-1.507947,-0.349995,-0.03951,2.110094


In [10]:
# tail(n) returns the last n rows; here, the final 2 rows
df.tail(2)

Unnamed: 0,A,B,C,D
4,0.122866,-0.357009,-0.109226,-0.964451
5,1.376951,-0.636536,0.528513,0.427173


In [12]:
# You can view the index (the row labels). The default index is a RangeIndex starting at 0
df.index

RangeIndex(start=0, stop=6, step=1)

In [14]:
# You can view the column names (the column labels), returned as an Index object
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

### Note: A fundamental difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column.

In [15]:
# You can print out a summary of your data using the "describe" method in pandas.
# For each numeric column it reports the count, mean, standard deviation, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.149312,-0.68476,-0.438817,0.105606
std,1.348263,0.409266,0.763098,1.083226
min,-1.507947,-1.457541,-1.68259,-0.964451
25%,-0.761697,-0.717696,-0.750529,-0.445498
50%,-0.014454,-0.599632,-0.301652,-0.234563
75%,1.063429,-0.408439,-0.056939,0.296054
max,2.020783,-0.349995,0.528513,2.110094


In [17]:
# You can transpose your data (i.e. switch rows with columns) by using the ".T" attribute
df.T

Unnamed: 0,0,1,2,3,4,5
A,-0.151773,2.020783,-1.507947,-0.965005,0.122866,1.376951
B,-1.457541,-0.744749,-0.349995,-0.562729,-0.357009,-0.636536
C,-1.68259,-0.494077,-0.03951,-0.836013,-0.109226,0.528513
D,-0.371826,-0.0973,2.110094,-0.470055,-0.964451,0.427173


In [24]:
# You can sort a DataFrame by its index.
# Below, axis=0 means sort by the row labels, axis=1 means sort by the column labels.
# Below, ascending=True means arrange in ascending order, ascending=False means arrange in descending order
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
5,1.376951,-0.636536,0.528513,0.427173
4,0.122866,-0.357009,-0.109226,-0.964451
3,-0.965005,-0.562729,-0.836013,-0.470055
2,-1.507947,-0.349995,-0.03951,2.110094
1,2.020783,-0.744749,-0.494077,-0.0973
0,-0.151773,-1.457541,-1.68259,-0.371826


In [28]:
# As well as sorting by index, you can sort a DataFrame by value.
# Below, the rows are ordered by the values in column C, largest first
df.sort_values(by="C", ascending=False)

Unnamed: 0,A,B,C,D
5,1.376951,-0.636536,0.528513,0.427173
2,-1.507947,-0.349995,-0.03951,2.110094
4,0.122866,-0.357009,-0.109226,-0.964451
1,2.020783,-0.744749,-0.494077,-0.0973
3,-0.965005,-0.562729,-0.836013,-0.470055
0,-0.151773,-1.457541,-1.68259,-0.371826


## Selecting data

In [31]:
# You can select a single column by name; the result is a Series
df['B']

0   -1.457541
1   -0.744749
2   -0.349995
3   -0.562729
4   -0.357009
5   -0.636536
Name: B, dtype: float64

In [60]:
# You can select certain rows by slicing
df[0:2]
# Note how 0:2 refers to rows 0 and 1, i.e. 2 means up to, but not including, row 2 (the 3rd row)

Unnamed: 0,A,B,C,D
0,-0.151773,-1.457541,-1.68259,-0.371826
1,2.020783,-0.744749,-0.494077,-0.0973


### Selecting by label - ".loc" function

In [61]:
# You can use the ".loc" indexer to locate a specific row by its index label
df.loc[3]
# Remember the row labelled 3 is the 4th row here, because the default index starts at 0

A   -0.965005
B   -0.562729
C   -0.836013
D   -0.470055
Name: 3, dtype: float64

In [38]:
# You can use the ".loc" indexer to locate a specific column by its label.
# ":" selects all rows; passing the column name inside a list returns a one-column DataFrame
df.loc[:, ['A']]

Unnamed: 0,A
0,-0.151773
1,2.020783
2,-1.507947
3,-0.965005
4,0.122866
5,1.376951


In [46]:
# You can use the ".loc" indexer to locate any number of rows or columns.
# Note: unlike ordinary Python slicing, a label slice with ".loc" includes the end label (rows 2, 3 AND 4)
df.loc[2:4, ['A','C']]

Unnamed: 0,A,C
2,-1.507947,-0.03951
3,-0.965005,-0.836013
4,0.122866,-0.109226


### Selecting by position - ".iloc" function

In [47]:
# You can also select on the basis of position in the DataFrame
# Note: this yields the same as df.loc[3] above, because with the default index label 3 is also position 3
df.iloc[3]

A   -0.965005
B   -0.562729
C   -0.836013
D   -0.470055
Name: 3, dtype: float64

In [50]:
# You can select a certain column by its position
# Note: This doesn't yield exactly the same result as df.loc[:, ['A']] above. A single integer returns
# a Series (so there is no column heading), whereas the list ['A'] returned a one-column DataFrame
df.iloc[ : , 0]

0   -0.151773
1    2.020783
2   -1.507947
3   -0.965005
4    0.122866
5    1.376951
Name: A, dtype: float64

In [52]:
# You can also select specific parts of the DataFrame
# This code does not yield the same as "df.loc[2:4, ['A','C']]" above: ".iloc" slices exclude the end
# position, so 2:4 gives rows 2 and 3, and 0:2 gives the first two columns (A and B)
df.iloc[2:4, 0:2]

Unnamed: 0,A,B
2,-1.507947,-0.349995
3,-0.965005,-0.562729


In [55]:
# But the below code DOES yield the same result as "df.loc[2:4, ['A','C']]" above,
# because the row positions (2, 3, 4) and column positions (0, 2) are listed explicitly
df.iloc[[2,3,4],[0,2]]

Unnamed: 0,A,C
2,-1.507947,-0.03951
3,-0.965005,-0.836013
4,0.122866,-0.109226


In [62]:
# For slicing rows explicitly (":" keeps all columns)
df.iloc[1:3,:]
# Note how the rows are up to, but not including, the row number stated.

Unnamed: 0,A,B,C,D
1,2.020783,-0.744749,-0.494077,-0.0973
2,-1.507947,-0.349995,-0.03951,2.110094


In [65]:
# For slicing columns explicitly (":" keeps all rows); 1:3 selects the columns at positions 1 and 2 (B and C)
df.iloc[:,1:3]

Unnamed: 0,B,C
0,-1.457541,-1.68259
1,-0.744749,-0.494077
2,-0.349995,-0.03951
3,-0.562729,-0.836013
4,-0.357009,-0.109226
5,-0.636536,0.528513


In [67]:
# For getting a single value explicitly (row position 0, column position 0)
df.iloc[0,0]

-0.15177320419270537

### Note: Above, it is difficult to see the difference between the ".loc" and ".iloc" functions. This is because the default index is 0, 1, 2, 3, ..., so each row's label is the same as its position. If the index were different (e.g. hours of a day, where the same number may reappear later in the DataFrame index), the two would give different results.
### .loc is used when referencing the index label in the DataFrame
### .iloc is used when referencing the row number (integer position) in the DataFrame

In [69]:
# You can select certain rows from a DataFrame using a condition. For instance, below will select the rows
# of the DataFrame where the value in column A is greater than zero
df[df.A > 0]

Unnamed: 0,A,B,C,D
1,2.020783,-0.744749,-0.494077,-0.0973
4,0.122866,-0.357009,-0.109226,-0.964451
5,1.376951,-0.636536,0.528513,0.427173


In [70]:
# You can also keep only the values that meet a certain criterion. For instance, below displays only values
# that are greater than zero; every value that fails the condition is replaced with NaN
df[df > 0]

Unnamed: 0,A,B,C,D
0,,,,
1,2.020783,,,
2,,,,2.110094
3,,,,
4,0.122866,,,
5,1.376951,,0.528513,0.427173


In [72]:
# You can copy a DataFrame. copy() makes an independent (deep) copy by default,
# so later changes to df2 do not affect df
df2 = df.copy()

In [73]:
# Display the copy: it holds the same values as df
df2

Unnamed: 0,A,B,C,D
0,-0.151773,-1.457541,-1.68259,-0.371826
1,2.020783,-0.744749,-0.494077,-0.0973
2,-1.507947,-0.349995,-0.03951,2.110094
3,-0.965005,-0.562729,-0.836013,-0.470055
4,0.122866,-0.357009,-0.109226,-0.964451
5,1.376951,-0.636536,0.528513,0.427173


In [75]:
# You can add a new column to a DataFrame by assigning to a new column name.
# The list supplies one value per row (6 values for the 6 rows)
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

In [76]:
# Display df2 again: the new column E appears as the last column
df2

Unnamed: 0,A,B,C,D,E
0,-0.151773,-1.457541,-1.68259,-0.371826,one
1,2.020783,-0.744749,-0.494077,-0.0973,one
2,-1.507947,-0.349995,-0.03951,2.110094,two
3,-0.965005,-0.562729,-0.836013,-0.470055,three
4,0.122866,-0.357009,-0.109226,-0.964451,four
5,1.376951,-0.636536,0.528513,0.427173,three


In [77]:
# Using the "isin" method, you can select certain data:
# below keeps only the rows whose value in column E is one of the listed values
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2,-1.507947,-0.349995,-0.03951,2.110094,two
4,0.122866,-0.357009,-0.109226,-0.964451,four
