# DataFrames

In [1]:
import pandas as pd
import numpy as np

In [2]:
from numpy.random import randn
# Fix NumPy's global random seed so the random values drawn below are the same on every run.
np.random.seed(101)

In [7]:
# str.split() with no argument splits on whitespace, giving a list of labels.
'A B C D E'.split()

['A', 'B', 'C', 'D', 'E']

In [4]:
# Build a 5x4 frame of random values: rows are labelled A-E, columns W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index='A B C D E'.split(),
    columns='W X Y Z'.split(),
)

In [5]:
# Display the freshly built frame: rows A-E, columns W-Z.
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


## Selection and Indexing

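Let's learn the various ways to grab data from a DataFrame:
- df['W'] — a single column, returned as a Series
- df[['W','Z']] — a list of columns, returned as a DataFrame
- df.loc['A'] — a single row, selected by its index label
- df.W — attribute (dot) access; not recommended, because it breaks when a column name clashes with a DataFrame method or is not a valid Python identifier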

In [11]:
# Single brackets with one label return that column as a Series (note the Name/dtype footer below).
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [13]:
# .loc selects by index label: row 'A' comes back as a Series indexed by the column names.
df.loc['A']

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [15]:
# Passing a list of labels returns a DataFrame containing just those columns.
df[['W', 'X']]

Unnamed: 0,W,X
A,0.302665,1.693723
B,-0.134841,0.390528
C,0.807706,0.07296
D,-0.497104,-0.75407
E,-0.116773,1.901755


In [16]:
# A one-element list still returns a DataFrame (compare with df['W'], which returns a Series).
df[['W']]

Unnamed: 0,W
A,0.302665
B,-0.134841
C,0.807706
D,-0.497104
E,-0.116773


Each DataFrame column is just a Series:
- check with type(df['W'])

**Creating a new column:**
- from existing columns, e.g. their element-wise product: df['total'] = df['W'] * df['X']
- by assigning a scalar, which is repeated for every row: df['age'] = 18

In [18]:
# Create a column from two existing ones; the arithmetic is applied row by row.
# NOTE: despite the name 'total', this is the product W * X, not a sum.
df['total'] = df['W'] * df['X']

In [19]:
# Confirm that the new 'total' column was appended on the right.
df

Unnamed: 0,W,X,Y,Z,total
A,0.302665,1.693723,-1.706086,-1.159119,0.512631
B,-0.134841,0.390528,0.166905,0.184502,-0.052659
C,0.807706,0.07296,0.638787,0.329646,0.05893
D,-0.497104,-0.75407,-0.943406,0.484752,0.374851
E,-0.116773,1.901755,0.238127,1.996652,-0.222074


In [20]:
# Assigning a scalar fills the whole column: every row gets the value 18.
df['age'] = 18

In [21]:
# Every row now carries age == 18.
df

Unnamed: 0,W,X,Y,Z,total,age
A,0.302665,1.693723,-1.706086,-1.159119,0.512631,18
B,-0.134841,0.390528,0.166905,0.184502,-0.052659,18
C,0.807706,0.07296,0.638787,0.329646,0.05893,18
D,-0.497104,-0.75407,-0.943406,0.484752,0.374851,18
E,-0.116773,1.901755,0.238127,1.996652,-0.222074,18


**Removing Columns and Rows:**
- df.drop(<name of the column>, axis=1)
- df.drop(<name of the row>, axis=0) — axis=0 is the default
- drop returns a new DataFrame and leaves df unchanged; pass inplace=True to modify df itself

In [23]:
# IPython tab-completion setting; it has no effect on the pandas results below.
# NOTE(review): notebook-wide configuration like this normally belongs in a setup cell at the top.
%config IPCompleter.greedy=True

In [31]:
# Remove row 'B'. axis=0 (rows) is the default and is spelled out here for clarity.
# The result is a new DataFrame; df itself is left untouched.
df.drop('B', axis=0)

Unnamed: 0,W,X,Y,Z,total,age
A,0.302665,1.693723,-1.706086,-1.159119,0.512631,18
C,0.807706,0.07296,0.638787,0.329646,0.05893,18
D,-0.497104,-0.75407,-0.943406,0.484752,0.374851,18
E,-0.116773,1.901755,0.238127,1.996652,-0.222074,18


In [30]:
# Remove the 'age' column: axis=1 tells drop to look among the column labels.
# A new DataFrame is returned; df keeps all of its columns.
df.drop('age', axis=1)

Unnamed: 0,W,X,Y,Z,total
A,0.302665,1.693723,-1.706086,-1.159119,0.512631
B,-0.134841,0.390528,0.166905,0.184502,-0.052659
C,0.807706,0.07296,0.638787,0.329646,0.05893
D,-0.497104,-0.75407,-0.943406,0.484752,0.374851
E,-0.116773,1.901755,0.238127,1.996652,-0.222074


In [33]:
# inplace=True mutates df itself, and the cell displays nothing.
# NOTE(review): re-running this cell should raise KeyError, since 'B' is already gone and
# drop's default is errors='raise' — confirm.
df.drop('B', inplace=True)

In [34]:
# Row 'B' is gone: the in-place drop changed df itself.
df

Unnamed: 0,W,X,Y,Z,total,age
A,0.302665,1.693723,-1.706086,-1.159119,0.512631,18
C,0.807706,0.07296,0.638787,0.329646,0.05893,18
D,-0.497104,-0.75407,-0.943406,0.484752,0.374851,18
E,-0.116773,1.901755,0.238127,1.996652,-0.222074,18


In [35]:
# Same as the row case, but axis=1 targets the 'age' column; df is modified in place.
# NOTE(review): like the in-place row drop, this is presumably not re-runnable once 'age' is gone — confirm.
df.drop('age', axis=1, inplace=True)

In [36]:
# 'age' has been removed as well; only W, X, Y, Z and total remain.
df

Unnamed: 0,W,X,Y,Z,total
A,0.302665,1.693723,-1.706086,-1.159119,0.512631
C,0.807706,0.07296,0.638787,0.329646,0.05893
D,-0.497104,-0.75407,-0.943406,0.484752,0.374851
E,-0.116773,1.901755,0.238127,1.996652,-0.222074


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [42]:
# Current state of df, shown for reference before filtering: rows A, C, D and E.
df

Unnamed: 0,W,X,Y,Z,total
A,0.302665,1.693723,-1.706086,-1.159119,0.512631
C,0.807706,0.07296,0.638787,0.329646,0.05893
D,-0.497104,-0.75407,-0.943406,0.484752,0.374851
E,-0.116773,1.901755,0.238127,1.996652,-0.222074


In [40]:
# A boolean frame used as the key keeps values where the condition holds and shows NaN elsewhere.
df[df > 0]

Unnamed: 0,W,X,Y,Z,total
A,0.302665,1.693723,,,0.512631
C,0.807706,0.07296,0.638787,0.329646,0.05893
D,,,,0.484752,0.374851
E,,1.901755,0.238127,1.996652,


In [44]:
# A boolean Series used as the key filters rows: only rows whose W is positive are kept.
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z,total
A,0.302665,1.693723,-1.706086,-1.159119,0.512631
C,0.807706,0.07296,0.638787,0.329646,0.05893


In [45]:
# Combine conditions element-wise with & (Python's `and` does not work on Series).
# Each comparison needs its own parentheses because & binds tighter than >.
df[(df['W'] > 0) & (df['Y'] > 0)]

Unnamed: 0,W,X,Y,Z,total
C,0.807706,0.07296,0.638787,0.329646,0.05893
