#### A DataFrame is a two-dimensional data structure that stores data in tabular form, i.e. in rows and columns.

### Creating DataFrame
##### DataFrames can be created from various inputs: lists, dictionaries, Series, NumPy ndarrays, and other DataFrames

##### Creating DataFrame using List

In [1]:
import pandas as pd
# A flat list becomes a one-column frame; with no labels supplied, pandas
# numbers both the rows and the single column starting from 0.
my_list = [1, 2, 3, 4, 5]
df = pd.DataFrame(data=my_list)
df

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5


In [2]:
# Each inner list is one row: [student name, marks].
my_list1 = [
    ['Raj', 97],
    ['Seena', 89],
    ['Kanu', 85],
    ['Vijay', 80],
    ['Manu', 75],
]
# Name the two columns explicitly instead of accepting the default 0, 1 labels.
df = pd.DataFrame(data=my_list1, columns=['Students', 'Marks'])
df

Unnamed: 0,Students,Marks
0,Raj,97
1,Seena,89
2,Kanu,85
3,Vijay,80
4,Manu,75


### DataFrame using dict/Series

In [3]:
# Dict of Series: each key becomes a column. Both Series carry the same
# ordinal labels, which become the row index of the resulting frame.
my_ser = {
    'Students': pd.Series(
        ['Raj', 'Seena', 'Kanu', 'Vijay', 'Manu'],
        index=['1st', '2nd', '3rd', '4th', '5th'],
    ),
    'Marks': pd.Series(
        [97, 89, 85, 80, 75],
        index=['1st', '2nd', '3rd', '4th', '5th'],
    ),
}
df = pd.DataFrame(my_ser)
df

Unnamed: 0,Students,Marks
1st,Raj,97
2nd,Seena,89
3rd,Kanu,85
4th,Vijay,80
5th,Manu,75


### Creating a DataFrame from ndarrays
###### The ndarrays must all be of the same length. If an index is passed, its length must equal the length of the arrays. In the example above, the index and the data have the same length.

### DataFrame functions

In [5]:
import numpy as np
from numpy.random import randn
# Seed NumPy's global random generator so the random frames created below
# are the same on every fresh run of the notebook.
np.random.seed(101)

In [6]:
# 5x4 frame of uniform random values with default integer row/column labels.
# NOTE(review): this frame is reassigned by the next cell before it is ever
# displayed. The call still draws from the seeded generator, so deleting this
# cell would change the random values shown in the rest of the notebook.
df=pd.DataFrame(np.random.rand(5,4))

In [7]:
# 5x4 frame of standard-normal draws with rows labelled A-E and columns W-Z.
df = pd.DataFrame(
    randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [8]:
# Display the 5x4 frame (rows A-E, columns W-Z) used in the sections below.
df

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646
E,-0.497104,-0.75407,-0.943406,0.484752


# Selection and Indexing

In [9]:
# Bracket notation with a single column label returns that column as a Series.
df['W']

A    0.190794
B    0.302665
C   -0.134841
D    0.807706
E   -0.497104
Name: W, dtype: float64

In [10]:
# A list of column labels returns a DataFrame containing just those columns.
df[['W','Y']]

Unnamed: 0,W,Y
A,0.190794,2.605967
B,0.302665,-1.706086
C,-0.134841,0.166905
D,0.807706,0.638787
E,-0.497104,-0.943406


In [11]:
df.W  # attribute-style access (not recommended: it can clash with DataFrame method/attribute names; prefer df['W'])

A    0.190794
B    0.302665
C   -0.134841
D    0.807706
E   -0.497104
Name: W, dtype: float64

In [12]:
type(df['W'])  # DataFrame columns are just Series

pandas.core.series.Series

### Creating a new column:

In [13]:
# Assigning to a label that does not exist yet creates the column; here it
# holds the element-wise sum of columns W and Y. This modifies df itself.
df['new'] = df['W'] + df['Y']

In [14]:
# Display df: it now carries the extra 'new' column.
df

Unnamed: 0,W,X,Y,Z,new
A,0.190794,1.978757,2.605967,0.683509,2.796762
B,0.302665,1.693723,-1.706086,-1.159119,-1.40342
C,-0.134841,0.390528,0.166905,0.184502,0.032064
D,0.807706,0.07296,0.638787,0.329646,1.446493
E,-0.497104,-0.75407,-0.943406,0.484752,-1.44051


### Removing Columns

In [15]:
# axis=1 targets columns. The result is a new frame without 'new';
# df itself is left unchanged by this call.
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646
E,-0.497104,-0.75407,-0.943406,0.484752


In [16]:
df   # 'new' is still here: drop() returned a new frame; it is not in place unless specified!

Unnamed: 0,W,X,Y,Z,new
A,0.190794,1.978757,2.605967,0.683509,2.796762
B,0.302665,1.693723,-1.706086,-1.159119,-1.40342
C,-0.134841,0.390528,0.166905,0.184502,0.032064
D,0.807706,0.07296,0.638787,0.329646,1.446493
E,-0.497104,-0.75407,-0.943406,0.484752,-1.44051


In [17]:
# inplace=True removes the column from df itself instead of returning a new
# frame, which is why this cell displays nothing.
# NOTE: not idempotent - running the cell a second time fails because 'new'
# has already been removed.
df.drop('new', axis=1, inplace=True)

In [18]:
# 'new' is gone now that the drop was applied in place.
df

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646
E,-0.497104,-0.75407,-0.943406,0.484752


In [19]:
df.drop('E', axis=0) # axis=0 removes a row by its index label; returns a new frame, df itself keeps row E

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646


### **Selecting Rows**

In [20]:
# .loc selects by index label; a single label returns that row as a Series.
df.loc['A']

W    0.190794
X    1.978757
Y    2.605967
Z    0.683509
Name: A, dtype: float64

In [21]:
df.iloc[2]  # rows can also be selected by integer position (0-based) instead of by label

W   -0.134841
X    0.390528
Y    0.166905
Z    0.184502
Name: C, dtype: float64

In [22]:
# Position 4 is the fifth (last) row, i.e. the row labelled E.
df.iloc[4]

W   -0.497104
X   -0.754070
Y   -0.943406
Z    0.484752
Name: E, dtype: float64

In [23]:
# Positional slice: rows at positions 0 and 1 (A and B); the stop position is excluded.
df.iloc[:2]

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119


 ### Selecting subset of rows and columns

In [24]:
# .loc[row label, column label] returns the single value at that intersection.
df.loc['B','W']  

0.3026654485851825

In [25]:
# Lists of labels select a sub-frame: rows A and D, columns X and Z.
df.loc[['A','D'],['X','Z']]

Unnamed: 0,X,Z
A,1.978757,0.683509
D,0.07296,0.329646


### Conditional Selection

In [26]:
# Full frame for reference before applying boolean filters.
df

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646
E,-0.497104,-0.75407,-0.943406,0.484752


In [27]:
# Element-wise comparison: returns a boolean frame with the same shape and labels.
df>0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,True,False,False
C,False,True,True,True
D,True,True,True,True
E,False,False,False,True


In [28]:
# Indexing with a boolean frame keeps values where the mask is True and
# leaves NaN everywhere else; the shape is unchanged.
df[df>0]

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,,
C,,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646
E,,,,0.484752


In [29]:
# A boolean Series used as the mask filters whole rows: keep rows where W is positive.
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119
D,0.807706,0.07296,0.638787,0.329646


In [30]:
# Keep only the rows where Y is positive.
df[df['Y']>0]

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646


In [31]:
# Keep only the rows where X is positive.
df[df['X']>0]

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646


In [32]:
# Full frame again for reference; the boolean selections above did not modify df.
df

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646
E,-0.497104,-0.75407,-0.943406,0.484752


In [33]:
# Column Y for the rows where W is positive, done as one .loc lookup
# (row mask, column label) instead of two chained [] selections. The chained
# form builds an intermediate frame and silently fails to write through if it
# is ever reused for assignment.
df.loc[df['W'] > 0, 'Y']

A    2.605967
B   -1.706086
D    0.638787
Name: Y, dtype: float64

In [34]:
# Keep only the rows where X is less than 1.
df[df['X']<1]

Unnamed: 0,W,X,Y,Z
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646
E,-0.497104,-0.75407,-0.943406,0.484752


In [35]:
# Column Z for the rows where X is less than 1, done as one .loc lookup
# (row mask, column label) instead of two chained [] selections.
df.loc[df['X'] < 1, 'Z']

C    0.184502
D    0.329646
E    0.484752
Name: Z, dtype: float64

In [36]:
# Columns Y and W (in that order) for the rows where X is less than 1, done
# as one .loc lookup (row mask, list of column labels) instead of chained [].
df.loc[df['X'] < 1, ['Y', 'W']]

Unnamed: 0,Y,W
C,0.166905,-0.134841
D,0.638787,0.807706
E,-0.943406,-0.497104


In [37]:
df[(df['X']>1) & (df['Y']>0)]  # & is the element-wise AND: a row is kept only if both conditions hold; each condition needs its own parentheses

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509


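##### To combine two conditions, wrap each condition in parentheses and join them with `&` (as in the cell above); Python's `and` keyword does not work on Series.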

## More Index details

In [38]:
# Current frame, still indexed by the labels A-E.
df

Unnamed: 0,W,X,Y,Z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646
E,-0.497104,-0.75407,-0.943406,0.484752


In [39]:
df.reset_index()  # returns a new frame with the default 0,1,2,...n index; the old labels become an 'index' column and df itself is unchanged

Unnamed: 0,index,W,X,Y,Z
0,A,0.190794,1.978757,2.605967,0.683509
1,B,0.302665,1.693723,-1.706086,-1.159119
2,C,-0.134841,0.390528,0.166905,0.184502
3,D,0.807706,0.07296,0.638787,0.329646
4,E,-0.497104,-0.75407,-0.943406,0.484752


In [40]:
# One state code per row of df, in row order.
newind = ['CA', 'NY', 'WY', 'OR', 'CO']

In [41]:
# Attach the state codes as a new 'States' column (this modifies df itself).
df['States'] = newind

In [42]:
# df now has the 'States' column alongside W-Z.
df

Unnamed: 0,W,X,Y,Z,States
A,0.190794,1.978757,2.605967,0.683509,CA
B,0.302665,1.693723,-1.706086,-1.159119,NY
C,-0.134841,0.390528,0.166905,0.184502,WY
D,0.807706,0.07296,0.638787,0.329646,OR
E,-0.497104,-0.75407,-0.943406,0.484752,CO


In [43]:
# Returns a new frame that uses the 'States' column as its index (the A-E
# labels are not kept in the result); df itself is not changed by this call.
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.190794,1.978757,2.605967,0.683509
NY,0.302665,1.693723,-1.706086,-1.159119
WY,-0.134841,0.390528,0.166905,0.184502
OR,0.807706,0.07296,0.638787,0.329646
CO,-0.497104,-0.75407,-0.943406,0.484752


In [44]:
# Unchanged: 'States' is still a regular column and the index is still A-E.
df

Unnamed: 0,W,X,Y,Z,States
A,0.190794,1.978757,2.605967,0.683509,CA
B,0.302665,1.693723,-1.706086,-1.159119,NY
C,-0.134841,0.390528,0.166905,0.184502,WY
D,0.807706,0.07296,0.638787,0.329646,OR
E,-0.497104,-0.75407,-0.943406,0.484752,CO


In [45]:
# inplace=True replaces df's own index with the 'States' column, so the cell
# displays nothing and the A-E labels are discarded.
# NOTE: not idempotent - running the cell a second time fails because
# 'States' is no longer a column.
df.set_index('States',inplace=True)

In [46]:
# 'States' is now the index of df; the A-E labels are gone.
df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.190794,1.978757,2.605967,0.683509
NY,0.302665,1.693723,-1.706086,-1.159119
WY,-0.134841,0.390528,0.166905,0.184502
OR,0.807706,0.07296,0.638787,0.329646
CO,-0.497104,-0.75407,-0.943406,0.484752


# Multi-Index and Index Hierarchy

In [47]:
# Index Levels
# Outer level: two groups of three rows each; inner level: 1-3 within each group.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
# Pair the two levels row by row and build the two-level index in one step,
# so hier_index is only ever bound to the finished MultiIndex.
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [48]:
# Inspect the MultiIndex: one (outside, inside) tuple per row.
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [49]:
# Rebind df to a new 6x2 frame of standard-normal draws: one row per
# (group, number) pair in hier_index, with columns A and B.
df = pd.DataFrame(
    data=np.random.randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.116773,1.901755
G1,2,0.238127,1.996652
G1,3,-0.993263,0.1968
G2,1,-1.136645,0.000366
G2,2,1.025984,-0.156598
G2,3,-0.031579,0.649826


In [50]:
# Selecting an outer-level label returns the sub-frame for that group,
# indexed by the remaining inner level.
df.loc['G1']

Unnamed: 0,A,B
1,-0.116773,1.901755
2,0.238127,1.996652
3,-0.993263,0.1968


In [51]:
# Drill down one level at a time: group 'G1' first, then inner label 1,
# which yields a single row as a Series.
df.loc['G1'].loc[1]

A   -0.116773
B    1.901755
Name: 1, dtype: float64

In [52]:
# The two index levels have no names yet (both are None).
df.index.names

FrozenList([None, None])

In [53]:
# Name the outer and inner index levels (modifies df's index in place) so
# they can be referred to by name, e.g. level='Num' in xs().
df.index.names=['Group','Num']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.116773,1.901755
G1,2,0.238127,1.996652
G1,3,-0.993263,0.1968
G2,1,-1.136645,0.000366
G2,2,1.025984,-0.156598
G2,3,-0.031579,0.649826


In [54]:
# Cross-section on the outer level: the rows of group 'G1', indexed by Num.
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.116773,1.901755
2,0.238127,1.996652
3,-0.993263,0.1968


In [55]:
# Cross-section on both levels at once: the row for group 'G1', number 1.
# The key must be a tuple with one label per level; a list key is deprecated
# in pandas 1.x and raises TypeError in pandas 2.x.
df.xs(('G1', 1))

A   -0.116773
B    1.901755
Name: (G1, 1), dtype: float64

In [56]:
# Cross-section on the inner level: the Num == 1 row from every group,
# indexed by Group.
df.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.116773,1.901755
G2,-1.136645,0.000366


In [57]:
# df still has its full (Group, Num) MultiIndex; the xs() calls above
# returned new objects and did not modify it.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.116773,1.901755
G1,2,0.238127,1.996652
G1,3,-0.993263,0.1968
G2,1,-1.136645,0.000366
G2,2,1.025984,-0.156598
G2,3,-0.031579,0.649826
