## DataFrame (the Real Meat!)

In [2]:
from numpy.random import randn as rn
import numpy as np
import pandas as pd

## Creating and accessing DataFrame
* Indexing
* Adding and deleting rows and columns
* Subsetting DataFrame

In [4]:
# Seed NumPy's global generator with 101 so the same random numbers are
# drawn on every run of this cell.
np.random.seed(101)
matrix_data = rn(5, 4)  # 5 rows x 4 columns of random values
row_label = ['A', 'B', 'C', 'D', 'E']
column_heading = ['W', 'X', 'Y', 'Z']

# Wrap the raw array in a DataFrame with labelled rows and columns.
df = pd.DataFrame(data=matrix_data, index=row_label, columns=column_heading)
print("\nThe data frame looks like\n" + '-' * 45)
print(df)


The data frame looks like
---------------------------------------------
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509


In [8]:
# Re-seed with the same value (101) so this draw reproduces the array
# that `df` was built from.
np.random.seed(101)
matrix_data = rn(5, 4)
# Bare last expression: the notebook displays the raw NumPy array.
matrix_data

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

### Indexing and slicing (columns)
* By bracket method
* By DOT method (NOT recommended — it breaks for column names that contain spaces or collide with DataFrame attributes/methods)

In [9]:
# A single label inside the brackets returns that column as a Series.
print("\nThe 'X' column\n" + '-' * 25)
print(df['X'])
print("\nType of the column: " + str(type(df['X'])))

# A list of labels inside the brackets returns those columns as a DataFrame.
print("\nThe 'X' and 'Z' columns indexed by passing a list\n" + '-' * 55)
print(df[['X', 'Z']])
print("\nType of the pair of columns: " + str(type(df[['X', 'Z']])))
print("\nSo, for more than one column, the object turns into a DataFrame")


The 'X' column
-------------------------
A    0.628133
B   -0.319318
C    0.740122
D   -0.758872
E    1.978757
Name: X, dtype: float64

Type of the column: <class 'pandas.core.series.Series'>

The 'X' and 'Z' columns indexed by passing a list
-------------------------------------------------------
          X         Z
A  0.628133  0.503826
B -0.319318  0.605965
C  0.740122 -0.589001
D -0.758872  0.955057
E  1.978757  0.683509

Type of the pair of columns: <class 'pandas.core.frame.DataFrame'>

So, for more than one column, the object turns into a DataFrame


In [10]:
# Attribute (dot) access returns the same Series as df['X'].
print("\nThe 'X' column accessed by DOT method (NOT recommended)\n" + '-' * 55)
print(df.X)


The 'X' column accessed by DOT method (NOT recommended)
-------------------------------------------------------
A    0.628133
B   -0.319318
C    0.740122
D   -0.758872
E    1.978757
Name: X, dtype: float64


### Creating and deleting a (new) column (or row)

In [13]:
# Assigning to a new label creates a column; both new columns hold X + Z.
print("\nA column is created by assigning it in relation to an existing column\n" + '-' * 75)
df['New'] = df['X'] + df['Z']
df['New (Sum of X and Z)'] = df['X'] + df['Z']
print(df)

# drop() works on rows by default (axis=0), so axis=1 is needed to drop a
# column. The result is a new frame, which is bound back to `df` here.
print("\nA column is dropped by using df.drop() method\n" + '-' * 55)
df = df.drop('New', axis=1)
print(df)

# With the default axis the label is looked up in the row index. The result
# goes to `df1`, so `df` still contains row 'A'.
df1 = df.drop('A')
print("\nA row (index) is dropped by using df.drop() method and axis=0\n" + '-' * 65)
print(df1)

# inplace=True modifies `df` itself instead of returning a new frame.
print("\nAn in-place change can be done by making inplace=True in the drop method\n" + '-' * 75)
df.drop('New (Sum of X and Z)', axis=1, inplace=True)
print(df)


A column is created by assigning it in relation to an existing column
---------------------------------------------------------------------------
          W         X         Y         Z  New (Sum of X and Z)       New
A  2.706850  0.628133  0.907969  0.503826              1.131958  1.131958
B  0.651118 -0.319318 -0.848077  0.605965              0.286647  0.286647
C -2.018168  0.740122  0.528813 -0.589001              0.151122  0.151122
D  0.188695 -0.758872 -0.933237  0.955057              0.196184  0.196184
E  0.190794  1.978757  2.605967  0.683509              2.662266  2.662266

A column is dropped by using df.drop() method
-------------------------------------------------------
          W         X         Y         Z  New (Sum of X and Z)
A  2.706850  0.628133  0.907969  0.503826              1.131958
B  0.651118 -0.319318 -0.848077  0.605965              0.286647
C -2.018168  0.740122  0.528813 -0.589001              0.151122
D  0.188695 -0.758872 -0.933237  0.955057         

### Selecting/indexing Rows
* Label-based 'loc' method
* Integer-position-based 'iloc' method

In [16]:
# .loc selects rows by index label: one label gives a Series,
# a list of labels gives a DataFrame.
print("\nLabel-based 'loc' method can be used for selecting row(s)\n" + '-' * 60)
print("\nSingle row\n")
print(df.loc['C'])
print("\nMultiple rows\n")
print(df.loc[['B', 'C']])

# .iloc selects rows by zero-based integer position; position 2 is row 'C'
# and positions 1 and 2 are rows 'B' and 'C'.
print("\nIndex position based 'iloc' method can be used for selecting row(s)\n" + '-' * 70)
print("\nSingle row\n")
print(df.iloc[2])
print("\nMultiple rows\n")
print(df.iloc[[1, 2]])


Label-based 'loc' method can be used for selecting row(s)
------------------------------------------------------------

Single row

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

Multiple rows

          W         X         Y         Z
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001

Index position based 'iloc' method can be used for selecting row(s)
----------------------------------------------------------------------

Single row

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

Multiple rows

          W         X         Y         Z
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001


### Subsetting DataFrame

In [18]:
# Rebuild the 5x4 frame from the same seed so this section starts from a
# known state, whatever earlier cells did to `df`.
np.random.seed(101)
matrix_data = rn(5, 4)
row_labels = ['A', 'B', 'C', 'D', 'E']
column_headings = ['W', 'X', 'Y', 'Z']
df = pd.DataFrame(matrix_data, index=row_labels, columns=column_headings)

print("\nThe DatFrame\n" + '-' * 45)
print(df)

# .loc[row_label, column_label] with two scalar labels returns one element.
print("\nElement at row 'B' and column 'Y' is\n")
print(df.loc['B', 'Y'])

# .loc with a list of row labels and a list of column labels returns a
# sub-DataFrame; as the cell's last expression it is displayed by the notebook.
print("\nSubset comprising of rows B and D, and columns W and Y, is\n")
df.loc[['B', 'D'], ['W', 'Y']]


The DatFrame
---------------------------------------------
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509

Element at row 'B' and column 'Y' is

-0.8480769834036315

Subset comprising of rows B and D, and columns W and Y, is



Unnamed: 0,W,Y
B,0.651118,-0.848077
D,0.188695,-0.933237
