# DataFrames

1. DataFrames are the workhorse of pandas and are directly inspired by the R programming language.

2. We can think of a DataFrame as a bunch of Series objects put together to share the same index.

In [None]:
import os

import numpy as np
import pandas as pd
from numpy.random import randn

In [None]:
# Wrap a 5x4 block of random draws in a DataFrame. No labels are supplied,
# so both the index and the columns fall back to 0, 1, 2, ...
df = pd.DataFrame(data=randn(5, 4))
df

Unnamed: 0,0,1,2,3
0,-0.582607,-0.28499,0.423276,-0.513421
1,0.560265,-0.185103,-0.345445,-0.024035
2,-2.337827,0.528979,-1.376544,-1.835775
3,-0.357551,-0.008789,-0.09054,1.055292
4,-0.246316,0.388286,-1.260201,-0.171296


In [None]:
# str.split() with no argument splits on whitespace — a quick way to type
# a list of column labels without writing every quote and comma.
'W X Y Z'.split()

['W', 'X', 'Y', 'Z']

In [None]:
# Fix the seed so the random values (and every output derived from them)
# are reproducible under Restart Kernel -> Run All.
np.random.seed(101)

# 5x4 frame of random values with explicit row labels A-E and column labels W-Z.
# pd, np and randn come from the import cell at the top of the notebook,
# so they are not re-imported here.
df = pd.DataFrame(
    randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns='W X Y Z'.split(),
)

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-0.767763,0.660504,0.668966,-0.463456
B,1.255684,-0.064614,0.256751,-1.808088
C,-1.002467,0.92307,0.722617,1.314495
D,-2.068051,0.985353,-0.034651,-0.089781
E,-0.544011,0.099837,-0.636896,0.293113


In [None]:
# Location of the Advertising dataset. Set the ADVERTISING_CSV environment
# variable to load it from somewhere else; the fallback is the path this
# notebook was originally written against.
adv_csv_path = os.environ.get(
    "ADVERTISING_CSV",
    r"C:\Users\home\Desktop\Machine Learning\datasets\Advertising.csv",
)
adv = pd.read_csv(adv_csv_path)

In [None]:
adv

Unnamed: 0,Month,TV,radio,newspaper,sales
0,Jan-00,230.1,37.8,69.2,22.1
1,Feb-00,44.5,39.3,45.1,10.4
2,Mar-00,17.2,45.9,69.3,9.3
3,Apr-00,151.5,41.3,58.5,18.5
4,May-00,180.8,10.8,58.4,12.9
...,...,...,...,...,...
195,Apr-16,38.2,3.7,13.8,7.6
196,May-16,94.2,4.9,8.1,9.7
197,Jun-16,177.0,9.3,6.4,12.8
198,Jul-16,283.6,42.0,66.2,25.5


## Selection and Indexing

Let's look at the various methods to grab data from a DataFrame.

In [None]:
# Bracket notation with a single column label returns that column as a Series.
# (The previous statement referenced an undefined `Advertising` frame; the
# recorded output of this cell is the W column of df.)
df['W']

A   -0.767763
B    1.255684
C   -1.002467
D   -2.068051
E   -0.544011
Name: W, dtype: float64

In [None]:
# A single-label lookup with [] hands back the column as a pandas Series.
print(type(df["W"]))

<class 'pandas.core.series.Series'>


In [None]:
# A list of labels inside [] selects several columns at once; the result is
# a DataFrame (see the type reported by the last line), not a Series.
print(df[['W',"Y"]])
type(df[['W',"Y"]])

          W         Y
A  1.445210  1.439274
B -0.702201 -0.165721
C  1.166843  0.190403
D  1.043863  0.491796
E  0.341416 -0.540053


pandas.core.frame.DataFrame

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-0.767763,0.660504,0.668966,-0.463456
B,1.255684,-0.064614,0.256751,-1.808088
C,-1.002467,0.92307,0.722617,1.314495
D,-2.068051,0.985353,-0.034651,-0.089781
E,-0.544011,0.099837,-0.636896,0.293113


In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-0.767763,0.660504,0.668966,-0.463456
B,1.255684,-0.064614,0.256751,-1.808088
C,-1.002467,0.92307,0.722617,1.314495
D,-2.068051,0.985353,-0.034651,-0.089781
E,-0.544011,0.099837,-0.636896,0.293113


DataFrame Columns are just Series

In [None]:
# Each column pulled out with [] is a Series object.
type(df['W'])

pandas.core.series.Series

**Creating a new column:**

In [None]:
# Assigning to a label that does not exist yet creates the column; here it
# holds the row-by-row sum of W and Y.
df['new'] = df['W'] + df['Y']

In [None]:
# A plain list works too, as long as it has one entry per row (five here).
# np.nan marks the missing value for row B.
df['new1'] = [1,np.nan,3,4,5]

In [None]:
df

Unnamed: 0,W,X,Y,Z,new1
A,1.44521,1.301137,1.439274,0.169159,1.0
B,-0.702201,0.197284,-0.165721,-0.342425,
C,1.166843,-0.123494,0.190403,1.834043,3.0
D,1.043863,0.327758,0.491796,-0.021619,4.0
E,0.341416,-0.105648,-0.540053,0.05722,5.0


In [None]:
# randn(5, 2) produces a two-dimensional array: 5 rows by 2 columns.
randn(5,2)

array([[-0.88079861, -0.76467712],
       [-1.16502789,  0.05966052],
       [ 0.15418213, -0.26669924],
       [-0.63395441, -0.77129099],
       [-0.22988838, -0.68072576]])

In [None]:
# A single column needs a one-dimensional array with one value per row.
# randn(5, 2) is 5x2 and cannot be stored under one column label, so draw
# exactly five values instead.
df['new1'] = randn(5)

In [None]:
df

Unnamed: 0,W,X,Y,Z,new,new1
A,-0.767763,0.660504,0.668966,-0.463456,-0.098796,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,1.512435,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,-0.279851,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,-2.102702,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-1.180907,-0.277772


**Removing Columns and Rows**

In [None]:
# drop() removes index labels (rows) by default; naming the axis through the
# `index` keyword makes that explicit. The result is a new frame.
df.drop(index='A')

Unnamed: 0,W,X,Y,Z,new1
B,-0.702201,0.197284,-0.165721,-0.342425,
C,1.166843,-0.123494,0.190403,1.834043,3.0
D,1.043863,0.327758,0.491796,-0.021619,4.0
E,0.341416,-0.105648,-0.540053,0.05722,5.0


In [None]:
# drop() is not in place unless inplace=True is passed (or the result is
# assigned back): df still contains row 'A' here.
df

Unnamed: 0,W,X,Y,Z,new,new1
A,-0.767763,0.660504,0.668966,-0.463456,-0.098796,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,1.512435,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,-0.279851,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,-2.102702,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-1.180907,-0.277772


In [None]:
# Keep the result of the drop under its own name; df itself is left untouched.
dfd = df.drop(index="B")

In [None]:
dfd

Unnamed: 0,W,X,Y,Z,new,new1
A,-0.767763,0.660504,0.668966,-0.463456,-0.098796,-1.327585
C,-1.002467,0.92307,0.722617,1.314495,-0.279851,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,-2.102702,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-1.180907,-0.277772


In [None]:
# Remove the 'new' column from df itself; `columns=` is the keyword form of axis=1.
df.drop(columns='new', inplace=True)

In [None]:
df

Unnamed: 0,W,X,Y,Z,new1
A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-0.277772


Can also drop rows this way:

In [None]:
# Row labels are dropped along axis 0; `index=` states that directly.
# Returns a new frame without row E.
df.drop(index='E')

Unnamed: 0,W,X,Y,Z,new1
A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,1.213534


**Selecting Rows**

In [None]:
df

Unnamed: 0,W,X,Y,Z,new1
A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-0.277772


In [None]:
# Label-based row selection: rows A and B, every column.
df.loc[['A', 'B'], :]

Unnamed: 0,W,X,Y,Z,new1
A,1.44521,1.301137,1.439274,0.169159,1.0
B,-0.702201,0.197284,-0.165721,-0.342425,


Or select based on position instead of label, using `iloc`:

In [None]:
df

Unnamed: 0,W,X,Y,Z,new1
A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-0.277772


In [None]:
# iloc is position-based: rows at positions 2 and 3 (the stop value 4 is
# excluded) and every column from position 1 onwards.
df.iloc[2:4,1:]

Unnamed: 0,X,Y,Z,new1
C,-0.123494,0.190403,1.834043,3.0
D,0.327758,0.491796,-0.021619,4.0


**Selecting a subset of rows and columns**

In [None]:
df

Unnamed: 0,index,W,X,Y,Z,new1
0,A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
1,B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
2,C,-1.002467,0.92307,0.722617,1.314495,1.276368
3,D,-2.068051,0.985353,-0.034651,-0.089781,1.213534
4,E,-0.544011,0.099837,-0.636896,0.293113,-0.277772


In [None]:
# .loc[row_label, column_label] with two scalar labels returns a single value.
df.loc['B','Y']

0.2567508599091483

In [None]:
# Lists of labels on both axes return the matching sub-DataFrame
# (rows A and C, columns X and Z).
df.loc[['A','C'],['X','Z']]

Unnamed: 0,X,Z
A,0.660504,-0.463456
C,0.92307,1.314495


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [None]:
# Attach a column of strings, one value per row, then display the frame.
# NOTE(review): this column is dropped again in the next cell — presumably so
# the numeric comparisons that follow only see numeric columns; confirm intent.
df['name']=['amas','tama','aaap','cap','qw']
df

In [None]:
# Remove the 'name' column from df itself; `columns=` is the keyword form of axis=1.
df.drop(columns="name", inplace=True)

In [None]:
# Comparing the whole frame with a scalar gives a frame of booleans with the
# same index and columns.
df>0

Unnamed: 0,W,X,Y,Z,new1
A,True,True,True,True,True
B,False,True,False,False,False
C,True,False,True,True,True
D,True,True,True,False,True
E,True,False,False,True,True


In [None]:
# Indexing with a boolean frame keeps the values where the mask is True and
# shows NaN everywhere else; the shape is unchanged.
df[df>0]

Unnamed: 0,W,X,Y,Z
A,0.137818,0.780491,0.145414,
B,,0.396979,0.169224,
C,2.36866,2.439409,,
D,0.321878,0.562053,,0.497364
E,,0.907835,0.914738,


In [None]:
# Comparing a single column with a scalar gives a boolean Series that carries
# the same index labels.
df['W']==0


A    False
B    False
C    False
D    False
E    False
Name: W, dtype: bool

In [None]:
df

Unnamed: 0,0,1,2,3
0,-0.582607,-0.28499,0.423276,-0.513421
1,0.560265,-0.185103,-0.345445,-0.024035
2,-2.337827,0.528979,-1.376544,-1.835775
3,-0.357551,-0.008789,-0.09054,1.055292
4,-0.246316,0.388286,-1.260201,-0.171296


In [None]:
# Row filter and column pick in one .loc call: Z values of the rows where W is positive.
df.loc[df['W'] > 0, 'Z']

A   -0.621692
C   -1.270337
D    0.497364
Name: Z, dtype: float64

In [None]:
# Give the filtered Series a name before displaying it: Z values of the rows
# where W is positive. `abc` was never assigned anywhere in the notebook, so
# the bare reference failed on a fresh kernel.
abc = df.loc[df['W'] > 0, 'Z']
abc

A   -0.621692
C   -1.270337
D    0.497364
Name: Z, dtype: float64

In [None]:
# Rows where W is positive, restricted to columns Y, X and W (in that order).
df.loc[df['W'] > 0, ['Y', 'X', 'W']]

Unnamed: 0,Y,X,W
B,0.256751,-0.064614,1.255684


In [None]:
# Call the method — without the parentheses the cell only displays the bound
# method object. dropna() returns a new frame without the rows that contain
# missing values; df itself is not modified.
df.dropna()

For two conditions you can combine them with `|` (or) and `&` (and), wrapping each condition in parentheses:

In [None]:
# Combine two row conditions. Each comparison sits in its own parentheses
# because & and | bind more tightly than > ; & keeps the rows that satisfy
# both conditions (use | to keep rows that satisfy either one).
df[(df['W'] > 0) & (df['Y'] > 0)]

## More Index Details

Let's discuss some more features of indexing, including resetting the index or setting it to something else.

In [None]:
df

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,0.137818,0.780491,0.145414,-0.621692
B,-0.798901,0.396979,0.169224,-0.865485
C,2.36866,2.439409,-0.128689,-1.270337
D,0.321878,0.562053,-1.132569,0.497364
E,-0.382079,0.907835,0.914738,-1.263335


In [None]:
# Reset to the default 0,1...n index; the previous labels are kept as a new
# 'index' column.
# NOTE(review): inplace=True makes this cell non-idempotent — the 'level_0'
# entry in the df.columns output further down suggests it was executed twice.
# Run it once per freshly built frame.
df.reset_index(inplace = True)

In [None]:
df

Unnamed: 0,index,W,X,Y,Z
0,A,0.137818,0.780491,0.145414,-0.621692
1,B,-0.798901,0.396979,0.169224,-0.865485
2,C,2.36866,2.439409,-0.128689,-1.270337
3,D,0.321878,0.562053,-1.132569,0.497364
4,E,-0.382079,0.907835,0.914738,-1.263335


In [None]:
# .columns lists the column labels as an Index object.
df.columns

Index(['level_0', 'index', 'W', 'X', 'Y', 'Z', 'new1'], dtype='object')

In [None]:
# Five state codes, one per row of df, built by splitting on whitespace.
newind = 'CA NY WY OR CO'.split()

In [None]:
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [None]:
# Assigning a list fills the column by position: the i-th state goes to the i-th row.
df['States'] = newind

In [None]:
df

Unnamed: 0,index,W,X,Y,Z,States
0,A,0.137818,0.780491,0.145414,-0.621692,CA
1,B,-0.798901,0.396979,0.169224,-0.865485,NY
2,C,2.36866,2.439409,-0.128689,-1.270337,WY
3,D,0.321878,0.562053,-1.132569,0.497364,OR
4,E,-0.382079,0.907835,0.914738,-1.263335,CO


In [None]:
# Promote the States column to be the row index, modifying df itself.
df.set_index(keys='States', inplace=True)

In [None]:
df

Unnamed: 0_level_0,index,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,A,0.137818,0.780491,0.145414,-0.621692
NY,B,-0.798901,0.396979,0.169224,-0.865485
WY,C,2.36866,2.439409,-0.128689,-1.270337
OR,D,0.321878,0.562053,-1.132569,0.497364
CO,E,-0.382079,0.907835,0.914738,-1.263335


In [None]:
# Move the current index back into an ordinary column (drop=False is the
# default, spelled out here) and restore the 0..n-1 index on df itself.
df.reset_index(drop=False, inplace=True)

In [None]:
# Make States the row index once more, modifying df itself.
df.set_index(keys='States', inplace=True)

In [None]:
df

# END

In [None]:
import numpy as np

In [None]:
# 3x3 integer array used in the following cells to show boolean masking in plain NumPy.
a = np.array([
    [1, 3, 4],
    [9, 0, 1],
    [1, 0, 3],
])

In [None]:
a

In [None]:
# Element-wise comparison: a boolean array with the same shape as a.
a==1

In [None]:
# Boolean-mask assignment: every element equal to 1 is overwritten with 22, in place.
a[a==1]=22

In [None]:
a

In [None]:
# A single integer with iloc returns the row at that position (the second row) as a Series.
df.iloc[1]

In [None]:
# factorize encodes the values of that row as integer codes and returns a
# (codes, uniques) pair.
pd.factorize(df.iloc[1])

In [None]:
# Column-oriented records: each key is a column name and each list holds five values.
d1 = dict(
    Gender=['M', 'F', 'M', 'M', 'F'],
    Age=[10, 23, 22, 12, 33],
)