# DataFrame


## Quick Start

In [30]:
import pandas as pd

test_df = pd.DataFrame(data=
            {
                "AAA":[1,2,3,4],
                "BBB":[5,6,7,8],
                "CCC":[9,10,11,12]
            },index=["A","B","C","D"])
test_df

Unnamed: 0,AAA,BBB,CCC
A,1,5,9
B,2,6,10
C,3,7,11
D,4,8,12


### if-else
key word: loc

In [9]:
test_df.loc[test_df.AAA>=2,["BBB","CCC"]] = 10
test_df

Unnamed: 0,AAA,BBB,CCC
A,1,5,9
B,2,10,10
C,3,10,10
D,4,10,10


### if-then-else
key word: where
1. pandas DataFrame where
2. numpy where

In [28]:
import numpy as np

test_df = pd.DataFrame(data=
            {
                "AAA":[1,2,3,4],
                "BBB":[5,6,7,8],
                "CCC":[9,10,11,12]
            },index=["A","B","C","D"])

test_df["logic"] = np.where(test_df["AAA"]>2,"high","low")
test_df

Unnamed: 0,AAA,BBB,CCC,logic
A,1,5,9,low
B,2,6,10,low
C,3,7,11,high
D,4,8,12,high


In [None]:
test_df["new_logic"] = test_df.where()

In [23]:
# numpy where
# np.where(condition, x, y)
aa = 5 * np.ones(10)
xx = np.linspace(2,10,20)
yy = np.arange(11,25,0.5)

l = [x if x<a else y for a,x,y in zip(aa,xx,yy)]
l

[2.0,
 2.4210526315789473,
 2.8421052631578947,
 3.263157894736842,
 3.6842105263157894,
 4.105263157894736,
 4.526315789473684,
 4.947368421052632,
 15.0,
 15.5]

In [68]:
# pandas DataFrame虽然是二维numpy array 
# 但是其数据定位仍旧是行+列的双重格式
test_df = pd.DataFrame(data=
            {
                "AAA":[1,2,3,4],
                "BBB":[5,6,7,8],
                "CCC":[9,10,11,12]
            },index=["A","B","C","D"])

c1 = np.array([1,0,0,1],dtype=np.bool_)
c2 = np.array([0,0,0,1],dtype=np.bool_)
c3 = np.array([0,0,1,0],dtype=np.bool_)
where_mask = np.stack([c1,c2,c3],axis=1)
where_masks = pd.DataFrame(where_mask,columns=["AAA","BBB","CCC"],index=["A","B","C","D"])

test_df.where(where_masks,10)

Unnamed: 0,AAA,BBB,CCC
A,1,10,10
B,10,10,10
C,10,10,11
D,4,8,10


### df.loc iloc []
1. df.loc[] -- label oriented 是闭区间
2. df.iloc[] -- position oriented 是右开区间
3. df[] 是选择，第一个参数是条件，第二个参数是列范围
4. df.index.isin() -- 一个很好用的方法，通过inedx进行选择

In [74]:
test_array = np.random.normal(0,1,(4,3))
test_df = pd.DataFrame(data=test_array,columns=["AAA","BBB","CCC"],index=["A","B","C","D"])

test_df

Unnamed: 0,AAA,BBB,CCC
A,0.39532,-0.36142,0.563441
B,-1.306173,1.958097,-0.851067
C,-0.644516,0.638778,0.098307
D,1.622761,-0.804671,-0.034382


In [79]:
test_df[(test_df.AAA>0.3)&(test_df.index.isin(["A","C"]))]

Unnamed: 0,AAA,BBB,CCC
A,0.39532,-0.36142,0.563441


### dynamic column creation
1. python built in function: **apply applymap**
2. pandas column inner logic: just as list - **DataFrame.columns**
3. pandas **lambda function** - functional programming

In [84]:
col_df = pd.DataFrame(data={
    "AAA": [1,2,4,6],
    "BBB": [1,3,5,2],
    "CCC": [2,6,3,4]
})

col_df

Unnamed: 0,AAA,BBB,CCC
0,1,1,2
1,2,3,6
2,4,5,3
3,6,2,4


In [87]:
categories = {
    1: "Cat",
    2: "Dog",
    3: "Lizard",
    4: "Snake",
    5: "Rabbit",
    6: "Goldfish"
}

source_col = col_df.columns
new_col = [str(x)+"_pet" sfor x in source_col]
col_df[new_col] = col_df[source_col].applymap(categories.get)

col_df

Unnamed: 0,AAA,BBB,CCC,AAA_pet,BBB_pet,CCC_pet
0,1,1,2,Cat,Cat,Dog
1,2,3,6,Dog,Lizard,Goldfish
2,4,5,3,Snake,Rabbit,Lizard
3,6,2,4,Goldfish,Dog,Snake


### apply applymap [only used for pandas, depricated in python]
1. apply is mainly used for Series - which is only 1D array
2. applymap is mainly used for DataFrame - which is 2D array
3. new function **numpy.random.shuffle**: return None, just shuffle data from original array

In [109]:
apply_array = np.linspace(10,30,48)
np.random.shuffle(apply_array)
apply_array = apply_array.reshape(6,8)

apply_df = pd.DataFrame(data=apply_array)
apply_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,22.340426,11.702128,14.255319,24.042553,16.808511,20.638298,20.212766,18.93617
1,15.531915,12.12766,24.893617,18.510638,17.659574,27.021277,22.765957,10.0
2,10.851064,19.361702,29.148936,15.106383,25.319149,13.404255,18.085106,23.191489
3,27.446809,17.234043,10.425532,26.170213,28.723404,21.914894,12.553191,14.680851
4,23.617021,21.06383,30.0,24.468085,28.297872,21.489362,26.595745,19.787234
5,13.829787,12.978723,11.276596,15.957447,16.382979,27.87234,25.744681,29.574468


In [110]:
# when axis=1, all operations are implemented over rows
apply_df.apply(lambda x: max(x) - min(x),axis=1)

0    12.340426
1    17.021277
2    18.297872
3    18.297872
4    10.212766
5    18.297872
dtype: float64

In [111]:
# when axis=0, all operations are implemented over columns
apply_df.apply(lambda x: max(x) - min(x), axis=0)

0    16.595745
1     9.361702
2    19.574468
3    11.063830
4    12.340426
5    14.468085
6    14.042553
7    19.574468
dtype: float64

In [113]:
apply_df.applymap(lambda x: 1 if x>20 else 0)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1,0,0,1,0,1,1,0
1,0,0,1,0,0,1,1,0
2,0,0,1,0,1,0,0,1
3,1,0,0,1,1,1,0,0
4,1,1,1,1,1,1,1,0
5,0,0,0,0,0,1,1,1
