Pandas의 자료구조 - Series와 DataFrame

In [47]:
import numpy as np
import pandas as pd

In [5]:
obj = pd.Series([4, 7, -5, 3])

In [7]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [9]:
obj.values   #value로 하면 에러남

array([ 4,  7, -5,  3], dtype=int64)

In [11]:
obj.index # '='으로 하면 에러남

RangeIndex(start=0, stop=4, step=1)

In [12]:
obj.dtypes

dtype('int64')

In [14]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'e'])

In [17]:
obj2.index

Index(['d', 'b', 'a', 'e'], dtype='object')

In [19]:
obj2

d    4
b    7
a   -5
e    3
dtype: int64

In [24]:
sdata = {'Ch': 35000, 'Ki': 71000, 'Ha': 16000, 'Sa': 5000}
obj3 = pd.Series(sdata)

In [25]:
obj3

Ch    35000
Ki    71000
Ha    16000
Sa     5000
dtype: int64

In [26]:
obj3.name = 'Salary'
obj3.index.name = 'Names'
obj3

Names
Ch    35000
Ki    71000
Ha    16000
Sa     5000
Name: Salary, dtype: int64

In [27]:
obj3.index = ['A', 'B', 'C', 'D']

In [28]:
obj3

A    35000
B    71000
C    16000
D     5000
Name: Salary, dtype: int64

In [32]:
data = {'names': ["Kilho", 'Kilho', 'Kilho', 'Charles', 'Charles'],
       'year': [2014, 2015, 2016, 2015, 2016],
       'points': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = pd.DataFrame(data)

In [33]:
df

Unnamed: 0,names,year,points
0,Kilho,2014,1.5
1,Kilho,2015,1.7
2,Kilho,2016,3.6
3,Charles,2015,2.4
4,Charles,2016,2.9


In [34]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [35]:
df.columns

Index(['names', 'year', 'points'], dtype='object')

In [36]:
df.values

array([['Kilho', 2014, 1.5],
       ['Kilho', 2015, 1.7],
       ['Kilho', 2016, 3.6],
       ['Charles', 2015, 2.4],
       ['Charles', 2016, 2.9]], dtype=object)

In [38]:
df.index.name = 'Num'
df.columns.name = 'Info'

In [39]:
df

Info,names,year,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Kilho,2014,1.5
1,Kilho,2015,1.7
2,Kilho,2016,3.6
3,Charles,2015,2.4
4,Charles,2016,2.9


In [40]:
df2 = pd.DataFrame(data, columns=['year', 'name', 'points', 'penalty'],
                  index=['one', 'two', 'three', 'four', 'five'])

In [41]:
df2

Unnamed: 0,year,name,points,penalty
one,2014,,1.5,
two,2015,,1.7,
three,2016,,3.6,
four,2015,,2.4,
five,2016,,2.9,


In [42]:
df2.index

Index(['one', 'two', 'three', 'four', 'five'], dtype='object')

In [43]:
df2.columns

Index(['year', 'name', 'points', 'penalty'], dtype='object')

In [44]:
df2.values

array([[2014, nan, 1.5, nan],
       [2015, nan, 1.7, nan],
       [2016, nan, 3.6, nan],
       [2015, nan, 2.4, nan],
       [2016, nan, 2.9, nan]], dtype=object)

In [46]:
df2.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,2015.2,2.42
std,0.83666,0.864292
min,2014.0,1.5
25%,2015.0,1.7
50%,2015.0,2.4
75%,2016.0,2.9
max,2016.0,3.6


DataFrame 인덱싱 이해

In [50]:
data = {'names': ['Ki', 'Ki', 'Ki', 'Ch', 'Ch'],
       'year': [2014, 2015, 2016, 2015, 2016],
       'points': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = pd.DataFrame(data, columns=['year', 'names', 'points', 'penalty'],
                 index=['one', 'two', 'three', 'four', 'five'])

In [51]:
df

Unnamed: 0,year,names,points,penalty
one,2014,Ki,1.5,
two,2015,Ki,1.7,
three,2016,Ki,3.6,
four,2015,Ch,2.4,
five,2016,Ch,2.9,


In [52]:
df['year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [53]:
df.year

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [54]:
df.names

one      Ki
two      Ki
three    Ki
four     Ch
five     Ch
Name: names, dtype: object

In [56]:
df.penalty

one      NaN
two      NaN
three    NaN
four     NaN
five     NaN
Name: penalty, dtype: object

In [58]:
df[['year', 'points']]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2015,2.4
five,2016,2.9


In [59]:
df['penalty'] = 0.5

In [60]:
df

Unnamed: 0,year,names,points,penalty
one,2014,Ki,1.5,0.5
two,2015,Ki,1.7,0.5
three,2016,Ki,3.6,0.5
four,2015,Ch,2.4,0.5
five,2016,Ch,2.9,0.5


In [62]:
df['penalty'] = (0.1, 0.2, 0.3, 0.4, 0.5)

In [63]:
df

Unnamed: 0,year,names,points,penalty
one,2014,Ki,1.5,0.1
two,2015,Ki,1.7,0.2
three,2016,Ki,3.6,0.3
four,2015,Ch,2.4,0.4
five,2016,Ch,2.9,0.5


In [64]:
df['zeros'] = np.arange(5)

In [65]:
df

Unnamed: 0,year,names,points,penalty,zeros
one,2014,Ki,1.5,0.1,0
two,2015,Ki,1.7,0.2,1
three,2016,Ki,3.6,0.3,2
four,2015,Ch,2.4,0.4,3
five,2016,Ch,2.9,0.5,4


In [66]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

In [67]:
df['debt'] = val

In [68]:
df

Unnamed: 0,year,names,points,penalty,zeros,debt
one,2014,Ki,1.5,0.1,0,
two,2015,Ki,1.7,0.2,1,-1.2
three,2016,Ki,3.6,0.3,2,
four,2015,Ch,2.4,0.4,3,-1.5
five,2016,Ch,2.9,0.5,4,-1.7


In [69]:
df['net_points'] = df['points'] - df['penalty']

In [70]:
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points
one,2014,Ki,1.5,0.1,0,,1.4
two,2015,Ki,1.7,0.2,1,-1.2,1.5
three,2016,Ki,3.6,0.3,2,,3.3
four,2015,Ch,2.4,0.4,3,-1.5,2.0
five,2016,Ch,2.9,0.5,4,-1.7,2.4


In [74]:
df['high_points'] = df['net_points'] > 2.0

In [75]:
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,high_points
one,2014,Ki,1.5,0.1,0,,1.4,False
two,2015,Ki,1.7,0.2,1,-1.2,1.5,False
three,2016,Ki,3.6,0.3,2,,3.3,True
four,2015,Ch,2.4,0.4,3,-1.5,2.0,False
five,2016,Ch,2.9,0.5,4,-1.7,2.4,True


In [76]:
del df['high_points']

In [77]:
del df['net_points']

In [78]:
del df['zeros']

In [79]:
df

Unnamed: 0,year,names,points,penalty,debt
one,2014,Ki,1.5,0.1,
two,2015,Ki,1.7,0.2,-1.2
three,2016,Ki,3.6,0.3,
four,2015,Ch,2.4,0.4,-1.5
five,2016,Ch,2.9,0.5,-1.7


In [80]:
df.columns

Index(['year', 'names', 'points', 'penalty', 'debt'], dtype='object')

In [82]:
df.index.name = 'Order'
df.columns.name = 'Info'

In [83]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014,Ki,1.5,0.1,
two,2015,Ki,1.7,0.2,-1.2
three,2016,Ki,3.6,0.3,
four,2015,Ch,2.4,0.4,-1.5
five,2016,Ch,2.9,0.5,-1.7


In [84]:
df[0:3]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014,Ki,1.5,0.1,
two,2015,Ki,1.7,0.2,-1.2
three,2016,Ki,3.6,0.3,


In [85]:
df['two':'four']  # 별로 권장되는 방법이 아님

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2015,Ki,1.7,0.2,-1.2
three,2016,Ki,3.6,0.3,
four,2015,Ch,2.4,0.4,-1.5


In [86]:
# .loc, .iloc 를 사용하기 바람

df.loc['two']

Info
year       2015
names        Ki
points      1.7
penalty     0.2
debt       -1.2
Name: two, dtype: object

In [87]:
df.loc['two':'four']

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2015,Ki,1.7,0.2,-1.2
three,2016,Ki,3.6,0.3,
four,2015,Ch,2.4,0.4,-1.5


In [88]:
df.loc['two':'four', 'points']

Order
two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [89]:
df.loc[:, 'year']

Order
one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [90]:
df['year']

Order
one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [91]:
df.loc[:, ['year', 'names']]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,2014,Ki
two,2015,Ki
three,2016,Ki
four,2015,Ch
five,2016,Ch


In [93]:
df.loc['three':'five', 'names':'penalty']

Info,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
three,Ki,3.6,0.3
four,Ch,2.4,0.4
five,Ch,2.9,0.5


In [94]:
df.loc['six', :] = [2013, 'Ha', 4.0, 0.1, 2.1]

In [95]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014.0,Ki,1.5,0.1,
two,2015.0,Ki,1.7,0.2,-1.2
three,2016.0,Ki,3.6,0.3,
four,2015.0,Ch,2.4,0.4,-1.5
five,2016.0,Ch,2.9,0.5,-1.7
six,2013.0,Ha,4.0,0.1,2.1


In [98]:
df.loc['one', 'debt'] = [0.2]
df.loc['three', 'debt'] = [-1.1]

In [99]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014.0,Ki,1.5,0.1,0.2
two,2015.0,Ki,1.7,0.2,-1.2
three,2016.0,Ki,3.6,0.3,-1.1
four,2015.0,Ch,2.4,0.4,-1.5
five,2016.0,Ch,2.9,0.5,-1.7
six,2013.0,Ha,4.0,0.1,2.1


In [100]:
df.iloc[3]

Info
year       2015
names        Ch
points      2.4
penalty     0.4
debt       -1.5
Name: four, dtype: object

In [101]:
df.iloc[3:5, 0:2]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
four,2015.0,Ch
five,2016.0,Ch


In [104]:
df.iloc[[0, 1, 3], [1, 3]]

Info,names,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Ki,0.1
two,Ki,0.2
four,Ch,0.4


In [105]:
df.iloc[:, 1:4]

Info,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,Ki,1.5,0.1
two,Ki,1.7,0.2
three,Ki,3.6,0.3
four,Ch,2.4,0.4
five,Ch,2.9,0.5
six,Ha,4.0,0.1


In [106]:
df.iloc[1, 1]

'Ki'

In [107]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014.0,Ki,1.5,0.1,0.2
two,2015.0,Ki,1.7,0.2,-1.2
three,2016.0,Ki,3.6,0.3,-1.1
four,2015.0,Ch,2.4,0.4,-1.5
five,2016.0,Ch,2.9,0.5,-1.7
six,2013.0,Ha,4.0,0.1,2.1


In [114]:
df['year'] > 2014

Order
one      False
two       True
three     True
four      True
five      True
six      False
Name: year, dtype: bool

In [116]:
df.loc[df['year'] > 2014, :]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2015.0,Ki,1.7,0.2,-1.2
three,2016.0,Ki,3.6,0.3,-1.1
four,2015.0,Ch,2.4,0.4,-1.5
five,2016.0,Ch,2.9,0.5,-1.7


In [117]:
df.loc[df['names'] == 'Ki', ['names', 'points']]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Ki,1.5
two,Ki,1.7
three,Ki,3.6


In [119]:
df.loc[(df['points'] > 2) & (df['points'] < 3), :]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
four,2015.0,Ch,2.4,0.4,-1.5
five,2016.0,Ch,2.9,0.5,-1.7


In [120]:
df.loc[df['points'] > 3, 'penalty'] = 0

In [121]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014.0,Ki,1.5,0.1,0.2
two,2015.0,Ki,1.7,0.2,-1.2
three,2016.0,Ki,3.6,0.0,-1.1
four,2015.0,Ch,2.4,0.4,-1.5
five,2016.0,Ch,2.9,0.5,-1.7
six,2013.0,Ha,4.0,0.0,2.1
