## CHAPTER 2 Series 객체

### 2.1 Series의 개요 <hr>

In [1]:
import pandas as pd
import numpy as np

#### 2.1.1 클래스 및 인스턴스

In [2]:
pd.Series()

Series([], dtype: object)

#### 2.1.2 Series 값 채우기

In [10]:
# Passing the list positionally or through the `data` keyword yields the same Series.
ice_cream_flavors = ['Chocolate', 'Vanilla', 'Strawberry', 'Rum Raisin']
print(
    pd.Series(ice_cream_flavors),
    pd.Series(data=ice_cream_flavors),
    sep='\n\n',
)

0     Chocolate
1       Vanilla
2    Strawberry
3    Rum Raisin
dtype: object

0     Chocolate
1       Vanilla
2    Strawberry
3    Rum Raisin
dtype: object


#### 2.1.3 Series 인덱스 사용자 정의

In [12]:
days_of_week = ('Monday', 'Wednesday', 'Friday', 'Saturday')
print(pd.Series(ice_cream_flavors, days_of_week))
print()
print(pd.Series(data = ice_cream_flavors, index = days_of_week))

Monday        Chocolate
Wednesday       Vanilla
Friday       Strawberry
Saturday     Rum Raisin
dtype: object

Monday        Chocolate
Wednesday       Vanilla
Friday       Strawberry
Saturday     Rum Raisin
dtype: object


In [13]:
# Series의 인덱스는 중복을 허용한다.

days_of_week = ('Monday', 'Wednesday', 'Friday', 'Wednesday')
print(pd.Series(ice_cream_flavors, days_of_week))
print()
print(pd.Series(data = ice_cream_flavors, index = days_of_week))

Monday        Chocolate
Wednesday       Vanilla
Friday       Strawberry
Wednesday    Rum Raisin
dtype: object

Monday        Chocolate
Wednesday       Vanilla
Friday       Strawberry
Wednesday    Rum Raisin
dtype: object


In [14]:
pd.Series(index = days_of_week, data = ice_cream_flavors)

Monday        Chocolate
Wednesday       Vanilla
Friday       Strawberry
Wednesday    Rum Raisin
dtype: object

In [19]:
bunch_of_bools = [True, False, False]
print(pd.Series(bunch_of_bools))

print()

stock_prices = [985.32, 950.44]
time_of_day = ['open', 'Close']
print(pd.Series(data = stock_prices, index = time_of_day))

print()

lucky_numbers = [4, 8, 12, 16, 23, 42]
print(pd.Series(lucky_numbers))


0     True
1    False
2    False
dtype: bool

open     985.32
Close    950.44
dtype: float64

0     4
1     8
2    12
3    16
4    23
5    42
dtype: int64


In [20]:
lucky_numbers = [4, 8, 15, 16, 23, 42]
pd.Series(lucky_numbers, dtype = 'float')

0     4.0
1     8.0
2    15.0
3    16.0
4    23.0
5    42.0
dtype: float64

#### 2.1.4 결측값이 있는 Series 생성

In [21]:
temperatures = [94, 88, np.nan, 91]
pd.Series(data = temperatures)

0    94.0
1    88.0
2     NaN
3    91.0
dtype: float64

### 2.2 파이썬 객체에서 Series 생성 <hr>

In [23]:
# Build a Series from a dictionary: the keys become the index labels
# and the values become the Series data.
calorie_info = {
    'Cereal': 125,
    'Chocolate Bar': 406,
    'Ice Cream Sundae': 342,
}
diet = pd.Series(data=calorie_info)
diet

Cereal              125
Chocolate Bar       406
Ice Cream Sundae    342
dtype: int64

In [24]:
pd.Series(data = ('Red', 'Green', 'Blue'))

0      Red
1    Green
2     Blue
dtype: object

In [25]:
rgb_colors = [(120, 41, 26), (196, 165, 45)]
pd.Series(data = rgb_colors)

0     (120, 41, 26)
1    (196, 165, 45)
dtype: object

In [26]:
# 집합에는 리스트와 같은 순서 개념이나 딕셔너리와 같은 연관 개념이 없다.
# 따라서 판다스는 집합의 값을 저장하는 순서를 가정할 수 없다.

my_set = {'Ricky', 'Bobby'}
pd.Series(my_set)

TypeError: 'set' type is unordered

In [27]:
pd.Series(list(my_set))

0    Bobby
1    Ricky
dtype: object

In [30]:
random_data = np.random.randint(1, 101, 10)
print(random_data)
print(pd.Series(random_data))

[89 75 91 76 19  9 49 69 34 27]
0    89
1    75
2    91
3    76
4    19
5     9
6    49
7    69
8    34
9    27
dtype: int32


### 2.3 Series의 속성 <hr>

In [32]:
diet.values

array([125, 406, 342], dtype=int64)

In [33]:
type(diet.values)

numpy.ndarray

In [34]:
diet.index

Index(['Cereal', 'Chocolate Bar', 'Ice Cream Sundae'], dtype='object')

In [35]:
type(diet.index)

pandas.core.indexes.base.Index

In [37]:
diet.dtype

dtype('int64')

In [38]:
diet.size

3

In [39]:
diet.shape

(3,)

In [40]:
diet.is_unique

True

In [41]:
pd.Series(data = [3, 3]).is_unique

False

In [43]:
pd.Series(data = [1, 3, 5]).is_monotonic_increasing

True

In [44]:
pd.Series(data = [1, 6, 3]).is_monotonic_increasing

False

### 2.4 첫 번째 행과 마지막 행 검색 <hr>

In [46]:
values = range(0, 500, 5)
nums = pd.Series(data = values)
nums

0       0
1       5
2      10
3      15
4      20
     ... 
95    475
96    480
97    485
98    490
99    495
Length: 100, dtype: int64

In [48]:
nums.head(3)

0     0
1     5
2    10
dtype: int64

In [49]:
nums.head(n = 3)

0     0
1     5
2    10
dtype: int64

In [50]:
# head() 메서드의 n 매개변수는 기본 인수가 5이다.
nums.head()

0     0
1     5
2    10
3    15
4    20
dtype: int64

In [51]:
nums.tail(6)

94    470
95    475
96    480
97    485
98    490
99    495
dtype: int64

In [52]:
nums.tail()

95    475
96    480
97    485
98    490
99    495
dtype: int64

### 2.5 수학 연산 <hr>

#### 2.5.1 통계 연산

In [54]:
numbers = pd.Series([1, 2, 3, np.nan, 4, 5])
numbers

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    5.0
dtype: float64

In [55]:
numbers.count()

5

In [56]:
numbers.sum()

15.0

In [57]:
numbers.sum(skipna = False)

nan

In [58]:
numbers.sum(min_count = 3)

15.0

In [59]:
numbers.sum(min_count = 6)

nan

In [60]:
numbers.product()

120.0

In [61]:
numbers.product(skipna = False)

nan

In [62]:
numbers.product(min_count = 3)

120.0

In [65]:
print(numbers)
print()
print(numbers.cumsum())

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    5.0
dtype: float64

0     1.0
1     3.0
2     6.0
3     NaN
4    10.0
5    15.0
dtype: float64


In [66]:
numbers.cumsum(skipna = False)

0    1.0
1    3.0
2    6.0
3    NaN
4    NaN
5    NaN
dtype: float64

In [68]:
# Show the original values, then the percent change between consecutive rows.
# The missing value is forward-filled explicitly: relying on pct_change's
# implicit fill_method default is deprecated since pandas 2.1.
print(numbers)
print()
print(numbers.ffill().pct_change(fill_method = None))

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    5.0
dtype: float64

0         NaN
1    1.000000
2    0.500000
3    0.000000
4    0.333333
5    0.250000
dtype: float64


  print(numbers.pct_change())


In [69]:
# Forward-fill the missing value, then compute the percent change.
# This gives the same result as the former pct_change(),
# pct_change(fill_method = 'pad') and pct_change(fill_method = 'ffill') calls,
# whose fill_method values are deprecated since pandas 2.1.
numbers.ffill().pct_change(fill_method = None)

  numbers.pct_change()
  numbers.pct_change(fill_method = 'pad')
  numbers.pct_change(fill_method = 'ffill')


0         NaN
1    1.000000
2    0.500000
3    0.000000
4    0.333333
5    0.250000
dtype: float64

In [70]:
# Backward-fill the missing value, then compute the percent change.
# This gives the same result as the former pct_change(fill_method = 'bfill')
# and pct_change(fill_method = 'backfill') calls, whose fill_method values
# are deprecated since pandas 2.1.
numbers.bfill().pct_change(fill_method = None)

  numbers.pct_change(fill_method = 'bfill')
  numbers.pct_change(fill_method = 'backfill')


0         NaN
1    1.000000
2    0.500000
3    0.333333
4    0.000000
5    0.250000
dtype: float64

In [71]:
numbers.mean()

3.0

In [72]:
numbers.median()

3.0

In [73]:
numbers.std()

1.5811388300841898

In [76]:
print(numbers.max())
print(numbers.min())

5.0
1.0


In [77]:
numbers.describe()

count    5.000000
mean     3.000000
std      1.581139
min      1.000000
25%      2.000000
50%      3.000000
75%      4.000000
max      5.000000
dtype: float64

In [78]:
numbers.sample(3)

4    4.0
3    NaN
5    5.0
dtype: float64

In [80]:
authors = pd.Series(['Hemingway', 'Orwell', 'Dostoevsky', 'Fitzgerald', 'Orwell'])
authors.unique()

array(['Hemingway', 'Orwell', 'Dostoevsky', 'Fitzgerald'], dtype=object)

In [81]:
authors.nunique()

4

#### 2.5.2 산술 연산 

In [83]:
s1 = pd.Series(data = [5, np.nan, 15], index = ['A', 'B', 'C'])
s1

A     5.0
B     NaN
C    15.0
dtype: float64

In [84]:
s1 + 3

A     8.0
B     NaN
C    18.0
dtype: float64

In [85]:
s1.add(3)

A     8.0
B     NaN
C    18.0
dtype: float64

In [89]:
# 아래 세 줄은 결과가 동일하다.
print(s1 - 5)
print()
print(s1.sub(5))
print()
print(s1.subtract(5))
print()

# 아래 세 줄은 결과가 동일하다.
print(s1 * 2)
print()
print(s1.mul(2))
print()
print(s1.multiply(2))
print()

# 아래 세 줄은 결과가 동일하다.
print(s1 / 2)
print()
print(s1.div(2))
print()
print(s1.divide(2))



A     0.0
B     NaN
C    10.0
dtype: float64

A     0.0
B     NaN
C    10.0
dtype: float64

A     0.0
B     NaN
C    10.0
dtype: float64

A    10.0
B     NaN
C    30.0
dtype: float64

A    10.0
B     NaN
C    30.0
dtype: float64

A    10.0
B     NaN
C    30.0
dtype: float64

A    2.5
B    NaN
C    7.5
dtype: float64

A    2.5
B    NaN
C    7.5
dtype: float64

A    2.5
B    NaN
C    7.5
dtype: float64


In [90]:
# 다음 두 줄은 결과가 동일하다.
print(s1 // 4)
print()
print(s1.floordiv(4))

A    1.0
B    NaN
C    3.0
dtype: float64
A    1.0
B    NaN
C    3.0
dtype: float64


In [91]:
# 다음 두 줄은 결과가 동일하다.
print(s1 % 3)
print(s1.mod(3))


A    2.0
B    NaN
C    0.0
dtype: float64
A    2.0
B    NaN
C    0.0
dtype: float64


#### 2.5.3 브로드캐스팅

In [92]:
s1 = pd.Series([1, 2, 3], index = ['A', 'B', 'C'])
s2 = pd.Series([4, 5, 6], index = ['A', 'B', 'C'])

In [96]:
s1 + s2

A    5
B    7
C    9
dtype: int64

In [97]:
s1 = pd.Series(data = [3, 6, np.nan, 12])
s2 = pd.Series(data = [2, 6, np.nan, 12])

In [98]:
# 다음 두 줄은 결과가 동일하다.
s1 == s2
s1.eq(s2)

0    False
1     True
2    False
3     True
dtype: bool

In [100]:
# 다음 두 줄은 결과가 동일하다.
s1 != s2
s1.ne(s2)

0     True
1    False
2     True
3    False
dtype: bool

In [101]:
s1 = pd.Series(data = [5, 10, 15], index = ['A', 'B', 'C'])
s2 = pd.Series(data = [4, 8, 12, 14], index = ['B', 'C', 'D', 'E'])

In [102]:
s1 + s2

A     NaN
B    14.0
C    23.0
D     NaN
E     NaN
dtype: float64

### 2.6 Series를 파이썬의 내장 함수에 전달 <hr>

In [127]:
cities = pd.Series(data = ['San Francisco', 'Los Angeles', 'Las Vegas', np.nan])

In [128]:
len(cities)

4

In [129]:
type(cities)

pandas.core.series.Series

In [130]:
dir(cities)

['T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__bool__',
 '__class__',
 '__column_consortium_standard__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pandas_priority__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__

In [131]:
list(cities)

['San Francisco', 'Los Angeles', 'Las Vegas', nan]

In [132]:
dict(cities)

{0: 'San Francisco', 1: 'Los Angeles', 2: 'Las Vegas', 3: nan}

In [133]:
cities

0    San Francisco
1      Los Angeles
2        Las Vegas
3              NaN
dtype: object

In [134]:
# in 연산자는 Series의 값이 아니라 인덱스에서 검색한다.

print('Las Vegas' in cities)
print(2 in cities)

False
True


In [135]:
'Las Vegas' in cities.values

True

In [126]:
print(100 not in cities)
print('Paris' not in cities.values)

True
True


### 2.7 코딩 챌린지 <hr>

In [8]:
superheroes = ['Batman',
               'Superaman',
               'Spider-Man',
               'Iron Man',
               'Captain America',
               'Wonder Woman']

strength_levels = (100, 120, 90, 95, 110, 120)

In [9]:
# 1. 슈퍼 히어로(superheroes) 리스트를 사용하여 새 Series 객체의 값을 채우세요.

pd.Series(superheroes)

0             Batman
1          Superaman
2         Spider-Man
3           Iron Man
4    Captain America
5       Wonder Woman
dtype: object

In [10]:
# 2. 능력치(strength_levels) 튜플을 사용하여 새 Series 객체의 값을 채우세요.

pd.Series(strength_levels)

0    100
1    120
2     90
3     95
4    110
5    120
dtype: int64

In [11]:
# 3. superheroes를 인덱스 레이블로 사용하고 strength_levels를 값으로 사용하여 Series를 생성하세요. heroes 변수에 Series를 할당하세요.
heroes = pd.Series(strength_levels, index = superheroes)

In [13]:
# 4. heroes Series의 처음 2개 행을 추출하세요.
heroes.head(2)

Batman       100
Superaman    120
dtype: int64

In [18]:
# 5. heroes Series의 마지막 4개의 행을 추출하세요.
heroes.tail(4)

Spider-Man          90
Iron Man            95
Captain America    110
Wonder Woman       120
dtype: int64

In [15]:
# 6. heroes Series에 있는 고유한 값의 개수를 구하세요.
heroes.nunique()

5

In [17]:
# 7. heroes에 있는 슈퍼 히어로의 평균 능력치를 구하세요.
heroes.mean()

105.83333333333333

In [21]:
# 8. heroes의 최대 및 최소 능력치를 구하세요.
print(f'max : {heroes.max()}\nmin : {heroes.min()}')

max : 120
min : 90


In [22]:
# 9. 능력치가 2배가 되면 각 슈퍼 히어로의 능력치는 얼마인지 구하세요.
heroes * 2

Batman             200
Superaman          240
Spider-Man         180
Iron Man           190
Captain America    220
Wonder Woman       240
dtype: int64

In [23]:
# 10. heroes Series를 파이썬 딕셔너리로 변환하세요.
dict(heroes)

{'Batman': 100,
 'Superaman': 120,
 'Spider-Man': 90,
 'Iron Man': 95,
 'Captain America': 110,
 'Wonder Woman': 120}

## CHAPTER 4 DataFrame 객체

### 4.1 DataFrame의 개요 <hr>

In [129]:
import pandas as pd
import numpy as np

#### 4.1.1 딕셔너리에서 DataFrame 생성

In [130]:
city_data = {'City' : ['New York City', 'Paris', 'Barcelona', 'Rome'],
             'Country' : ['United States', 'France', 'Spain', 'Italy'],
             'Population' : pd.Series([8600000, 2141000, 5515000, 2873000])}
cities = pd.DataFrame(city_data)
cities

Unnamed: 0,City,Country,Population
0,New York City,United States,8600000
1,Paris,France,2141000
2,Barcelona,Spain,5515000
3,Rome,Italy,2873000


In [131]:
# 다음 두 줄은 결과가 동일하다.
cities.transpose()
cities.T

Unnamed: 0,0,1,2,3
City,New York City,Paris,Barcelona,Rome
Country,United States,France,Spain,Italy
Population,8600000,2141000,5515000,2873000


#### 4.1.2 넘파이 ndarray로 DataFrame 생성

In [132]:
random_data = np.random.randint(1, 101, [3, 5])
random_data

array([[ 89,  52, 100,  54,  84],
       [ 63,  84,  81,  12,  58],
       [ 94,  97,   1,  73,  56]])

In [133]:
pd.DataFrame(random_data)

Unnamed: 0,0,1,2,3,4
0,89,52,100,54,84
1,63,84,81,12,58
2,94,97,1,73,56


In [134]:
row_labels = ['Morning', 'Afternoon', 'Evening']
temperatures = pd.DataFrame(data = random_data, index = row_labels)
temperatures

Unnamed: 0,0,1,2,3,4
Morning,89,52,100,54,84
Afternoon,63,84,81,12,58
Evening,94,97,1,73,56


In [135]:
column_labels = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')
pd.DataFrame(data = random_data, index = row_labels, columns = column_labels)

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday
Morning,89,52,100,54,84
Afternoon,63,84,81,12,58
Evening,94,97,1,73,56


In [136]:
# 판다스는 중복된 행과 열 인덱스를 허용한다.
row_labels = ['Morning', 'Afternoon', 'Morning']
column_labels = ['Monday', 'Tuesday', 'Wednesday', 'Tuesday', 'Friday']
pd.DataFrame(data = random_data,
             index = row_labels,
             columns = column_labels)

Unnamed: 0,Monday,Tuesday,Wednesday,Tuesday.1,Friday
Morning,89,52,100,54,84
Afternoon,63,84,81,12,58
Morning,94,97,1,73,56


### 4.2 Series와 DataFrame의 유사점 <hr>

#### 4.2.1 read_csv 함수로 DataFrame 가져오기

In [137]:
filename = '../DATA/nba.csv'
pd.read_csv(filename)

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357
2,PJ Washington,Charlotte Hornets,PF,8/23/98,3831840
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,1/4/99,4764960


In [138]:
# Birthday values look like 9/26/96; passing an explicit date_format avoids
# pandas' "Could not infer format" warning and its per-element parsing fallback.
pd.read_csv(filename, parse_dates = ['Birthday'], date_format = '%m/%d/%y')

  pd.read_csv(filename, parse_dates = ['Birthday'])


Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,1992-08-01,2174310
446,Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
447,Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960


In [139]:
# Load the dataset with Birthday parsed as datetimes. The explicit date_format
# (values look like 9/26/96) avoids pandas' "Could not infer format" warning.
nba = pd.read_csv(filename, parse_dates = ['Birthday'], date_format = '%m/%d/%y')

  nba = pd.read_csv(filename, parse_dates = ['Birthday'])


#### 4.2.2 Series와 DataFrame 속성의 유사점과 차이점

In [140]:
# Series는 하나의 데이터 유형만 저장할 수 있다.

pd.Series([1, 2, 3]).dtype

dtype('int64')

In [141]:
# DataFrame은 여러 유형의 데이터를 담을 수 있다.
nba.dtypes

Name                object
Team                object
Position            object
Birthday    datetime64[ns]
Salary               int64
dtype: object

In [142]:
nba.dtypes.value_counts()

object            3
datetime64[ns]    1
int64             1
Name: count, dtype: int64

In [143]:
# index 속성은 DataFrame의 인덱스를 나타낸다.
nba.index

RangeIndex(start=0, stop=450, step=1)

In [144]:
nba.columns

Index(['Name', 'Team', 'Position', 'Birthday', 'Salary'], dtype='object')

In [145]:
nba.ndim

2

In [146]:
nba.shape

(450, 5)

In [147]:
# 데이터셋에 있는 값의 전체 개수를 반환한다. NaN과 같은 결측값도 개수에 포함된다.
nba.size

2250

In [148]:
# 결측값을 제외하고 싶다면 count 메서드를 사용한다.
nba.count()

Name        450
Team        450
Position    450
Birthday    450
Salary      450
dtype: int64

In [149]:
# sum 메서드를 사용하여 Series에 있는 결측값이 아닌 모든 값의 개수를 더할 수 있다.
# nba DataFrame 데이터셋은 결측값이 없기 때문에 size 속성과 sum 메서드가 동일한 결과를 반환한다.
nba.count().sum()

2250

In [150]:
data = {'A' : [1, np.nan], 'B' : [2, 3]}
df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,1.0,2
1,,3


In [151]:
df.size

4

In [152]:
print(df.count())
print()
print(df.count().sum())

A    1
B    2
dtype: int64

3


#### 4.2.3 Series와 DataFrame의 공통 메서드

In [153]:
nba.head(2)

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357


In [154]:
nba.tail(n = 3)

Unnamed: 0,Name,Team,Position,Birthday,Salary
447,Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960
449,Ricky Rubio,Phoenix Suns,PG,1990-10-21,16200000


In [155]:
# 위의 두 메서드는 주어진 인수가 없으면 기본적으로 5개의 행을 반환한다.
nba.tail()

Unnamed: 0,Name,Team,Position,Birthday,Salary
445,Austin Rivers,Houston Rockets,PG,1992-08-01,2174310
446,Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
447,Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960
449,Ricky Rubio,Phoenix Suns,PG,1990-10-21,16200000


In [156]:
# sample 메서드는 DataFrame에서 임의의 행을 추출한다.
nba.sample(3)

Unnamed: 0,Name,Team,Position,Birthday,Salary
16,Dwight Howard,Los Angeles Lakers,C,1985-12-08,5603850
67,Jarrett Culver,Minnesota Timberwolves,SG,1999-02-20,5813640
414,Sterling Brown,Milwaukee Bucks,SF,1995-02-10,1618520


In [157]:
nba.nunique()

Name        450
Team         30
Position      9
Birthday    430
Salary      269
dtype: int64

In [158]:
nba.max()

Name             Zylan Cheatham
Team         Washington Wizards
Position                     SG
Birthday    2000-12-23 00:00:00
Salary                 40231758
dtype: object

In [159]:
nba.min()

Name               Aaron Gordon
Team              Atlanta Hawks
Position                      C
Birthday    1977-01-26 00:00:00
Salary                    79568
dtype: object

In [160]:
nba.nlargest(n = 4, columns = 'Salary')

Unnamed: 0,Name,Team,Position,Birthday,Salary
205,Stephen Curry,Golden State Warriors,PG,1988-03-14,40231758
38,Chris Paul,Oklahoma City Thunder,PG,1985-05-06,38506482
219,Russell Westbrook,Houston Rockets,PG,1988-11-12,38506482
251,John Wall,Washington Wizards,PG,1990-09-06,38199000


In [161]:
nba.nsmallest(n = 3, columns = ['Birthday'])

Unnamed: 0,Name,Team,Position,Birthday,Salary
98,Vince Carter,Atlanta Hawks,PF,1977-01-26,2564753
196,Udonis Haslem,Miami Heat,C,1980-06-09,2564753
262,Kyle Korver,Milwaukee Bucks,PF,1981-03-17,6004753


In [162]:
nba.sum()

TypeError: 'DatetimeArray' with dtype datetime64[ns] does not support reduction 'sum'

In [163]:
nba1 = nba.drop(['Birthday'], axis = 1)
nba1

Unnamed: 0,Name,Team,Position,Salary
0,Shake Milton,Philadelphia 76ers,SG,1445697
1,Christian Wood,Detroit Pistons,PF,1645357
2,PJ Washington,Charlotte Hornets,PF,3831840
3,Derrick Rose,Detroit Pistons,PG,7317074
4,Marial Shayok,Philadelphia 76ers,G,79568
...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,2174310
446,Harry Giles,Sacramento Kings,PF,2578800
447,Robin Lopez,Milwaukee Bucks,C,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,4764960


In [164]:
nba1.sum()

Name        Shake MiltonChristian WoodPJ WashingtonDerrick...
Team        Philadelphia 76ersDetroit PistonsCharlotte Hor...
Position    SGPFPFPGGPFSGSFCSFPGPGFCPGSGPFCCPFPFSGPFPGSGSF...
Salary                                             3444112694
dtype: object

In [165]:
nba1.sum(numeric_only = True)

Salary    3444112694
dtype: int64

In [166]:
nba1.mean(numeric_only = True)

Salary    7.653584e+06
dtype: float64

In [167]:
print(nba1.median(numeric_only = True))
print()
print(nba1.mode(numeric_only = True))
print()
print(nba1.std(numeric_only = True))

Salary    3303074.5
dtype: float64

   Salary
0   79568

Salary    9.288810e+06
dtype: float64


### 4.3 DataFrame 정렬

#### 4.3.1 단일 열 기준으로 정렬 <hr>

In [168]:
# 다음 두 줄은 결과가 동일하다.
nba.sort_values('Name')
nba.sort_values(by = 'Name')

Unnamed: 0,Name,Team,Position,Birthday,Salary
52,Aaron Gordon,Orlando Magic,PF,1995-09-16,19863636
101,Aaron Holiday,Indiana Pacers,PG,1996-09-30,2239200
437,Abdel Nader,Oklahoma City Thunder,SF,1993-09-25,1618520
81,Adam Mokoka,Chicago Bulls,G,1998-07-18,79568
399,Admiral Schofield,Washington Wizards,SF,1997-03-30,1000000
...,...,...,...,...,...
159,Zach LaVine,Chicago Bulls,PG,1995-03-10,19500000
302,Zach Norvell,Los Angeles Lakers,SG,1997-12-09,79568
312,Zhaire Smith,Philadelphia 76ers,SG,1999-06-04,3058800
137,Zion Williamson,New Orleans Pelicans,F,2000-07-06,9757440


In [169]:
nba.sort_values('Name', ascending = False).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
248,Zylan Cheatham,New Orleans Pelicans,SF,1995-11-17,79568
137,Zion Williamson,New Orleans Pelicans,F,2000-07-06,9757440
312,Zhaire Smith,Philadelphia 76ers,SG,1999-06-04,3058800
302,Zach Norvell,Los Angeles Lakers,SG,1997-12-09,79568
159,Zach LaVine,Chicago Bulls,PG,1995-03-10,19500000


In [170]:
nba.sort_values('Birthday', ascending = False).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
136,Sekou Doumbouya,Detroit Pistons,SF,2000-12-23,3285120
432,Talen Horton-Tucker,Los Angeles Lakers,GF,2000-11-25,898310
137,Zion Williamson,New Orleans Pelicans,F,2000-07-06,9757440
313,RJ Barrett,New York Knicks,SG,2000-06-14,7839960
392,Jalen Lecque,Phoenix Suns,G,2000-06-13,898310


#### 4.3.2 다중 열 기준으로 정렬

In [171]:
# 판다스는 기본적으로 모든 열을 오름차순으로 정렬한다.

nba.sort_values(by = ['Team', 'Name'])

Unnamed: 0,Name,Team,Position,Birthday,Salary
359,Alex Len,Atlanta Hawks,C,1993-06-16,4160000
167,Allen Crabbe,Atlanta Hawks,SG,1992-04-09,18500000
276,Brandon Goodwin,Atlanta Hawks,PG,1995-10-02,79568
438,Bruno Fernando,Atlanta Hawks,C,1998-08-15,1400000
194,Cam Reddish,Atlanta Hawks,SF,1999-09-01,4245720
...,...,...,...,...,...
418,Jordan McRae,Washington Wizards,PG,1991-03-28,1645357
273,Justin Robinson,Washington Wizards,PG,1997-10-12,898310
428,Moritz Wagner,Washington Wizards,C,1997-04-26,2063520
21,Rui Hachimura,Washington Wizards,PF,1998-02-08,4469160


In [172]:
nba.sort_values(['Team', 'Name'], ascending = False)

Unnamed: 0,Name,Team,Position,Birthday,Salary
36,Thomas Bryant,Washington Wizards,C,1997-07-31,8000000
21,Rui Hachimura,Washington Wizards,PF,1998-02-08,4469160
428,Moritz Wagner,Washington Wizards,C,1997-04-26,2063520
273,Justin Robinson,Washington Wizards,PG,1997-10-12,898310
418,Jordan McRae,Washington Wizards,PG,1991-03-28,1645357
...,...,...,...,...,...
194,Cam Reddish,Atlanta Hawks,SF,1999-09-01,4245720
438,Bruno Fernando,Atlanta Hawks,C,1998-08-15,1400000
276,Brandon Goodwin,Atlanta Hawks,PG,1995-10-02,79568
167,Allen Crabbe,Atlanta Hawks,SG,1992-04-09,18500000


In [173]:
nba.sort_values(by = ['Team', 'Salary'], ascending = [True, False])

Unnamed: 0,Name,Team,Position,Birthday,Salary
111,Chandler Parsons,Atlanta Hawks,SF,1988-10-25,25102512
28,Evan Turner,Atlanta Hawks,PG,1988-10-27,18606556
167,Allen Crabbe,Atlanta Hawks,SG,1992-04-09,18500000
213,De'Andre Hunter,Atlanta Hawks,SF,1997-12-02,7068360
339,Jabari Parker,Atlanta Hawks,PF,1995-03-15,6500000
...,...,...,...,...,...
80,Isaac Bonga,Washington Wizards,PG,1999-11-08,1416852
399,Admiral Schofield,Washington Wizards,SF,1997-03-30,1000000
273,Justin Robinson,Washington Wizards,PG,1997-10-12,898310
283,Garrison Mathews,Washington Wizards,SG,1996-10-24,79568


In [174]:
nba= nba.sort_values(by = ['Team', 'Salary'], ascending = [True, False])

In [175]:
nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
111,Chandler Parsons,Atlanta Hawks,SF,1988-10-25,25102512
28,Evan Turner,Atlanta Hawks,PG,1988-10-27,18606556
167,Allen Crabbe,Atlanta Hawks,SG,1992-04-09,18500000
213,De'Andre Hunter,Atlanta Hawks,SF,1997-12-02,7068360
339,Jabari Parker,Atlanta Hawks,PF,1995-03-15,6500000


In [176]:
# 다음 두 줄은 결과가 동일하다.
# nba DataFrame에는 여전히 숫자 인덱스가 있다. 
# 열 값이 아닌 인덱스 위치를 기준으로 데이터셋을 정렬할 수 있다면 처음 가져온 상태로 되돌릴 수 있다.
nba.sort_index().head()
nba.sort_index(ascending = True).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


In [177]:
nba.sort_index(ascending = False).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
449,Ricky Rubio,Phoenix Suns,PG,1990-10-21,16200000
448,Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960
447,Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
446,Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
445,Austin Rivers,Houston Rockets,PG,1992-08-01,2174310


In [178]:
# 인덱스 위치를 기준으로 DataFrame을 정렬하여 처음 가져온 상태로 되돌린다.
nba = nba.sort_index()

#### 4.4.2 열 인덱스 기준으로 정렬

In [179]:
# 다음 두 줄은 결과가 동일하다.
nba.sort_index(axis = 'columns').head()
nba.sort_index(axis = 1).head()

Unnamed: 0,Birthday,Name,Position,Salary,Team
0,1996-09-26,Shake Milton,SG,1445697,Philadelphia 76ers
1,1995-09-27,Christian Wood,PF,1645357,Detroit Pistons
2,1998-08-23,PJ Washington,PF,3831840,Charlotte Hornets
3,1988-10-04,Derrick Rose,PG,7317074,Detroit Pistons
4,1995-07-26,Marial Shayok,G,79568,Philadelphia 76ers


In [180]:
nba.sort_index(axis = 'columns', ascending = False).head()

Unnamed: 0,Team,Salary,Position,Name,Birthday
0,Philadelphia 76ers,1445697,SG,Shake Milton,1996-09-26
1,Detroit Pistons,1645357,PF,Christian Wood,1995-09-27
2,Charlotte Hornets,3831840,PF,PJ Washington,1998-08-23
3,Detroit Pistons,7317074,PG,Derrick Rose,1988-10-04
4,Philadelphia 76ers,79568,G,Marial Shayok,1995-07-26


### 4.5 새 인덱스 설정 <hr>

In [181]:
# 다음 두 줄은 결과가 동일하다.
nba.set_index(keys = 'Name')
nba.set_index('Name')

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568
...,...,...,...,...
Austin Rivers,Houston Rockets,PG,1992-08-01,2174310
Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960


In [182]:
nba = nba.set_index(keys = 'Name')

In [183]:
# The following produces the same DataFrame as the previous example:
# index_col makes the Name column the index while the file is read.
# The explicit date_format (values look like 9/26/96) avoids pandas'
# "Could not infer format" warning when parsing Birthday.
nba = pd.read_csv(filename, parse_dates = ['Birthday'], date_format = '%m/%d/%y', index_col = 'Name')

  nba = pd.read_csv(filename, parse_dates = ['Birthday'], index_col = 'Name')


### 4.6 DataFrame에서 열과 행 선택

#### 4.6.1 DataFrame에서 단일 열 선택

In [184]:
nba.Salary

Name
Shake Milton       1445697
Christian Wood     1645357
PJ Washington      3831840
Derrick Rose       7317074
Marial Shayok        79568
                    ...   
Austin Rivers      2174310
Harry Giles        2578800
Robin Lopez        4767000
Collin Sexton      4764960
Ricky Rubio       16200000
Name: Salary, Length: 450, dtype: int64

In [185]:
nba['Position']

Name
Shake Milton      SG
Christian Wood    PF
PJ Washington     PF
Derrick Rose      PG
Marial Shayok      G
                  ..
Austin Rivers     PG
Harry Giles       PF
Robin Lopez        C
Collin Sexton     PG
Ricky Rubio       PG
Name: Position, Length: 450, dtype: object

In [186]:
# 대괄호 구문의 장점은 공백이 있는 열 이름을 지원한다는 것이다.
# nba['Player Position']
# nba.player Position ==> 실행 안 됨.

#### 4.6.2 DataFrame에서 다중 열 선택

In [187]:
nba[['Salary', 'Birthday']].head()

Unnamed: 0_level_0,Salary,Birthday
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Shake Milton,1445697,1996-09-26
Christian Wood,1645357,1995-09-27
PJ Washington,3831840,1998-08-23
Derrick Rose,7317074,1988-10-04
Marial Shayok,79568,1995-07-26


In [188]:
# 판다스는 리스트의 순서에 따라 열을 추출한다.
nba[['Birthday', 'Salary']].head()

Unnamed: 0_level_0,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Shake Milton,1996-09-26,1445697
Christian Wood,1995-09-27,1645357
PJ Washington,1998-08-23,3831840
Derrick Rose,1988-10-04,7317074
Marial Shayok,1995-07-26,79568


In [189]:
nba.select_dtypes(include = 'object')

Unnamed: 0_level_0,Team,Position
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Shake Milton,Philadelphia 76ers,SG
Christian Wood,Detroit Pistons,PF
PJ Washington,Charlotte Hornets,PF
Derrick Rose,Detroit Pistons,PG
Marial Shayok,Philadelphia 76ers,G
...,...,...
Austin Rivers,Houston Rockets,PG
Harry Giles,Sacramento Kings,PF
Robin Lopez,Milwaukee Bucks,C
Collin Sexton,Cleveland Cavaliers,PG


In [190]:
nba.select_dtypes(exclude = ['object', 'int'])

Unnamed: 0_level_0,Birthday
Name,Unnamed: 1_level_1
Shake Milton,1996-09-26
Christian Wood,1995-09-27
PJ Washington,1998-08-23
Derrick Rose,1988-10-04
Marial Shayok,1995-07-26
...,...
Austin Rivers,1992-08-01
Harry Giles,1998-04-22
Robin Lopez,1988-04-01
Collin Sexton,1999-01-04


In [191]:
nba.select_dtypes(exclude = ['datetime'])

Unnamed: 0_level_0,Team,Position,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Shake Milton,Philadelphia 76ers,SG,1445697
Christian Wood,Detroit Pistons,PF,1645357
PJ Washington,Charlotte Hornets,PF,3831840
Derrick Rose,Detroit Pistons,PG,7317074
Marial Shayok,Philadelphia 76ers,G,79568
...,...,...,...
Austin Rivers,Houston Rockets,PG,2174310
Harry Giles,Sacramento Kings,PF,2578800
Robin Lopez,Milwaukee Bucks,C,4767000
Collin Sexton,Cleveland Cavaliers,PG,4764960


### 4.7 DataFrame에서 행 선택

#### 4.7.1 인덱스 레이블로 행 추출

In [192]:
nba.loc['LeBron James']

Team         Los Angeles Lakers
Position                     PF
Birthday    1984-12-30 00:00:00
Salary                 37436858
Name: LeBron James, dtype: object

In [193]:
# 여러 행을 추출하려면 대괄호 사이에 리스트를 전달한다.
nba.loc[['Kawhi Leonard', 'Paul George']]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Kawhi Leonard,Los Angeles Clippers,SF,1991-06-29,32742000
Paul George,Los Angeles Clippers,SF,1990-05-02,33005556


In [194]:
nba.loc[['Paul George', 'Kawhi Leonard']]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Paul George,Los Angeles Clippers,SF,1990-05-02,33005556
Kawhi Leonard,Los Angeles Clippers,SF,1991-06-29,32742000


In [195]:
# loc을 사용하여 인덱스 레이블 시퀀스를 추출하려면 먼저 인덱스를 정렬하는 것이 좋다.
# 왜냐하면 판다스가 값을 찾는 속도를 가속화하기 때문이다.
nba.sort_index().loc['Otto Porter' : 'Patrick Beverley']

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Otto Porter,Chicago Bulls,SF,1993-06-03,27250576
PJ Dozier,Denver Nuggets,PG,1996-10-25,79568
PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
Pascal Siakam,Toronto Raptors,PF,1994-04-02,2351838
Pat Connaughton,Milwaukee Bucks,SG,1993-01-06,1723050
Patrick Beverley,Los Angeles Clippers,PG,1988-07-12,12345680


In [196]:
players = ['Otto Porter', 'pJ Dozier', 'PJ Washington']
players[:2]

['Otto Porter', 'pJ Dozier']

In [197]:
nba.sort_index().loc['Zach Collins':]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Zach Collins,Portland Trail Blazers,C,1997-11-19,4240200
Zach LaVine,Chicago Bulls,PG,1995-03-10,19500000
Zach Norvell,Los Angeles Lakers,SG,1997-12-09,79568
Zhaire Smith,Philadelphia 76ers,SG,1999-06-04,3058800
Zion Williamson,New Orleans Pelicans,F,2000-07-06,9757440
Zylan Cheatham,New Orleans Pelicans,SF,1995-11-17,79568


In [198]:
nba.sort_index().loc[:'Al Horford']

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaron Gordon,Orlando Magic,PF,1995-09-16,19863636
Aaron Holiday,Indiana Pacers,PG,1996-09-30,2239200
Abdel Nader,Oklahoma City Thunder,SF,1993-09-25,1618520
Adam Mokoka,Chicago Bulls,G,1998-07-18,79568
Admiral Schofield,Washington Wizards,SF,1997-03-30,1000000
Al Horford,Philadelphia 76ers,C,1986-06-03,28000000


In [199]:
# 판다스는 인덱스 레이블이 DataFrame에 존재하지 않으면 KeyError 예외를 발생시킨다.
nba.loc['Bugs Bunny']

KeyError: 'Bugs Bunny'

#### 4.7.2 인덱스 위치로 행 추출

In [200]:
nba.iloc[300]

Team             Denver Nuggets
Position                     PF
Birthday    1999-04-03 00:00:00
Salary                  1416852
Name: Jarred Vanderbilt, dtype: object

In [201]:
nba.iloc[[100, 200, 300, 400]]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Brian Bowen,Indiana Pacers,SG,1998-10-02,79568
Marco Belinelli,San Antonio Spurs,SF,1986-03-25,5846154
Jarred Vanderbilt,Denver Nuggets,PF,1999-04-03,1416852
Louis King,Detroit Pistons,F,1999-04-06,79568


In [202]:
# 인덱스 404에 있는 행은 제외한다.
nba.iloc[400:404]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Louis King,Detroit Pistons,F,1999-04-06,79568
Kostas Antetokounmpo,Los Angeles Lakers,PF,1997-11-20,79568
Rodions Kurucs,Brooklyn Nets,PF,1998-02-05,1699236
Spencer Dinwiddie,Brooklyn Nets,PG,1993-04-06,10605600


In [203]:
nba.iloc[:2]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
Christian Wood,Detroit Pistons,PF,1995-09-27,1645357


In [204]:
nba.iloc[447:]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960
Ricky Rubio,Phoenix Suns,PG,1990-10-21,16200000


In [205]:
nba.iloc[-10:-6]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jared Dudley,Los Angeles Lakers,PF,1985-07-10,2564753
Max Strus,Chicago Bulls,SG,1996-03-28,79568
Kevon Looney,Golden State Warriors,C,1996-02-06,4464286
Willy Hernangomez,Charlotte Hornets,C,1994-05-27,1557250


In [206]:
nba.iloc[0:10:2]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568
Kendrick Nunn,Miami Heat,SG,1995-08-03,1416852
Brook Lopez,Milwaukee Bucks,C,1988-04-01,12093024


#### 4.7.3 특정 열에서 값 추출

In [207]:
nba.loc['Giannis Antetokounmpo', 'Team']

'Milwaukee Bucks'

In [208]:
# 판다스는 Series를 반환한다.
nba.loc['James Harden', ['Position', 'Birthday']]

Position                     PG
Birthday    1989-08-26 00:00:00
Name: James Harden, dtype: object

In [209]:
nba.loc[['Russell Westbrook', 'Anthony Davis'], ['Team', 'Salary']]

Unnamed: 0_level_0,Team,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Russell Westbrook,Houston Rockets,38506482
Anthony Davis,Los Angeles Lakers,27093019


In [210]:
nba.loc['Joel Embiid', 'Position' : 'Salary']

Position                      C
Birthday    1994-03-16 00:00:00
Salary                 27504630
Name: Joel Embiid, dtype: object

In [211]:
# DataFrame에 표시되는 순서에 맞춰 열 이름을 전달해야 한다.
# 그렇지 않으면 빈 결과를 반환한다.
nba.loc['Joel Embiid', 'Salary' : 'Position']

Series([], Name: Joel Embiid, dtype: object)

In [212]:
nba.iloc[57, 3]

796806

In [213]:
nba.iloc[100:104, :3]

Unnamed: 0_level_0,Team,Position,Birthday
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brian Bowen,Indiana Pacers,SG,1998-10-02
Aaron Holiday,Indiana Pacers,PG,1996-09-30
Troy Daniels,Los Angeles Lakers,SG,1991-07-15
Buddy Hield,Sacramento Kings,SG,1992-12-17


In [214]:
# 판다스는 단일 값을 찾을 때 검색 알고리즘을 최적화할 수 있기 때문에 at과 iat가 loc나 iloc보다 빠르다.
# at 속성은 행 및 열 레이블을 허용한다.
nba.at['Austin Rivers', 'Birthday']

Timestamp('1992-08-01 00:00:00')

In [215]:
# iat 속성은 행과 열 인덱스를 허용한다.
nba.iat[263, 1]

'PF'

In [216]:
# %%timeit은 셀에서 코드를 실행하고 실행하는데 걸리는 평균 시간을 계산하는 메서드
%%timeit
nba.at['Austin Rivers', 'Birthday']

UsageError: Line magic function `%%timeit` not found.


In [217]:
%%timeit
nba.loc['Austin Rivers', 'Birthday']

16.4 µs ± 317 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [218]:
%%timeit
nba.iat[263, 1]

18.9 µs ± 1.76 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [219]:
%%timeit
nba.iloc[263, 1]

24 µs ± 492 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### 4.8 Series에서 값 추출 <hr>

In [220]:
print(nba['Salary'].loc['Damian Lillard'])
print(nba['Salary'].at['Damian Lillard'])
print(nba['Salary'].iloc[234])
print(nba['Salary'].iat[234])

29802321
29802321
2033160
2033160


### 4.9 열 또는 행 이름 바꾸기 <hr>

In [221]:
nba.columns

Index(['Team', 'Position', 'Birthday', 'Salary'], dtype='object')

In [222]:
# 열 이름 수정
nba.columns = ['Team', 'Position', 'Date of Birth', 'Pay']
nba.head(1)

Unnamed: 0_level_0,Team,Position,Date of Birth,Pay
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697


In [223]:
# rename 메서드도 열의 이름을 변경할 수 있다.
# 키가 기존 열의 이름이고 값이 새 이름인 딕셔너리를 columns 매개변수에 전달한다.
nba.rename(columns = {'Date of Birth' : 'Birthday'})

Unnamed: 0_level_0,Team,Position,Birthday,Pay
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568
...,...,...,...,...
Austin Rivers,Houston Rockets,PG,1992-08-01,2174310
Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960


In [224]:
nba.loc['Giannis Antetokounmpo']

Team                 Milwaukee Bucks
Position                          PF
Date of Birth    1994-12-06 00:00:00
Pay                         25842697
Name: Giannis Antetokounmpo, dtype: object

In [225]:
# rename 메서드의 index 매개변수에 딕셔너리를 전달하여 인덱스 레이블의 이름을 바꿀 수도 있다.
nba = nba.rename(index = {'Giannis Antetokounmpo' : 'Greek Freak'})

In [228]:
nba.loc['Greek Freak']

Team                 Milwaukee Bucks
Position                          PF
Date of Birth    1994-12-06 00:00:00
Pay                         25842697
Name: Greek Freak, dtype: object

### 4.10 인덱스 재설정 <hr>

In [229]:
nba.set_index('Team').head()

Unnamed: 0_level_0,Position,Date of Birth,Pay
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Philadelphia 76ers,SG,1996-09-26,1445697
Detroit Pistons,PF,1995-09-27,1645357
Charlotte Hornets,PF,1998-08-23,3831840
Detroit Pistons,PG,1988-10-04,7317074
Philadelphia 76ers,G,1995-07-26,79568


In [230]:
# reset_index 메서드는 현재 인덱스를 DataFrame 열로 이동하고 인덱스를 판다스의 숫자 인덱스로 변경한다.
nba.reset_index().head()

Unnamed: 0,Name,Team,Position,Date of Birth,Pay
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


In [231]:
# 먼저 reset_index로 기존 인덱스(Name)를 열로 되돌린 뒤 set_index 메서드를 호출하면 데이터 손실 없이 Team 열을 인덱스로 이동할 수 있다.
nba.reset_index().set_index('Team').head()

Unnamed: 0_level_0,Name,Position,Date of Birth,Pay
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Philadelphia 76ers,Shake Milton,SG,1996-09-26,1445697
Detroit Pistons,Christian Wood,PF,1995-09-27,1645357
Charlotte Hornets,PJ Washington,PF,1998-08-23,3831840
Detroit Pistons,Derrick Rose,PG,1988-10-04,7317074
Philadelphia 76ers,Marial Shayok,G,1995-07-26,79568


In [232]:
# inplace 매개변수를 사용하지 않았을 때의 한 가지 이점은 여러 메서드를 연쇄적으로 호출할 수 있다는 것이다.
nba = nba.reset_index().set_index('Team')

### 4.11 코딩 챌린지 <hr>

In [1]:
filename = '../DATA/nfl.csv'

In [5]:
# 1. nfl.csv 파일을 가져오려면 어떻게 해야 할까요? Birthday 열의 값을 날짜/시간으로 변환하는 효과적인 방법은 없을까요?

nfl = pd.read_csv(filename, parse_dates = ['Birthday'])
nfl

  nfl = pd.read_csv(filename, parse_dates = ['Birthday'])


Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,1992-08-01,2174310
446,Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
447,Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960


In [30]:
# 2. 선수의 이름을 DataFrame의 인덱스로 지정하는 두 가지 방법은 무엇인가요?

nfl2 = nfl.set_index('Name')
nfl3 = pd.read_csv(filename, index_col = 'Name', parse_dates = ['Birthday'])
print(nfl2, nfl3, sep ='\n\n')

                                    Team Position   Birthday   Salary
Name                                                                 
Tremon Smith         Philadelphia Eagles       RB 1996-07-20   570000
Shawn Williams        Cincinnati Bengals       SS 1991-05-13  3500000
Adam Butler         New England Patriots       DT 1994-04-12   645000
Derek Wolfe               Denver Broncos       DE 1990-02-24  8000000
Jake Ryan           Jacksonville Jaguars      OLB 1992-02-27  1000000
...                                  ...      ...        ...      ...
Bashaud Breeland      Kansas City Chiefs       CB 1992-01-30   805000
Craig James          Philadelphia Eagles       CB 1996-04-29   570000
Jonotthan Harrison         New York Jets        C 1991-08-25  1500000
Chuma Edoga                New York Jets       OT 1997-05-25   495000
Tajae Sharpe            Tennessee Titans       WR 1994-12-23  2025000

[1655 rows x 4 columns]

                                    Team Position   Birthday   S

In [33]:
# 3. 이 데이터셋에서 팀 당 선수가 몇 명인지 계산하는 방법은 무엇인가요?
nfl.Team.value_counts()  # == nfl['Team'].value_counts()

Team
New York Jets           58
Kansas City Chiefs      56
Washington Redskins     56
New Orleans Saints      55
San Francisco 49Ers     55
Denver Broncos          54
Minnesota Vikings       54
Los Angeles Chargers    54
Seattle Seahawks        53
Dallas Cowboys          53
Buffalo Bills           53
Atlanta Falcons         53
Detroit Lions           53
Chicago Bears           53
Los Angeles Rams        52
New York Giants         52
Philadelphia Eagles     52
Houston Texans          52
Arizona Cardinals       51
Cincinnati Bengals      51
Green Bay Packers       51
Oakland Raiders         51
Jacksonville Jaguars    50
Cleveland Browns        49
Miami Dolphins          49
Indianapolis Colts      49
Carolina Panthers       49
New England Patriots    49
Baltimore Ravens        48
Pittsburgh Steelers     47
Tampa Bay Buccaneers    47
Tennessee Titans        46
Name: count, dtype: int64

In [34]:
# 4. 가장 높은 연봉을 받는 5명의 선수는 누구인가요?
nfl.sort_values(by = 'Salary', ascending = False).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
180,Kirk Cousins,Minnesota Vikings,QB,1988-08-19,27500000
1623,Jameis Winston,Tampa Bay Buccaneers,QB,1994-01-06,20922000
905,Marcus Mariota,Tennessee Titans,QB,1993-10-30,20922000
1343,Derek Carr,Oakland Raiders,QB,1991-03-28,19900000
150,Jimmy Garoppolo,San Francisco 49Ers,QB,1991-11-02,17200000


In [36]:
# 5. 데이터셋에서 먼저 팀을 알파벳 순서로 정렬한 다음 연봉을 내림차순으로 정렬하려면 어떻게 해야 하나요?
nfl.sort_values(by = ['Team', 'Salary'], ascending = [True, False])

Unnamed: 0,Name,Team,Position,Birthday,Salary
1577,Chandler Jones,Arizona Cardinals,OLB,1990-02-27,16500000
669,Patrick Peterson,Arizona Cardinals,CB,1990-07-11,11000000
1361,Larry Fitzgerald,Arizona Cardinals,WR,1983-08-31,11000000
1594,David Johnson,Arizona Cardinals,RB,1991-12-16,5700000
948,Justin Pugh,Arizona Cardinals,G,1990-08-15,5000000
...,...,...,...,...,...
1109,Ross Pierschbacher,Washington Redskins,C,1995-05-05,495000
1325,Kelvin Harmon,Washington Redskins,WR,1996-12-15,495000
1344,Wes Martin,Washington Redskins,G,1996-05-09,495000
1345,Jimmy Moreland,Washington Redskins,CB,1995-08-26,495000


In [55]:
nfl[nfl['Team'] == 'Washington Redskins'].sort_values(by = 'Salary', ascending = False)

Unnamed: 0,Name,Team,Position,Birthday,Salary
961,Brandon Scherff,Washington Redskins,G,1991-12-26,12525000
431,Josh Norman,Washington Redskins,CB,1987-12-15,11000000
15,Ryan Kerrigan,Washington Redskins,OLB,1988-08-16,10500000
154,Aaron Colvin,Washington Redskins,CB,1991-10-02,7500000
1025,Morgan Moses,Washington Redskins,OT,1991-03-03,4750000
774,Vernon Davis,Washington Redskins,TE,1984-01-31,4750000
1634,Case Keenum,Washington Redskins,QB,1988-02-17,3500000
1308,Quinton Dunbar,Washington Redskins,CB,1992-07-22,3000000
610,Colt McCoy,Washington Redskins,QB,1986-09-05,3000000
183,Chris Thompson,Washington Redskins,RB,1990-10-20,2750000


In [50]:
# 6. New York Jets 팀에서 가장 나이가 많은 선수는 누구이며 그의 생일은 언제인가요?
nfl[nfl['Team'] == 'New York Jets'].sort_values('Birthday').head(1)

Unnamed: 0,Name,Team,Position,Birthday,Salary
888,Ryan Kalil,New York Jets,C,1985-03-29,2400000


In [53]:
nfl['Team'] == 'New York Jets'

0       False
1       False
2       False
3       False
4       False
        ...  
1650    False
1651    False
1652     True
1653     True
1654    False
Name: Team, Length: 1655, dtype: bool

In [54]:
type(nfl['Team'] == 'New York Jets')

pandas.core.series.Series