NumPy는 수치 연산을 위한 라이브러리이고, ndarray는 벡터화된 연산을 지원하는 배열 객체

NumPy에서의 핵심 객체는 ndarray
Pandas에서의 핵심 객체는 DataFrame과 Series

In [2]:
# pip install pandas -> pandas 설치 명령어

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Display the pandas help document (user manual).
# NOTE: the trailing `?` is IPython/Jupyter introspection syntax, not plain Python.
pd?

In [5]:
# Check the installed pandas version; as the last expression of the cell,
# its value is echoed by the notebook.
pd.__version__

'1.1.3'

In [6]:
"""
    Pandas Object: Series
"""
# Build a Series from a plain Python list; no index is given, so pandas
# supplies the default integer index.
#
# Why the capital letter? `Series` is a class implemented inside the pandas
# library. To use it in a program an object has to be created, and the call
# that creates it (the constructor) carries the class name itself: Series.
ser = pd.Series(data=[0.25, 0.5, 0.75, 1.0])

print(ser)
print(type(ser), '\n')

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
<class 'pandas.core.series.Series'> 



In [7]:
# A Series has two main attributes: 'values' and 'index'.
# Use `values` when the data is needed and `index` when the labels are needed.
arr, ind = ser.values, ser.index

print(arr)  # the stored data
print(ind)  # the index object

[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


In [8]:
# Label-based indexing: explicit labels are supplied through `index=`.
ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))
print(ser)

ind = ser.index
print(ind)
# pandas also supports an implicit (positional) index, but because elements
# can be addressed by label, this index is an explicit one.

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
Index(['a', 'b', 'c', 'd'], dtype='object')


In [9]:
"""
Dictionary and Series
"""

# A plain dictionary whose keys and values have mixed types.
# NOTE(review): binding the name `dict` hides the builtin `dict` type for the
# rest of the session. A later cell passes this variable to pd.Series, so a
# rename has to be applied in both places at once.
dict = {'a' : 1, 2 : 'two', 'third' : True}
print(dict)

{'a': 1, 2: 'two', 'third': True}


In [10]:
"""
Create a series from a dictionary
"""
# A Series is very similar to a dictionary: the keys become the index labels.
# NOTE: `dict` here is the dictionary variable bound in an earlier cell,
# not the builtin type.
ser = pd.Series(data=dict)
print(ser)

# State populations keyed by state name.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(data=population_dict)
print(population)

# Unlike a dictionary, a Series supports array-style operations such as
# slicing; with labels, the end label is part of the result.
texas_to_illinois = population['Texas':'Illinois']
print(texas_to_illinois)


a           1
2         two
third    True
dtype: object
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
Texas       26448193
New York    19651127
Florida     19552860
Illinois    12882135
dtype: int64


In [11]:
"""
    Pandas object: DataFrame
"""
# State areas keyed by state name.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(data=area_dict)
print(area)

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64


In [12]:
# Construct a DataFrame whose columns are the 'population' and 'area' Series.
states = pd.DataFrame(data={'population': population, 'area': area})
print(states)

print(states.index)          # row labels
print(states.columns)        # column labels
print(states['population'])  # select a single column by its label

            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64


In [13]:
# Construct a DataFrame from a 2D NumPy array.
arr = np.random.rand(3, 2)  # 3 rows x 2 columns of random floats
print(arr, '\n')

# Last expression of the cell, so the notebook renders the resulting table.
pd.DataFrame(data=arr, index=list('abc'), columns=['foo', 'bar'])


[[0.80222594 0.81302874]
 [0.87540879 0.94909441]
 [0.00597701 0.5607905 ]] 



Unnamed: 0,foo,bar
a,0.802226,0.813029
b,0.875409,0.949094
c,0.005977,0.560791


In [14]:
"""
Series objects manipulation: dictionary-style
"""

ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))

print(ser.keys())  # the index labels play the role of dictionary keys
print(ser['b'])    # look a value up by its label

# `in` only tells whether a key (label) exists, not whether a value exists:
# 'a' is a key -> True; there is no key 0.25 (it is only a value) -> False.
for candidate in ('a', 0.25):
    print(candidate in ser)

Index(['a', 'b', 'c', 'd'], dtype='object')
0.5
True
False


In [15]:
"""
Series object manipulation: array-style
"""

ser['e'] = 1.25   # a label that does not exist yet: the element is added
ser['a'] = 0.125  # an existing label: its value is overwritten
print(ser, '\n')

# Slicing with explicit labels: the end label 'c' is included in the result.
a_to_c = ser['a':'c']
print(a_to_c, '\n')

a    0.125
b    0.500
c    0.750
d    1.000
e    1.250
dtype: float64 

a    0.125
b    0.500
c    0.750
dtype: float64 



In [26]:
"""
Caution: Slicing Series object using explicit/implicit indexing
"""

# Derive a new 'density' column: population divided by area, row by row.
# Author's note: a column label must be written as a quoted string inside the
# brackets; writing it bare (e.g. with spaces left as-is) is an error.
states['density'] = states['population'].div(states['area'])
print(states)

            population    area     density
California    38332521  423967   90.413926
Texas         26448193  695662   38.018740
New York      19651127  141297  139.076746
Florida       19552860  170312  114.806121
Illinois      12882135  149995   85.883763


In [29]:
# Indexers: loc, iloc

print(states)

# states['New York'] does not work: plain [] on a DataFrame reaches columns only.
# To reach specific rows by label, go through .loc.
row_span = slice('New York', 'Illinois')
states.loc[row_span]
# NOTE(review): the original note here says the result is not a copy, so the
# original data would change if it were modified - confirm before relying on it.

# loc refers to the explicit index (labels) and iloc to the implicit index
# (positions) - keep the two apart when using them.

            population    area     density
California    38332521  423967   90.413926
Texas         26448193  695662   38.018740
New York      19651127  141297  139.076746
Florida       19552860  170312  114.806121
Illinois      12882135  149995   85.883763


Unnamed: 0,population,area,density
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [30]:
# Masking and fancy indexing using the loc indexer:
# a boolean mask picks the rows, a list of labels picks the columns.
is_dense = states.density > 100
print(states.loc[is_dense, ['population', 'density']], '\n')

          population     density
New York    19651127  139.076746
Florida     19552860  114.806121 

