# 판다스(Pandas) 사용법
판다스는 파이썬에서 사용하는 데이터 분석 라이브러리.

 행과 열로 이루어진 데이터 객체를 만들어 다룰 수 있음.
 
 보다 안정적으로 대용량의 데이터들을 처리하는데 매우 편리함.

In [4]:
import pandas as pd
import numpy as np

## Pandas자료구조
기본적으로 정의되는 자료구조인 Series와 DataFrame을 사용

### 2-1.Series

In [5]:
# Build a Series from a plain Python list; pandas supplies a default
# integer index (0, 1, 2, ...).
obj = pd.Series(data=[4, 7, -5, 3])
# In a notebook the print() call is optional: a bare `obj` on the last
# line of the cell is displayed as well.
print(obj)

0    4
1    7
2   -5
3    3
dtype: int64


In [7]:
# Inspect only the values of the Series (returned as a NumPy array).
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [8]:
# Inspect only the index of the Series.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
# Inspect the data type of the values stored in the Series.
obj.dtypes

dtype('int64')

In [18]:
# A custom index can be supplied when the Series is created. Labels may
# mix strings and integers, in which case the index dtype is `object`.
obj2 = pd.Series(
    [4, 7, -5, 3],
    index=['d', 'b', 'a', 3],
)
print(obj2)
print(obj2.index)

d    4
b    7
a   -5
3    3
dtype: int64
Index(['d', 'b', 'a', 3], dtype='object')


In [21]:
# A Series can be built from a Python list, which gets a default integer
# index ...
ssdata = [1, 2, 3, 4, 5, 6]
obj4 = pd.Series(data=ssdata)
print(obj4)

# ... or from a Python dictionary, in which case the dictionary keys
# become the index of the Series.
sdata = {
    'kim': 35000,
    'lee': 47000,
    'park': 52000,
    'jung': 18000,
}
obj3 = pd.Series(data=sdata)
print(obj3)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64
kim     35000
lee     47000
park    52000
jung    18000
dtype: int64


In [29]:
# Replace the index labels by assigning a new list to `.index`.
obj3.index=['A','B','C','D']
print(obj3)

A    35000
B    47000
C    52000
D    18000
dtype: int64


In [30]:
# Give a name to the Series itself and to its index; both names show up
# in the printed output.
obj3.name='Salary'
obj3.index.name='Names'
print(obj3)

Names
A    35000
B    47000
C    52000
D    18000
Name: Salary, dtype: int64


In [31]:
# Replace the index labels again. The Series name ('Salary') is still
# printed, while the printed index no longer shows the 'Names' label.
obj3.index=['A','B','C','D']
print(obj3)
print(obj3.index)

A    35000
B    47000
C    52000
D    18000
Name: Salary, dtype: int64
Index(['A', 'B', 'C', 'D'], dtype='object')


### 2-2.DataFrame

In [36]:
# Define a DataFrame.
# The data that goes into it has to be prepared first, either as a Python
# dictionary (column name -> list of values) or as a NumPy array.
data = {
    'name': ['geno', 'keno', 'zano', 'uuno', 'mukno', 'muhano'],
    'year': [2020, 2021, 2022, 2018, 2014, 2017],
    'points': [3.4, 4.3, 2.8, 3.6, 1.8, 4.0],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,name,year,points
0,geno,2020,3.4
1,keno,2021,4.3
2,zano,2022,2.8
3,uuno,2018,3.6
4,mukno,2014,1.8
5,muhano,2017,4.0


### 행과 열의 구조를 가진 데이터가 생긴다.

In [38]:
# Index along the row axis.
df.index

RangeIndex(start=0, stop=6, step=1)

In [39]:
# Index along the column axis (the column labels).
df.columns

Index(['name', 'year', 'points'], dtype='object')

In [40]:
# Get the underlying values as a NumPy array (dtype=object here because
# the columns mix strings and numbers).
df.values

array([['geno', 2020, 3.4],
       ['keno', 2021, 4.3],
       ['zano', 2022, 2.8],
       ['uuno', 2018, 3.6],
       ['mukno', 2014, 1.8],
       ['muhano', 2017, 4.0]], dtype=object)

In [41]:
# Give a name to the row index and to the column index.
df.index.name="Num"
df.columns.name='info'
df

info,name,year,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,geno,2020,3.4
1,keno,2021,4.3
2,zano,2022,2.8
3,uuno,2018,3.6
4,mukno,2014,1.8
5,muhano,2017,4.0


In [48]:
# `columns` and `index` can be set while the DataFrame is being created.
# 'penalty' is not a key of `data`, so that column is filled with NaN.
df2 = pd.DataFrame(
    data,
    columns=['year', 'name', 'points', 'penalty'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
df2

Unnamed: 0,year,name,points,penalty
one,2020,geno,3.4,
two,2021,keno,4.3,
three,2022,zano,2.8,
four,2018,uuno,3.6,
five,2014,mukno,1.8,
six,2017,muhano,4.0,


### DataFrame을 정의하면서 data로 들어가는 dic과 columns의 순서가 달라도 알아서 정의
하지만 data에 포함되어 있지 않은 값은 NaN으로 나타남

NaN은 값이 비어 있음을 뜻하는 결측값으로, 그대로 두면 계산이나 집계에 제대로 반영되지 않는다.

따라서 올바른 데이터를 위해 추가적으로 값을 넣어줘야한다.

In [56]:
# describe() reports summary statistics (count, mean, std, min, quartiles,
# max) for the columns of a DataFrame that can be computed on.
df2.describe()

Unnamed: 0,year,points
count,6.0,6.0
mean,2018.666667,3.316667
std,2.94392,0.904249
min,2014.0,1.8
25%,2017.25,2.95
50%,2019.0,3.5
75%,2020.75,3.9
max,2022.0,4.3


### 3.DataFrame Indexing

In [171]:
# Sample data for the indexing examples. 'penalty' is requested as a
# column but is not a key of `data2`, so it starts out as NaN.
data2 = {
    "names": ["kilho", "chaeho", "hyeonho", "gaho", "junho"],
    "year": [1994, 1993, 1994, 1992, 1993],
    "points": [1.5, 1.7, 3.6, 2.4, 2.9],
}
df3 = pd.DataFrame(
    data2,
    columns=["year", "names", "points", "penalty"],
    index=["one", "two", "three", "four", "five"],
)
df3

Unnamed: 0,year,names,points,penalty
one,1994,kilho,1.5,
two,1993,chaeho,1.7,
three,1994,hyeonho,3.6,
four,1992,gaho,2.4,
five,1993,junho,2.9,


#### 3-1.DataFrame에서 열을 선택하고 조작하기

In [172]:
# Select a single column with bracket notation; the result is a Series.
df3["year"]

one      1994
two      1993
three    1994
four     1992
five     1993
Name: year, dtype: int64

In [173]:
# Attribute access is an equivalent way to select the same column.
df3.year

one      1994
two      1993
three    1994
four     1992
five     1993
Name: year, dtype: int64

In [174]:
# Passing a list of column names selects several columns at once.
df3[['year',"points"]]

Unnamed: 0,year,points
one,1994,1.5
two,1993,1.7
three,1994,3.6
four,1992,2.4
five,1993,2.9


In [175]:
# A column selected as above can also be assigned to; a scalar value is
# applied to every row of that column.
df3["penalty"]=0.5
df3

Unnamed: 0,year,names,points,penalty
one,1994,kilho,1.5,0.5
two,1993,chaeho,1.7,0.5
three,1994,hyeonho,3.6,0.5
four,1992,gaho,2.4,0.5
five,1993,junho,2.9,0.5


In [176]:
# A separate value can be assigned to each row as well.
df3['penalty']=[0.1, 0.2, 0.3, 0.4, 0.5]  # a Python list or a NumPy array both work
print(df3)

       year    names  points  penalty
one    1994    kilho     1.5      0.1
two    1993   chaeho     1.7      0.2
three  1994  hyeonho     3.6      0.3
four   1992     gaho     2.4      0.4
five   1993    junho     2.9      0.5


In [177]:
# Assigning to a column name that does not exist yet adds a new column.
# NOTE: despite its name, 'zeros' is filled with np.arange(5), i.e. 0..4.
df3['zeros']=np.arange(5)
df3

Unnamed: 0,year,names,points,penalty,zeros
one,1994,kilho,1.5,0.1,0
two,1993,chaeho,1.7,0.2,1
three,1994,hyeonho,3.6,0.3,2
four,1992,gaho,2.4,0.4,3
five,1993,junho,2.9,0.5,4


In [178]:
# A Series can be assigned as a column too. Its values are matched to rows
# by index label, so rows absent from `val` ('one', 'three') become NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df3['debt']=val
df3

Unnamed: 0,year,names,points,penalty,zeros,debt
one,1994,kilho,1.5,0.1,0,
two,1993,chaeho,1.7,0.2,1,-1.2
three,1994,hyeonho,3.6,0.3,2,
four,1992,gaho,2.4,0.4,3,-1.5
five,1993,junho,2.9,0.5,4,-1.7


#### 하지만 Series로 넣을 때는 val과 같이 넣으려는 data의 index에 맞춰서 데이터가 들어간다.
이점이 python의 list나 numpy의 array로 데이터를 넣을때와 가장 큰 차이점이다.

In [179]:
# Derive new columns from existing ones: an element-wise difference and an
# element-wise comparison (which yields booleans).
# NOTE(review): 'hight_points' looks like a typo for 'high_points'; it is
# kept because later cells and outputs use this column name.
df3['net_points']=df3['points']-df3['penalty']
df3['hight_points']=df3['net_points']>2.0
# Add a row labelled 'six' by supplying one value per existing column.
# NOTE(review): the last two values (net_points, hight_points) are
# hard-coded to 0 rather than derived from points - penalty, so this row is
# inconsistent with the derived columns. After this assignment the numeric
# columns are displayed as floats (e.g. 1994.0).
df3.loc['six',:] = [2013,"mino",4.0,0.1,5,0.8,0,0]
df3

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,hight_points
one,1994.0,kilho,1.5,0.1,0.0,,1.4,False
two,1993.0,chaeho,1.7,0.2,1.0,-1.2,1.5,False
three,1994.0,hyeonho,3.6,0.3,2.0,,3.3,True
four,1992.0,gaho,2.4,0.4,3.0,-1.5,2.0,False
five,1993.0,junho,2.9,0.5,4.0,-1.7,2.4,True
six,2013.0,mino,4.0,0.1,5.0,0.8,0.0,0


#### 3-2.DataFrame에서 행을 선택하고 조작하기
pandas에서는 DataFrame에서 행을 인덱싱하는 방법이 여러 가지 있다. 열 또한 그러하다.

In [180]:
# Positional row slice: takes rows at positions 0 through 2 (3 - 1).
# The row at the stop position is excluded.
df3[0:3]

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,hight_points
one,1994.0,kilho,1.5,0.1,0.0,,1.4,False
two,1993.0,chaeho,1.7,0.2,1.0,-1.2,1.5,False
three,1994.0,hyeonho,3.6,0.3,2.0,,3.3,True


In [181]:
# Label-based row slice: takes rows 'two' through 'four'.
# Unlike a positional slice, the row named by the stop label is included.
df3['two':'four']  # not recommended

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,hight_points
two,1993.0,chaeho,1.7,0.2,1.0,-1.2,1.5,False
three,1994.0,hyeonho,3.6,0.3,2.0,,3.3,True
four,1992.0,gaho,2.4,0.4,3.0,-1.5,2.0,False


In [182]:
# The approach below is the recommended one:
# use the .loc or .iloc indexers.
df3.loc['two']  # a single row label returns a Series

year            1993.0
names           chaeho
points             1.7
penalty            0.2
zeros              1.0
debt              -1.2
net_points         1.5
hight_points     False
Name: two, dtype: object

In [183]:
# .loc with a label slice returns rows 'two' through 'four' (stop label included).
df3.loc['two':'four']

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,hight_points
two,1993.0,chaeho,1.7,0.2,1.0,-1.2,1.5,False
three,1994.0,hyeonho,3.6,0.3,2.0,,3.3,True
four,1992.0,gaho,2.4,0.4,3.0,-1.5,2.0,False


In [184]:
# A row label slice combined with a single column label returns a Series.
df3.loc['two':'four','points']

two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [185]:
# ':' selects every row; only the 'year' column is kept.
df3.loc[:,'year']  # equivalent to df3['year']

one      1994.0
two      1993.0
three    1994.0
four     1992.0
five     1993.0
six      2013.0
Name: year, dtype: float64

In [186]:
# A list of column labels selects several columns.
df3.loc[:,['year', 'names']]  # ':' takes all rows, from the first to the last

Unnamed: 0,year,names
one,1994.0,kilho
two,1993.0,chaeho
three,1994.0,hyeonho
four,1992.0,gaho
five,1993.0,junho
six,2013.0,mino


In [187]:
# Label slices can be used on both axes; the end labels are included.
df3.loc['three':'five','year':'penalty']

Unnamed: 0,year,names,points,penalty
three,1994.0,hyeonho,3.6,0.3
four,1992.0,gaho,2.4,0.4
five,1993.0,junho,2.9,0.5


In [188]:
# Overwrite a single cell addressed by row label and column label.
df3.loc['two','names']="miyao"
df3

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,hight_points
one,1994.0,kilho,1.5,0.1,0.0,,1.4,False
two,1993.0,miyao,1.7,0.2,1.0,-1.2,1.5,False
three,1994.0,hyeonho,3.6,0.3,2.0,,3.3,True
four,1992.0,gaho,2.4,0.4,3.0,-1.5,2.0,False
five,1993.0,junho,2.9,0.5,4.0,-1.7,2.4,True
six,2013.0,mino,4.0,0.1,5.0,0.8,0.0,0


In [189]:
# Name the row index 'Order' and the column index 'info'.
df3.index.name='Order'
df3.columns.name="info"

In [227]:
# Insert a row by assigning to a row label with .loc.
# The two derived columns (net_points, hight_points) are left as NaN here.
df3.loc['six',:] = [2013,"mino",4.0,0.1,5,0.8, np.nan, np.nan]  # a value must be supplied for every existing column
df3

info,year,names,points,penalty,zeros,debt,net_points,hight_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
one,1994.0,kilho,1.5,0.1,0.0,,1.4,False
two,1993.0,miyao,1.7,0.2,1.0,-1.2,1.5,False
three,1994.0,hyeonho,3.6,0.0,2.0,,3.3,True
four,1992.0,gaho,2.4,0.4,3.0,-1.5,2.0,False
five,1993.0,junho,2.9,0.5,4.0,-1.7,2.4,True
six,2013.0,mino,4.0,0.1,5.0,0.8,,


In [191]:
df3  # the net_points / hight_points logic above has to be re-run for these columns to be correct

info,year,names,points,penalty,zeros,debt,net_points,hight_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
one,1994.0,kilho,1.5,0.1,0.0,,1.4,False
two,1993.0,miyao,1.7,0.2,1.0,-1.2,1.5,False
three,1994.0,hyeonho,3.6,0.3,2.0,,3.3,True
four,1992.0,gaho,2.4,0.4,3.0,-1.5,2.0,False
five,1993.0,junho,2.9,0.5,4.0,-1.7,2.4,True
six,2013.0,mino,4.0,0.1,5.0,0.8,0.0,0


In [192]:
# .iloc selects by integer position instead of by label.
df3.iloc[3]  # position 3 is the fourth row ('four'), since positions start at 0

info
year            1992.0
names             gaho
points             2.4
penalty            0.4
zeros              3.0
debt              -1.5
net_points         2.0
hight_points     False
Name: four, dtype: object

In [193]:
# Positional slices on both axes: rows 3-4 and columns 0-4 (stop positions excluded).
df3.iloc[3:5,0:5]

info,year,names,points,penalty,zeros
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
four,1992.0,gaho,2.4,0.4,3.0
five,1993.0,junho,2.9,0.5,4.0


In [194]:
# Lists of positions pick arbitrary rows (0, 1, 3) and columns (1, 2).
df3.iloc[[0,1,3],[1,2]]

info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,kilho,1.5
two,miyao,1.7
four,gaho,2.4


In [195]:
# Same idea with rows 2, 4, 5 and columns 1, 3, 5.
df3.iloc[[2,4,5],[1,3,5]]

info,names,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
three,hyeonho,0.3,
five,junho,0.5,-1.7
six,mino,0.1,0.8


In [196]:
# Two scalar positions (row 1, column 1) return a single value.
df3.iloc[1,1]

'miyao'

### DataFrame에서의 boolean indexing

In [197]:
# Show the current contents of df3 before filtering it.
df3

info,year,names,points,penalty,zeros,debt,net_points,hight_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
one,1994.0,kilho,1.5,0.1,0.0,,1.4,False
two,1993.0,miyao,1.7,0.2,1.0,-1.2,1.5,False
three,1994.0,hyeonho,3.6,0.3,2.0,,3.3,True
four,1992.0,gaho,2.4,0.4,3.0,-1.5,2.0,False
five,1993.0,junho,2.9,0.5,4.0,-1.7,2.4,True
six,2013.0,mino,4.0,0.1,5.0,0.8,0.0,0


In [198]:
# Boolean Series: True for the rows whose net_points is greater than 2.
df3['net_points'] > 2

Order
one      False
two      False
three     True
four     False
five      True
six      False
Name: net_points, dtype: bool

In [199]:
# Boolean Series: True for the rows whose year is greater than 1993.
df3['year']>1993

Order
one       True
two      False
three     True
four     False
five     False
six       True
Name: year, dtype: bool

In [202]:
# Use the boolean Series as a row mask: keep only rows with net_points > 2.
df3.loc[df3['net_points']>2,:]

info,year,names,points,penalty,zeros,debt,net_points,hight_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
three,1994.0,hyeonho,3.6,0.3,2.0,,3.3,True
five,1993.0,junho,2.9,0.5,4.0,-1.7,2.4,True


In [205]:
# Keep only the rows whose year is greater than 1993.
df3.loc[df3['year']>1993,:]

info,year,names,points,penalty,zeros,debt,net_points,hight_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
one,1994.0,kilho,1.5,0.1,0.0,,1.4,False
three,1994.0,hyeonho,3.6,0.3,2.0,,3.3,True
six,2013.0,mino,4.0,0.1,5.0,0.8,0.0,0


In [211]:
# Combine a boolean row mask with a list of column labels.
df3.loc[df3['names']=='mino',['names','points','penalty']]

info,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
six,mino,4.0,0.1


In [218]:
# As with NumPy, conditions can be combined with logical operators
# (& here); each condition is wrapped in its own parentheses.
df3.loc[(df3['points']>2)&(df3['points']<4.0),:]

info,year,names,points,penalty,zeros,debt,net_points,hight_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
three,1994.0,hyeonho,3.6,0.3,2.0,,3.3,True
four,1992.0,gaho,2.4,0.4,3.0,-1.5,2.0,False
five,1993.0,junho,2.9,0.5,4.0,-1.7,2.4,True


In [220]:
# A new value can also be assigned through a boolean mask:
# penalty is set to 0 for the rows whose points is greater than 3.
df3.loc[df3['points']>3,'penalty']=0
df3

info,year,names,points,penalty,zeros,debt,net_points,hight_points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
one,1994.0,kilho,1.5,0.1,0.0,,1.4,False
two,1993.0,miyao,1.7,0.2,1.0,-1.2,1.5,False
three,1994.0,hyeonho,3.6,0.0,2.0,,3.3,True
four,1992.0,gaho,2.4,0.4,3.0,-1.5,2.0,False
five,1993.0,junho,2.9,0.5,4.0,-1.7,2.4,True
six,2013.0,mino,4.0,0.0,5.0,0.8,0.0,0


### Data

In [224]:
# When neither `index` nor `columns` is given, both default to integers
# counting up from 0.
random_values = np.random.randn(6, 4)
df4 = pd.DataFrame(data=random_values)
df4

Unnamed: 0,0,1,2,3
0,-0.549252,-1.564904,0.264608,-0.67136
1,-1.192618,1.410528,-0.839414,-0.550875
2,-0.50444,-0.363892,1.272906,0.245499
3,-1.854682,-0.027232,-0.702885,0.365385
4,0.102614,-0.112353,0.55625,0.179195
5,0.079691,2.497795,-1.27946,0.767114


In [225]:
# Label the columns and replace the row index with six consecutive dates.
df4.columns=['A','B','C','D']
df4.index=pd.date_range('20160701',periods=6)
# pd.date_range() builds a DatetimeIndex, i.e. an index made of datetime
# values that can represent dates and times.
print(df4.index)
df4

DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',
               '2016-07-05', '2016-07-06'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,A,B,C,D
2016-07-01,-0.549252,-1.564904,0.264608,-0.67136
2016-07-02,-1.192618,1.410528,-0.839414,-0.550875
2016-07-03,-0.50444,-0.363892,1.272906,0.245499
2016-07-04,-1.854682,-0.027232,-0.702885,0.365385
2016-07-05,0.102614,-0.112353,0.55625,0.179195
2016-07-06,0.079691,2.497795,-1.27946,0.767114


In [226]:
# np.nan stands for a NaN (missing) value.
df4['F']=[1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df4

Unnamed: 0,A,B,C,D,F
2016-07-01,-0.549252,-1.564904,0.264608,-0.67136,1.0
2016-07-02,-1.192618,1.410528,-0.839414,-0.550875,
2016-07-03,-0.50444,-0.363892,1.272906,0.245499,3.5
2016-07-04,-1.854682,-0.027232,-0.702885,0.365385,6.1
2016-07-05,0.102614,-0.112353,0.55625,0.179195,
2016-07-06,0.079691,2.497795,-1.27946,0.767114,7.0


In [235]:
# Drop every row in which at least one value is NaN.
df5=df4.dropna(how='any')
# dropna() returns a new DataFrame, so the original df4 is left unchanged.
df4
# Passing inplace=True to dropna() would modify df4 itself instead.

Unnamed: 0,A,B,C,D,F
2016-07-01,-0.549252,-1.564904,0.264608,-0.67136,1.0
2016-07-02,-1.192618,1.410528,-0.839414,-0.550875,
2016-07-03,-0.50444,-0.363892,1.272906,0.245499,3.5
2016-07-04,-1.854682,-0.027232,-0.702885,0.365385,6.1
2016-07-05,0.102614,-0.112353,0.55625,0.179195,
2016-07-06,0.079691,2.497795,-1.27946,0.767114,7.0


In [234]:
# df5 holds the result with the NaN-containing rows removed.
df5

Unnamed: 0,A,B,C,D,F
2016-07-01,-0.549252,-1.564904,0.264608,-0.67136,1.0
2016-07-03,-0.50444,-0.363892,1.272906,0.245499,3.5
2016-07-04,-1.854682,-0.027232,-0.702885,0.365385,6.1
2016-07-06,0.079691,2.497795,-1.27946,0.767114,7.0


In [230]:
# Drop only the rows in which every value is NaN (no such row exists here,
# so all six rows remain).
df4.dropna(how='all')

Unnamed: 0,A,B,C,D,F
2016-07-01,-0.549252,-1.564904,0.264608,-0.67136,1.0
2016-07-02,-1.192618,1.410528,-0.839414,-0.550875,
2016-07-03,-0.50444,-0.363892,1.272906,0.245499,3.5
2016-07-04,-1.854682,-0.027232,-0.702885,0.365385,6.1
2016-07-05,0.102614,-0.112353,0.55625,0.179195,
2016-07-06,0.079691,2.497795,-1.27946,0.767114,7.0
